def from_pajek(filename, ignore_weights=False):
    """
    A simple pajek file reader. Only supports a single network.

    :param filename: Path of the pajek file to read.
    :param ignore_weights: If True, the graph is constructed as unweighted.
    :return: The graph built by ``GraphDataset.from_edges``. It is undirected
        when the file defines no arcs and directed otherwise.
    """
    with open(filename, 'r') as f:
        name, n_nodes, node_labels, edges, arcs = parse_pajek(f)
    if name is None:
        name = 'Unnamed'
    if len(arcs) == 0:
        # Simple undirected graph
        return GraphDataset.from_edges(n_nodes=n_nodes, edges=edges,
                                       weighted=not ignore_weights,
                                       directed=False, name=name,
                                       node_labels=node_labels)
    # Has directed edges. Any undirected edge (a, b, w) is merged into the
    # arc list as the two arcs (a, b, w) and (b, a, w); a self-loop is added
    # only once because both directions coincide.
    merged = list(arcs)
    for e in edges:
        merged.append((e[0], e[1], e[2]))
        if e[0] != e[1]:
            merged.append((e[1], e[0], e[2]))
    return GraphDataset.from_edges(n_nodes=n_nodes, edges=merged,
                                   weighted=not ignore_weights,
                                   directed=True, name=name,
                                   node_labels=node_labels)
def from_edge_list(filename, weighted=True, directed=True, name=None,
                   delimiter=None, comment_line_start=None):
    """
    Build a graph (directed by default) from a text file that holds a simple
    edge list, one edge per line:

        1 2 0.1
        2 3 0.1
        ...

    Node names may be arbitrary strings without whitespace, for instance

        n1 n2 0.1
        n2 n3 0.2
        ...

    but such names are encoded into integers.

    :param filename: Path of the edge list file.
    :param weighted: Forwarded to ``GraphDataset.from_edges``.
    :param directed: Forwarded to ``GraphDataset.from_edges``.
    :param name: Name of the graph. Defaults to the file's base name without
        its extension.
    :param delimiter: Forwarded to ``parse_edge_list``.
    :param comment_line_start: Forwarded to ``parse_edge_list``.
    """
    if name is None:
        base_name = os.path.basename(filename)
        name, _extension = os.path.splitext(base_name)
    with open(filename, 'r') as edge_file:
        parsed = parse_edge_list(edge_file, delimiter, comment_line_start)
    n_nodes, edges, node_labels = parsed
    return GraphDataset.from_edges(
        n_nodes=n_nodes,
        edges=edges,
        weighted=weighted,
        directed=directed,
        name=name,
        node_labels=node_labels,
    )
def test_undirected(self):
    # A weighted path 0-1-2-3 built without the `directed` argument must
    # yield the symmetric adjacency matrix below.
    weighted_path = [(0, 1, 0.1), (1, 2, 0.2), (2, 3, 0.3)]
    expected = np.array([
        [0.0, 0.1, 0.0, 0.0],
        [0.1, 0.0, 0.2, 0.0],
        [0.0, 0.2, 0.0, 0.3],
        [0.0, 0.0, 0.3, 0.0],
    ])
    graph = GraphDataset.from_edges(n_nodes=4, edges=weighted_path,
                                    weighted=True)
    actual = graph.get_adj_matrix()
    self.assertTrue(np.array_equal(actual, expected))
def test_from_adj_mat_dense_undirected_weighted(self):
    # Build an undirected, weighted graph from the shared dense sample
    # matrix and check both its name and the exact set of weighted edges.
    graph = GraphDataset.from_adj_mat(self.sample_A, directed=False,
                                      weighted=True, name='Weighted')
    expected_edge_set = {
        (0, 0, 1.0),
        (0, 1, 0.1),
        (0, 3, 0.3),
        (1, 3, 0.4),
        (2, 3, 0.9),
    }
    self.assertEqual(graph.name, 'Weighted')
    self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
def create_loop_graph(n = 5):
    """
    Creates a simple loop with n nodes: node i is connected to node
    (i + 1) % n, so the last node links back to node 0.

    :param n: Number of nodes. Must be a nonnegative integer.
    :raises ValueError: If n is negative or not integer-valued.
    """
    is_valid = n >= 0 and int(n) == n
    if not is_valid:
        raise ValueError('n must be a nonnegative integer.')

    def edge_starting_at(i):
        # Successor wraps around so that the final edge closes the loop.
        return (i, (i + 1) % n)

    loop_edges = map(edge_starting_at, range(n))
    graph_name = "Loop-{}".format(n)
    return GraphDataset.from_edges(n_nodes=n, edges=loop_edges,
                                   weighted=False, directed=False,
                                   name=graph_name)
def test_from_simple_edge_list(self):
    # Edges given as bare (a, b) pairs: the graph should come out unweighted
    # and undirected, with every edge reported with weight 1.
    pairs = [(0, 0), (0, 1), (0, 2), (0, 3), (3, 4)]
    graph = GraphDataset.from_edges(n_nodes=5, edges=pairs, name='GraphX')
    self.assertEqual(graph.name, 'GraphX')
    self.assertFalse(graph.weighted)
    self.assertFalse(graph.directed)
    self.assertEqual(graph.n_nodes, 5)
    self.assertEqual(graph.n_edges, 5)
    expected_edge_set = {(a, b, 1) for a, b in pairs}
    self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
def setUp(self):
    # Fixture: a weighted, undirected loop graph over 6 nodes. Edge k runs
    # from node k to node (k + 1) % 6 and carries weight 0.1 * (k + 1) as
    # written below plus a one-letter edge data tag.
    weights = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6)
    tags = 'abcdef'
    loop_edges = [
        (src, (src + 1) % 6, weight, tag)
        for src, (weight, tag) in enumerate(zip(weights, tags))
    ]
    labels = [11, 22, 33, 44, 55, 66]
    attributes = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6']
    self.g_loop = GraphDataset.from_edges(
        n_nodes=6,
        edges=loop_edges,
        weighted=True,
        directed=False,
        has_edge_data=True,
        node_labels=labels,
        node_attributes=attributes,
    )
def test_from_adj_mat_dense_undirected_unweighted(self):
    # Build an undirected, unweighted graph from the shared dense sample
    # matrix; all edges should carry weight 1 and the node labels and
    # attributes should round-trip unchanged.
    attrs = ['a', 'b', 'c', 'd']
    labels = [1, 2, 4, 5]
    graph = GraphDataset.from_adj_mat(
        self.sample_A,
        directed=False,
        weighted=False,
        name='Unweighted',
        node_attributes=attrs,
        node_labels=labels,
    )
    expected_edge_set = {
        (0, 0, 1),
        (0, 1, 1),
        (0, 3, 1),
        (1, 3, 1),
        (2, 3, 1),
    }
    self.assertEqual(graph.name, 'Unweighted')
    self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
    self.assertListEqual(list(graph.node_attributes), attrs)
    self.assertListEqual(list(graph.node_labels), labels)
def create_random_graph(n = 10, p = 0.1):
    """
    Creates a simple random graph in which each of the possible edges
    (i, j) with i < j is added independently with probability p.

    :param n: Number of nodes. Must be a nonnegative integer.
    :param p: Edge probability. Must lie in [0, 1].
    :raises ValueError: If n or p is out of range.
    """
    if not (n >= 0 and int(n) == n):
        raise ValueError('n must be a nonnegative integer.')
    if not (0. <= p <= 1.):
        raise ValueError('p must be a probability.')
    # One random draw per unordered node pair, visited in row-major order.
    sampled_edges = [
        (i, j)
        for i in range(n)
        for j in range(i + 1, n)
        if random.random() <= p
    ]
    graph_name = "RandomGraph-{}-{}".format(n, p)
    return GraphDataset.from_edges(n_nodes=n, edges=sampled_edges,
                                   weighted=False, directed=False,
                                   name=graph_name)
def load_data():
    """
    Builds the Zachary's Karate Club graph from a hard-coded edge list:
    34 nodes, 78 edges, unweighted and undirected.
    """
    # Edges are listed as (smaller id, larger id), one row per larger id.
    club_edges = [
        (0, 1),
        (0, 2), (1, 2),
        (0, 3), (1, 3), (2, 3),
        (0, 4),
        (0, 5),
        (0, 6), (4, 6), (5, 6),
        (0, 7), (1, 7), (2, 7), (3, 7),
        (0, 8), (2, 8),
        (2, 9),
        (0, 10), (4, 10), (5, 10),
        (0, 11),
        (0, 12), (3, 12),
        (0, 13), (1, 13), (2, 13), (3, 13),
        (5, 16), (6, 16),
        (0, 17), (1, 17),
        (0, 19), (1, 19),
        (0, 21), (1, 21),
        (23, 25), (24, 25),
        (2, 27), (23, 27), (24, 27),
        (2, 28),
        (23, 29), (26, 29),
        (1, 30), (8, 30),
        (0, 31), (24, 31), (25, 31), (28, 31),
        (2, 32), (8, 32), (14, 32), (15, 32), (18, 32), (20, 32), (22, 32),
        (23, 32), (29, 32), (30, 32), (31, 32),
        (8, 33), (9, 33), (13, 33), (14, 33), (15, 33), (18, 33), (19, 33),
        (20, 33), (22, 33), (23, 33), (26, 33), (27, 33), (28, 33), (29, 33),
        (30, 33), (31, 33), (32, 33),
    ]
    return GraphDataset.from_edges(
        n_nodes=34,
        edges=club_edges,
        weighted=False,
        directed=False,
        name="Zachary's Karate Club",
    )
def load_data(data_src=None):
    """
    Loads the US power grid network
    (https://toreopsahl.com/datasets/#uspowergrid).

    :param data_src: Specifies the location of the edge list file you
        downloaded from
        http://opsahl.co.uk/tnet/datasets/USpowergrid_n4941.txt.
        When None, the file is fetched with ``download_file`` into the
        directory given by the ``data_dir`` setting.
    :return: An unweighted, undirected graph with 4941 nodes named
        'US Power Grid'.
    """
    if data_src is None:
        data_dir = get_setting('data_dir')
        data_src = download_file('http://opsahl.co.uk/tnet/datasets/USpowergrid_n4941.txt', data_dir)

    def process_line(l):
        # Node ids in the file start from 1; convert to zero-based indexing.
        splits = l.split()
        return int(splits[0]) - 1, int(splits[1]) - 1

    with open(data_src, 'r') as f:
        # Materialize the edges while the file is still open: a lazy map
        # over the file handle would fail if consumed after the file is
        # closed. Blank lines carry no edge and are skipped.
        edges = [process_line(l) for l in f if l.strip()]
    return GraphDataset.from_edges(n_nodes=4941, edges=edges,
                                   weighted=False, directed=False,
                                   name='US Power Grid')
def _load_asunetwork_dataset(data_dir, expected_n_nodes, expected_n_edges, **kwargs):
    """
    Processes the ASU network datasets:
    http://socialcomputing.asu.edu/pages/datasets

    :param data_dir: Directory containing nodes.csv, edges.csv, and
        (optionally) group-edges.csv.
    :param expected_n_nodes: Number of distinct node names nodes.csv must
        contain.
    :param expected_n_edges: Number of lines edges.csv must contain.
    :param kwargs: Extra keyword arguments forwarded to
        ``GraphDataset.from_edges``. A ``node_labels`` entry supplied here
        takes precedence over the labels read from group-edges.csv.
    :raises AssertionError: If the node or edge count does not match the
        expected value.
    """
    node_file = os.path.join(data_dir, 'nodes.csv')
    edge_file = os.path.join(data_dir, 'edges.csv')
    label_file = os.path.join(data_dir, 'group-edges.csv')

    # Load nodes: the i-th line of nodes.csv names the node with index i.
    with open(node_file, 'r') as f:
        node_names = [l.rstrip('\n') for l in f]
    node_name_map = {x: i for i, x in enumerate(node_names)}
    assert len(node_name_map) == expected_n_nodes, \
        'Unexpected number of nodes in ' + node_file

    # Load edges: each line is "<node name>,<node name>".
    edges = []
    with open(edge_file, 'r') as f:
        for l in f:
            splits = l.rstrip('\n').split(',')
            edges.append((node_name_map[splits[0]], node_name_map[splits[1]]))
    assert len(edges) == expected_n_edges, \
        'Unexpected number of edges in ' + edge_file

    # Process labels: each line is "<node name>,<group id>".
    # Note: each node can have more than one label or zero label.
    if os.path.exists(label_file):
        node_labels = [[] for _ in range(expected_n_nodes)]
        with open(label_file, 'r') as f:
            for l in f:
                splits = l.rstrip('\n').split(',')
                nid = node_name_map[splits[0]]
                gid = int(splits[1])
                node_labels[nid].append(gid)
        # Forward the labels that were just read instead of discarding
        # them; setdefault keeps any node_labels the caller passed in.
        kwargs.setdefault('node_labels', node_labels)

    return GraphDataset.from_edges(n_nodes=expected_n_nodes, edges=edges,
                                   **kwargs)
def _load(dataset_name, data_dir=None, download_url=None):
    """
    General data loader.

    Let
        n = total number of nodes
        m = total number of edges
        N = number of graphs

    Required files (DS stands for the dataset name):

    DS_A.txt (m lines): sparse (block diagonal) adjacency matrix for all
        graphs, each line corresponds to (row, col) resp. (node_id, node_id).
        **All graphs are undirected. Hence, DS_A.txt contains two entries
        for each edge.**
    DS_graph_indicator.txt (n lines): column vector of graph identifiers for
        all nodes of all graphs, the value in the i-th line is the graph_id
        of the node with node_id i.
    DS_graph_labels.txt (N lines): class labels for all graphs in the data
        set, the value in the i-th line is the class label of the graph with
        graph_id i.

    Files that are loaded only when they exist:

    DS_node_labels.txt (n lines): column vector of node labels, the value in
        the i-th line corresponds to the node with node_id i.
    DS_edge_labels.txt (m lines; same size as DS_A.txt): labels for the
        edges in DS_A.txt.
    DS_edge_attributes.txt (m lines; same size as DS_A.txt): attributes for
        the edges in DS_A.txt.
    DS_node_attributes.txt (n lines): matrix of node attributes, the comma
        separated values in the i-th line is the attribute vector of the
        node with node_id i.

    :param dataset_name: Name of the dataset; used as the file name prefix
        and as the name of the returned dataset.
    :param data_dir: Directory containing the dataset files. Defaults to
        ``<data_dir setting>/<dataset_name>``.
    :param download_url: If given and DS_A.txt is missing, the dataset is
        downloaded (and unpacked) into the ``data_dir`` setting first.
    :return: A ``MultiGraphDataset`` holding one subgraph per graph id.
    :raises Exception: If an optional per-edge or per-node file does not
        have as many lines as there are edges or nodes.
    """
    if data_dir is None:
        data_dir = os.path.join(get_setting('data_dir'), dataset_name)

    def data_file(suffix):
        return os.path.join(data_dir, dataset_name + suffix)

    # i-th line corresponds to the i-th edge: (na, nb). na and nb start from 1.
    edge_definition_filename = data_file('_A.txt')
    # i-th line corresponds to the i-th edge's label
    edge_label_filename = data_file('_edge_labels.txt')
    # i-th line corresponds to the i-th edge's attribute
    edge_attributes_filename = data_file('_edge_attributes.txt')
    # i-th line is the graph_id of the node with node_id i
    graph_indicator_filename = data_file('_graph_indicator.txt')
    # i-th line is the label of the i-th graph
    graph_label_filename = data_file('_graph_labels.txt')
    # i-th line is the label of the i-th node
    node_label_filename = data_file('_node_labels.txt')
    # i-th line is the attribute of the i-th node
    node_attributes_filename = data_file('_node_attributes.txt')

    # Download if possible
    if not os.path.exists(edge_definition_filename) and download_url is not None:
        print('Downloading data from ' + download_url)
        download_file(download_url, get_setting('data_dir'), unpack=True)

    # Load graph labels
    with open(graph_label_filename, 'r') as f:
        graph_labels = [int(l) for l in f]
    n_graphs = len(graph_labels)

    # Load edges
    edges = []
    with open(edge_definition_filename, 'r') as f:
        for l in f:
            splits = l.split(',')
            # Convert to zero-based indexing.
            edges.append((int(splits[0]) - 1, int(splits[1]) - 1))

    # Load edge labels
    if os.path.exists(edge_label_filename):
        with open(edge_label_filename, 'r') as f:
            edge_labels = [int(l) for l in f]
        if len(edge_labels) != len(edges):
            raise Exception(
                'The length of the edge label list does not match the number of edges.'
            )
    else:
        edge_labels = None

    # Load edge attributes. They must be read from the attribute file; the
    # edge definition file only holds the node id pairs.
    if os.path.exists(edge_attributes_filename):
        with open(edge_attributes_filename, 'r') as f:
            edge_attributes = [tuple(map(float, l.split(','))) for l in f]
        if len(edge_attributes) != len(edges):
            raise Exception(
                'The length of the edge attribute list does not match the number of edges.'
            )
    else:
        edge_attributes = None

    # Combine edge labels and attributes into edge data tuples
    if edge_attributes is None:
        if edge_labels is not None:
            edge_data = [LabelAttrTuple(label, None) for label in edge_labels]
        else:
            edge_data = None
    elif edge_labels is None:
        edge_data = [LabelAttrTuple(None, attr) for attr in edge_attributes]
    else:
        # Both lists were checked above to have one entry per edge.
        edge_data = [
            LabelAttrTuple(label, attr)
            for label, attr in zip(edge_labels, edge_attributes)
        ]

    # Load graph indicators: graph_indicators[g] lists the (zero-based) ids
    # of the nodes that belong to graph g.
    graph_indicators = [[] for _ in range(n_graphs)]
    n_nodes_total = 0
    with open(graph_indicator_filename, 'r') as f:
        for i, l in enumerate(f):
            # Convert to zero-based indexing.
            cur_graph_id = int(l) - 1
            graph_indicators[cur_graph_id].append(i)
            n_nodes_total += 1

    # Load node labels
    if os.path.exists(node_label_filename):
        with open(node_label_filename, 'r') as f:
            node_labels = [int(l) for l in f]
        if len(node_labels) != n_nodes_total:
            raise Exception(
                'The length of the node label list does not match the number of nodes.'
            )
    else:
        node_labels = None

    # Load node attributes
    if os.path.exists(node_attributes_filename):
        with open(node_attributes_filename, 'r') as f:
            node_attributes = [tuple(map(float, l.split(','))) for l in f]
        if len(node_attributes) != n_nodes_total:
            raise Exception(
                'The length of the node attribute list does not match the number of nodes.'
            )
    else:
        node_attributes = None

    # Construct the full graph
    if edge_data is None:
        g = GraphDataset.from_edges(n_nodes=n_nodes_total, edges=edges,
                                    weighted=False, directed=False,
                                    node_labels=node_labels,
                                    node_attributes=node_attributes)
    else:
        # All graphs are undirected and every edge is listed in both
        # directions, so keep only the entries with na <= nb and attach the
        # matching edge data (with a unit weight) to each of them.
        edge_iter = (
            (na, nb, 1, data)
            for (na, nb), data in zip(edges, edge_data)
            if na <= nb
        )
        g = GraphDataset.from_edges(n_nodes=n_nodes_total, edges=edge_iter,
                                    weighted=False, directed=False,
                                    has_edge_data=True,
                                    node_labels=node_labels,
                                    node_attributes=node_attributes)

    # Split the full graph into one subgraph per graph id (names are
    # one-based: DS-1, DS-2, ...).
    graphs = [None] * n_graphs
    for i in tqdm(range(n_graphs)):
        graphs[i] = g.subgraph(nodes_to_keep=graph_indicators[i],
                               name=dataset_name + '-' + str(i + 1))
    return MultiGraphDataset(graphs, graph_labels, dataset_name)
def from_mtx_file(filename, weighted=True, directed=True, name=None):
    """
    Builds a graph from the matrix that ``mmread`` loads from ``filename``,
    treating that matrix as an adjacency matrix.

    :param filename: Passed to ``mmread``.
    :param weighted: Forwarded to ``GraphDataset.from_adj_mat``.
    :param directed: Forwarded to ``GraphDataset.from_adj_mat``.
    :param name: Forwarded to ``GraphDataset.from_adj_mat``.
    """
    return GraphDataset.from_adj_mat(
        mmread(filename),
        weighted=weighted,
        directed=directed,
        name=name,
    )
def load_temporal(data_src, start_time=None, end_time=None, name=None, notes=None, **kwargs):
    """
    Base loader for SNAP temporal graph datasets such as
    http://snap.stanford.edu/data/sx-mathoverflow.html.

    Each line in the data file should be a triplet of (SRC, TGT, UNIXTS)
    representing an edge:
        SRC: id of the source node (a user)
        TGT: id of the target node (a user)
        UNIXTS: Unix timestamp (seconds since the epoch)

    Node ids are re-indexed to 0, 1, 2, ... in order of first appearance in
    the file. Every node that appears in the file is counted, including
    nodes whose edges all fall outside the requested time window.
    Timestamps are converted with ``datetime.fromtimestamp``.

    :param data_src: Specifies the location of the edge list file.
    :param start_time: Optional datetime object. All edges that are formed
        before start_time will not be included.
    :param end_time: Optional datetime object. All edges that are formed at
        or after end_time will not be included.
    :param name: Name of the graph. Defaults to 'Unnamed'.
    :param notes: Notes of the graph. Defaults to a description of the time
        span covered by the included edges.
    :param kwargs: Extra keyword arguments forwarded to
        ``GraphDataset.from_edges``.
    """
    window_start = datetime.min if start_time is None else start_time
    window_end = datetime.max if end_time is None else end_time
    # Earliest / latest timestamp among the included edges. Initialized so
    # that latest < earliest holds exactly when no edge was included.
    earliest = datetime.max
    latest = datetime.min
    edges = []
    node_id_map = {}

    # Construct the edge list
    with open(data_src, 'r') as f:
        for line in f:
            fields = line.rstrip('\n').split()
            src = int(fields[0])
            tgt = int(fields[1])
            stamp = datetime.fromtimestamp(int(fields[2]))
            # The next free index always equals the number of known nodes.
            for raw_id in (src, tgt):
                if raw_id not in node_id_map:
                    node_id_map[raw_id] = len(node_id_map)
            if window_start <= stamp < window_end:
                edges.append((node_id_map[src], node_id_map[tgt]))
                earliest = min(earliest, stamp)
                latest = max(latest, stamp)

    # Name and notes
    if name is None:
        name = 'Unnamed'
    if notes is None:
        if latest < earliest:
            span = 'empty'
        else:
            span = '{} - {}'.format(
                earliest.strftime('%Y-%m-%d %H:%M:%S'),
                latest.strftime('%Y-%m-%d %H:%M:%S'))
        notes = 'timespan is ' + span
    return GraphDataset.from_edges(n_nodes=len(node_id_map), edges=edges,
                                   name=name, notes=notes, **kwargs)
def setUp(self):
    # Fixture: the same weighted edge list loaded twice, once with
    # weighted=True and once with the default for `weighted`.
    weighted_edges = [
        (0, 1, 2.0),
        (0, 2, 1.5),
        (1, 2, 0.4),
        (1, 3, 0.2),
    ]
    common = dict(n_nodes=4, edges=weighted_edges)
    self.g_weighted = GraphDataset.from_edges(weighted=True, **common)
    self.g_unweighted = GraphDataset.from_edges(**common)
def setUp(self):
    # Fixture: one edge list (including the self-loop (1, 1) and leaving
    # node 4 isolated) loaded with the default for `directed` and with
    # directed=True.
    pairs = [
        (0, 1),
        (0, 2),
        (1, 1),
        (1, 2),
        (1, 3),
    ]
    common = dict(n_nodes=5, edges=pairs)
    self.g_undirected = GraphDataset.from_edges(**common)
    self.g_directed = GraphDataset.from_edges(directed=True, **common)