Ejemplo n.º 1
0
def from_pajek(filename, ignore_weights=False):
    """
    A simple pajek file reader. Only supports a single network.

    The file is parsed with ``parse_pajek``, which yields the network name,
    the number of nodes, the node labels, the undirected edges and the
    directed arcs.

    * If the file contains no arcs, an undirected graph is built from the
      edges.
    * Otherwise a directed graph is built. Every undirected edge ``(a, b, w)``
      is merged into the arc list as the two arcs ``(a, b, w)`` and
      ``(b, a, w)``; an undirected self-loop contributes a single arc.

    :param filename: Path of the pajek file to read.
    :param ignore_weights: If True, the resulting graph is unweighted.
    :return: The value returned by ``GraphDataset.from_edges``.
    """
    with open(filename, 'r') as f:
        name, n_nodes, node_labels, edges, arcs = parse_pajek(f)
    if name is None:
        name = 'Unnamed'

    directed = len(arcs) > 0
    if directed:
        # Merge the undirected edges into a copy of the arc list so that the
        # directed graph contains both kinds of connections.
        merged = list(arcs)
        for e in edges:
            merged.append((e[0], e[1], e[2]))
            if e[0] != e[1]:
                # Reverse direction; a self-loop must not be added twice.
                merged.append((e[1], e[0], e[2]))
        edges = merged

    return GraphDataset.from_edges(n_nodes=n_nodes,
                                   edges=edges,
                                   weighted=not ignore_weights,
                                   directed=directed,
                                   name=name,
                                   node_labels=node_labels)
Ejemplo n.º 2
0
def from_edge_list(filename,
                   weighted=True,
                   directed=True,
                   name=None,
                   delimiter=None,
                   comment_line_start=None):
    """
    Build a graph (directed by default) from a text file holding a simple
    edge list, one edge per line:
    1 2 0.1
    2 3 0.1
    ...

    Node names may be arbitrary strings without whitespace, e.g.
    n1 n2 0.1
    n2 n3 0.2
    ...
    Such names are encoded into integers.

    :param filename: Path of the edge list file.
    :param weighted: Forwarded to ``GraphDataset.from_edges``.
    :param directed: Forwarded to ``GraphDataset.from_edges``.
    :param name: Graph name. Defaults to the file's base name without its
    extension.
    :param delimiter: Forwarded to ``parse_edge_list``.
    :param comment_line_start: Forwarded to ``parse_edge_list``.
    """
    graph_name = name
    if graph_name is None:
        # Derive the name from the file: "dir/foo.txt" -> "foo".
        base_name = os.path.basename(filename)
        graph_name, _ = os.path.splitext(base_name)

    with open(filename, 'r') as fin:
        parsed = parse_edge_list(fin, delimiter, comment_line_start)
        n_nodes, edges, node_labels = parsed
        graph = GraphDataset.from_edges(n_nodes=n_nodes,
                                        edges=edges,
                                        weighted=weighted,
                                        directed=directed,
                                        name=graph_name,
                                        node_labels=node_labels)
    return graph
Ejemplo n.º 3
0
 def test_undirected(self):
     # A weighted 4-node path graph built without the `directed` argument;
     # the expected adjacency matrix is symmetric.
     weighted_edges = [(0, 1, 0.1), (1, 2, 0.2), (2, 3, 0.3)]
     graph = GraphDataset.from_edges(n_nodes=4,
                                     edges=weighted_edges,
                                     weighted=True)
     expected = np.array([
         [0.0, 0.1, 0.0, 0.0],
         [0.1, 0.0, 0.2, 0.0],
         [0.0, 0.2, 0.0, 0.3],
         [0.0, 0.0, 0.3, 0.0],
     ])
     actual = graph.get_adj_matrix()
     self.assertTrue(np.array_equal(actual, expected))
Ejemplo n.º 4
0
 def test_from_adj_mat_dense_undirected_weighted(self):
     # Build an undirected, weighted graph from the shared sample matrix and
     # compare its name and edge set against the expected values.
     graph = GraphDataset.from_adj_mat(self.sample_A,
                                       directed=False,
                                       weighted=True,
                                       name='Weighted')
     expected_edge_set = {
         (0, 0, 1.0),
         (0, 1, 0.1),
         (0, 3, 0.3),
         (1, 3, 0.4),
         (2, 3, 0.9),
     }
     self.assertEqual(graph.name, 'Weighted')
     self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
Ejemplo n.º 5
0
def create_loop_graph(n = 5):
    """
    Creates a simple loop with n nodes.

    Node i is connected to node (i + 1) % n, so the last node is connected
    back to node 0. For n == 0 the edge list is empty.

    :param n: Number of nodes. Must be a nonnegative integer; integral floats
    such as 5.0 are accepted and converted to int.
    :return: The value returned by ``GraphDataset.from_edges`` for an
    unweighted, undirected graph named "Loop-<n>".
    :raises ValueError: If n is negative or not integral.
    """
    if n < 0 or int(n) != n:
        raise ValueError('n must be a nonnegative integer.')
    # Normalise integral floats (e.g. 5.0) so that range() below accepts them.
    n = int(n)
    # A concrete list (rather than a one-shot iterator) can be traversed
    # more than once by the consumer.
    edges = [(i, (i + 1) % n) for i in range(n)]
    return GraphDataset.from_edges(n_nodes=n, edges=edges, weighted=False,
                                   directed=False, name="Loop-{}".format(n))
Ejemplo n.º 6
0
 def test_from_simple_edge_list(self):
     # Edges given as plain (a, b) pairs: the graph is expected to be
     # unweighted and undirected, with every edge reported with weight 1.
     pairs = [(0, 0), (0, 1), (0, 2), (0, 3), (3, 4)]
     graph = GraphDataset.from_edges(n_nodes=5, edges=pairs, name='GraphX')

     self.assertEqual(graph.name, 'GraphX')
     self.assertFalse(graph.weighted)
     self.assertFalse(graph.directed)
     self.assertEqual(graph.n_nodes, 5)
     self.assertEqual(graph.n_edges, 5)

     expected_edge_set = {(pair[0], pair[1], 1) for pair in pairs}
     self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
Ejemplo n.º 7
0
 def setUp(self):
     # Fixture: a six-node loop 0-1-2-3-4-5-0 whose edges are
     # (a, b, weight, edge data) tuples, plus per-node labels and attributes.
     loop_edges = [
         (0, 1, 0.1, 'a'),
         (1, 2, 0.2, 'b'),
         (2, 3, 0.3, 'c'),
         (3, 4, 0.4, 'd'),
         (4, 5, 0.5, 'e'),
         (5, 0, 0.6, 'f'),
     ]
     labels = [11, 22, 33, 44, 55, 66]
     attributes = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6']
     graph = GraphDataset.from_edges(n_nodes=6,
                                     edges=loop_edges,
                                     weighted=True,
                                     directed=False,
                                     has_edge_data=True,
                                     node_labels=labels,
                                     node_attributes=attributes)
     self.g_loop = graph
Ejemplo n.º 8
0
 def test_from_adj_mat_dense_undirected_unweighted(self):
     # Build an undirected, unweighted graph from the shared sample matrix
     # and check that name, edges, node attributes and node labels match.
     expected_attrs = ['a', 'b', 'c', 'd']
     expected_labels = [1, 2, 4, 5]
     graph = GraphDataset.from_adj_mat(self.sample_A,
                                       directed=False,
                                       weighted=False,
                                       name='Unweighted',
                                       node_attributes=expected_attrs,
                                       node_labels=expected_labels)
     expected_edge_set = {
         (0, 0, 1),
         (0, 1, 1),
         (0, 3, 1),
         (1, 3, 1),
         (2, 3, 1),
     }
     self.assertEqual(graph.name, 'Unweighted')
     self.assertSetEqual(set(graph.get_edge_iter()), expected_edge_set)
     self.assertListEqual(list(graph.node_attributes), expected_attrs)
     self.assertListEqual(list(graph.node_labels), expected_labels)
Ejemplo n.º 9
0
def create_random_graph(n = 10, p = 0.1):
    """
    Creates a simple random graph, where each edge is added with
    probability p.

    Every unordered pair (i, j) with i < j is considered once, so the graph
    has no self-loops and no duplicate edges.

    :param n: Number of nodes. Must be a nonnegative integer; integral floats
    such as 10.0 are accepted and converted to int.
    :param p: Probability of including each edge, in [0, 1].
    :return: The value returned by ``GraphDataset.from_edges`` for an
    unweighted, undirected graph named "RandomGraph-<n>-<p>".
    :raises ValueError: If n is negative or not integral, or if p is not
    within [0, 1] (this includes NaN).
    """
    if n < 0 or int(n) != n:
        raise ValueError('n must be a nonnegative integer.')
    # Written as a positive range check so that NaN is rejected as well.
    if not (0. <= p <= 1.):
        raise ValueError('p must be a probability.')
    # Normalise integral floats (e.g. 10.0) so that range() accepts them.
    n = int(n)
    edges = []
    for i in range(n):
        for j in range(i + 1, n):
            # random.random() is in [0, 1): a strict comparison guarantees
            # no edge for p == 0 and every edge for p == 1.
            if random.random() < p:
                edges.append((i, j))
    return GraphDataset.from_edges(n_nodes=n, edges=edges, weighted=False,
                                   directed=False, name="RandomGraph-{}-{}".format(n, p))
Ejemplo n.º 10
0
def load_data():
    """
    Builds the "Zachary's Karate Club" graph from a hard-coded edge list.

    The list below holds 78 (a, b) node pairs with zero-based node ids in the
    range 0..33. The graph is constructed as unweighted and undirected with
    34 nodes.

    :return: The value returned by ``GraphDataset.from_edges``.
    """
    # Hard-coded edge list; each tuple is one undirected edge.
    edges = [(0, 1), (0, 2), (1, 2), (0, 3), (1, 3), (2, 3), (0, 4), (0, 5),
             (0, 6), (4, 6), (5, 6), (0, 7), (1, 7), (2, 7), (3, 7), (0, 8),
             (2, 8), (2, 9), (0, 10), (4, 10), (5, 10), (0, 11), (0, 12),
             (3, 12), (0, 13), (1, 13), (2, 13), (3, 13), (5, 16), (6, 16),
             (0, 17), (1, 17), (0, 19), (1, 19), (0, 21), (1, 21), (23, 25),
             (24, 25), (2, 27),
             (23, 27), (24, 27), (2, 28), (23, 29), (26, 29), (1, 30), (8, 30),
             (0, 31), (24, 31), (25, 31), (28, 31), (2, 32), (8, 32), (14, 32),
             (15, 32), (18, 32), (20, 32), (22, 32), (23, 32), (29, 32),
             (30, 32), (31, 32), (8, 33), (9, 33), (13, 33), (14, 33),
             (15, 33), (18, 33), (19, 33), (20, 33), (22, 33), (23, 33),
             (26, 33), (27, 33), (28, 33), (29, 33), (30, 33), (31, 33),
             (32, 33)]
    return GraphDataset.from_edges(n_nodes=34,
                                   edges=edges,
                                   weighted=False,
                                   directed=False,
                                   name='Zachary\'s Karate Club')
Ejemplo n.º 11
0
def load_data(data_src=None):
    """
    Loads the US power grid network
    (https://toreopsahl.com/datasets/#uspowergrid).

    Each line of the edge list file is split on whitespace; its first two
    fields are read as one-based node ids and shifted to zero-based ids. Any
    further fields on the line are ignored.

    :param data_src: Location of the edge list file downloaded from
    http://opsahl.co.uk/tnet/datasets/USpowergrid_n4941.txt. When omitted,
    the file is fetched with ``download_file`` into the configured
    ``data_dir`` setting.
    :return: The value returned by ``GraphDataset.from_edges`` for an
    unweighted, undirected graph with 4941 nodes.
    """
    if data_src is None:
        data_src = download_file(
            'http://opsahl.co.uk/tnet/datasets/USpowergrid_n4941.txt',
            get_setting('data_dir'))

    with open(data_src, 'r') as fin:
        # Lazily parse the file; the graph is built while it is still open.
        field_rows = (line.rstrip('\n').split() for line in fin)
        edge_stream = ((int(fields[0]) - 1, int(fields[1]) - 1)
                       for fields in field_rows)
        return GraphDataset.from_edges(n_nodes=4941,
                                       edges=edge_stream,
                                       weighted=False,
                                       directed=False,
                                       name='US Power Grid')
Ejemplo n.º 12
0
def _load_asunetwork_dataset(data_dir, expected_n_nodes, expected_n_edges,
                             **kwargs):
    """
    Processes the ASU network datasets:
    http://socialcomputing.asu.edu/pages/datasets

    Node names are read from nodes.csv (one per line) and mapped to
    zero-based ids in file order. Edges are read from edges.csv as
    comma-separated pairs of node names. If group-edges.csv exists, each of
    its lines ``node_name,group_id`` adds ``group_id`` to the label list of
    that node, so a node may end up with zero, one or several labels.

    :param data_dir: Directory containing nodes.csv, edges.csv, and
    group-edges.csv
    :param expected_n_nodes: Number of distinct node names nodes.csv must
    contain.
    :param expected_n_edges: Number of lines edges.csv must contain.
    :param kwargs: Extra keyword arguments forwarded to
    ``GraphDataset.from_edges``. A ``node_labels`` entry given here takes
    precedence over the labels read from group-edges.csv.
    :return: The value returned by ``GraphDataset.from_edges``.
    :raises AssertionError: If the node or edge count does not match the
    expected value.
    """
    node_file = os.path.join(data_dir, 'nodes.csv')
    edge_file = os.path.join(data_dir, 'edges.csv')
    # Load nodes
    with open(node_file, 'r') as f:
        node_names = [l.rstrip('\n') for l in f]
    node_name_map = {x: i for i, x in enumerate(node_names)}
    assert len(node_name_map) == expected_n_nodes, \
        'Unexpected number of nodes in ' + node_file
    # Load edges
    edges = []
    with open(edge_file, 'r') as f:
        for l in f:
            splits = l.rstrip('\n').split(',')
            edges.append((node_name_map[splits[0]], node_name_map[splits[1]]))
    assert len(edges) == expected_n_edges, \
        'Unexpected number of edges in ' + edge_file

    # Process labels
    # Note: each node can have more than one label or zero label.
    label_file = os.path.join(data_dir, 'group-edges.csv')
    if os.path.exists(label_file):
        node_labels = [[] for _ in range(expected_n_nodes)]
        with open(label_file, 'r') as f:
            for l in f:
                splits = l.rstrip('\n').split(',')
                nid = node_name_map[splits[0]]
                gid = int(splits[1])
                node_labels[nid].append(gid)
    else:
        node_labels = None

    # Hand the labels to the graph (previously they were computed and then
    # dropped). setdefault keeps an explicit node_labels from the caller and
    # avoids a duplicate-keyword error.
    kwargs.setdefault('node_labels', node_labels)
    return GraphDataset.from_edges(n_nodes=expected_n_nodes,
                                   edges=edges,
                                   **kwargs)
Ejemplo n.º 13
0
def _load(dataset_name, data_dir=None, download_url=None):
    """
    General data loader.

    Let

    n = total number of nodes
    m = total number of edges
    N = number of graphs

    DS_A.txt (m lines): sparse (block diagonal) adjacency matrix for all graphs,
    each line corresponds to (row, col) resp. (node_id, node_id). **All graphs
    are undirected. Hence, DS_A.txt contains two entries for each edge.**
    DS_graph_indicator.txt (n lines): column vector of graph identifiers for all
    nodes of all graphs, the value in the i-th line is the graph_id of the node
    with node_id i.
    DS_graph_labels.txt (N lines): class labels for all graphs in the data set,
    the value in the i-th line is the class label of the graph with graph_id i.
    DS_node_labels.txt (n lines): column vector of node labels, the value in the
    i-th line corresponds to the node with node_id i.

    There are optional files if the respective information is available:

    DS_edge_labels.txt (m lines; same size as DS_A.txt): labels for the
    edges in DS_A.txt.
    DS_edge_attributes.txt (m lines; same size as DS_A.txt): attributes for the
    edges in DS_A.txt.
    DS_node_attributes.txt (n lines): matrix of node attributes, the comma
    separated values in the i-th line is the attribute vector of the node with
    node_id i.
    DS_graph_attributes.txt (N lines): regression values for all graphs in the
    data set, the value in the i-th line is the attribute of the graph
    with graph_id i.

    Here "DS" stands for ``dataset_name``. Of the files above, this loader
    reads the adjacency, graph indicator and graph label files, plus the node
    label, node attribute, edge label and edge attribute files when they
    exist. Node and graph ids in the files are one-based and are converted to
    zero-based ids.

    :param dataset_name: Name of the dataset; used as the file name prefix and
    as the name of the returned dataset.
    :param data_dir: Directory containing the dataset files. Defaults to
    ``<data_dir setting>/<dataset_name>``.
    :param download_url: Optional URL. If the adjacency file is missing, the
    data is fetched from this URL with ``download_file`` into the ``data_dir``
    setting.
    :return: A ``MultiGraphDataset`` built from the per-graph subgraphs, the
    graph labels and ``dataset_name``.
    :raises Exception: If the number of edge labels / edge attributes does not
    match the number of edges, or the number of node labels / node attributes
    does not match the number of nodes.
    """

    if data_dir is None:
        data_dir = os.path.join(get_setting('data_dir'), dataset_name)

    # i-th line corresponds to the i-th edge: (na, nb). na and nb start from 1.
    edge_definition_filename = os.path.join(data_dir, dataset_name + '_A.txt')
    # i-th line corresponds to the i-th edge's label
    edge_label_filename = os.path.join(data_dir,
                                       dataset_name + '_edge_labels.txt')
    # i-th line corresponds to the i-th edge's attribute
    edge_attributes_filename = os.path.join(
        data_dir, dataset_name + '_edge_attributes.txt')
    # i-th line is the graph_id of the node with node_id i
    graph_indicator_filename = os.path.join(
        data_dir, dataset_name + '_graph_indicator.txt')
    # i-th line is the label of the i-th graph
    graph_label_filename = os.path.join(data_dir,
                                        dataset_name + '_graph_labels.txt')
    # i-th line is the label of the i-th node
    node_label_filename = os.path.join(data_dir,
                                       dataset_name + '_node_labels.txt')
    # i-th line is the attribute of the i-th node
    node_attributes_filename = os.path.join(
        data_dir, dataset_name + '_node_attributes.txt')

    # Download if possible
    if not os.path.exists(
            edge_definition_filename) and download_url is not None:
        print('Downloading data from ' + download_url)
        # NOTE(review): the download targets the configured data_dir setting,
        # not the data_dir argument - confirm this is intended when a custom
        # data_dir is passed.
        download_file(download_url, get_setting('data_dir'), unpack=True)

    # Load graph labels
    with open(graph_label_filename, 'r') as f:
        graph_labels = [int(l) for l in f]
    n_graphs = len(graph_labels)

    # Load edges
    edges = []
    with open(edge_definition_filename, 'r') as f:
        for l in f:
            splits = l.split(',')
            # Convert to zero-based indexing.
            edges.append((int(splits[0]) - 1, int(splits[1]) - 1))

    # Load edge labels
    if os.path.exists(edge_label_filename):
        with open(edge_label_filename, 'r') as f:
            edge_labels = [int(l) for l in f]
        if len(edge_labels) != len(edges):
            raise Exception(
                'The length of the edge label list does not match the number of edges.'
            )
    else:
        edge_labels = None

    # Load edge attributes
    if os.path.exists(edge_attributes_filename):
        # Read the attribute file itself (the adjacency file was opened here
        # by mistake, which turned the node ids into "attributes").
        with open(edge_attributes_filename, 'r') as f:
            edge_attributes = [tuple(map(float, l.split(','))) for l in f]
        if len(edge_attributes) != len(edges):
            raise Exception(
                'The length of the edge attribute list does not match the number of edges.'
            )
    else:
        edge_attributes = None

    # Combine edge labels and attributes into edge data tuples
    if edge_attributes is None:
        if edge_labels is not None:
            edge_data = [LabelAttrTuple(label, None) for label in edge_labels]
        else:
            edge_data = None
    else:
        if edge_labels is None:
            edge_data = [
                LabelAttrTuple(None, attr) for attr in edge_attributes
            ]
        else:
            # Both lists were verified above to have one entry per edge.
            edge_data = [
                LabelAttrTuple(label, attr)
                for label, attr in zip(edge_labels, edge_attributes)
            ]

    # Load graph indicators: graph_indicators[g] lists the zero-based node
    # ids that belong to graph g.
    graph_indicators = [[] for _ in range(n_graphs)]
    n_nodes_total = 0
    with open(graph_indicator_filename, 'r') as f:
        # Convert to zero-based indexing.
        for i, l in enumerate(f):
            cur_graph_id = int(l) - 1
            graph_indicators[cur_graph_id].append(i)
            n_nodes_total += 1

    # Load node labels
    if os.path.exists(node_label_filename):
        with open(node_label_filename, 'r') as f:
            node_labels = [int(l) for l in f]
        if len(node_labels) != n_nodes_total:
            raise Exception(
                'The length of the node label list does not match the number of nodes.'
            )
    else:
        node_labels = None

    # Load node attributes
    if os.path.exists(node_attributes_filename):
        with open(node_attributes_filename, 'r') as f:
            node_attributes = [tuple(map(float, l.split(','))) for l in f]
        if len(node_attributes) != n_nodes_total:
            raise Exception(
                'The length of the node attribute list does not match the number of nodes.'
            )
    else:
        node_attributes = None

    # Construct the full graph
    if edge_data is None:
        g = GraphDataset.from_edges(n_nodes=n_nodes_total,
                                    edges=edges,
                                    weighted=False,
                                    directed=False,
                                    node_labels=node_labels,
                                    node_attributes=node_attributes)
    else:
        # All graphs are undirected: the adjacency file lists both (a, b) and
        # (b, a), so keep only the entries with a <= b and attach their data.
        zipped = filter(lambda t: t[0][0] <= t[0][1], zip(edges, edge_data))
        edge_iter = map(lambda t: (t[0][0], t[0][1], 1, t[1]), zipped)
        g = GraphDataset.from_edges(n_nodes=n_nodes_total,
                                    edges=edge_iter,
                                    weighted=False,
                                    directed=False,
                                    has_edge_data=True,
                                    node_labels=node_labels,
                                    node_attributes=node_attributes)

    # Split the full graph into one subgraph per graph id; the subgraphs are
    # named "<dataset_name>-<one-based graph id>".
    graphs = [None] * n_graphs
    for i in tqdm(range(n_graphs)):
        graphs[i] = g.subgraph(nodes_to_keep=graph_indicators[i],
                               name=dataset_name + '-' + str(i + 1))
    return MultiGraphDataset(graphs, graph_labels, dataset_name)
Ejemplo n.º 14
0
def from_mtx_file(filename, weighted=True, directed=True, name=None):
    """
    Build a graph from the matrix that ``mmread`` loads from a file.

    The matrix is interpreted as an adjacency matrix by
    ``GraphDataset.from_adj_mat``.

    :param filename: Passed to ``mmread`` (presumably a Matrix Market file;
    verify against the module's import of ``mmread``).
    :param weighted: Forwarded to ``GraphDataset.from_adj_mat``.
    :param directed: Forwarded to ``GraphDataset.from_adj_mat``.
    :param name: Forwarded to ``GraphDataset.from_adj_mat``.
    """
    adj_mat = mmread(filename)
    options = dict(weighted=weighted, directed=directed, name=name)
    return GraphDataset.from_adj_mat(adj_mat, **options)
Ejemplo n.º 15
0
def load_temporal(data_src,
                  start_time=None,
                  end_time=None,
                  name=None,
                  notes=None,
                  **kwargs):
    """
    Base loader for SNAP temporal graph datasets such as
    http://snap.stanford.edu/data/sx-mathoverflow.html.
    Every line of the data file is a whitespace-separated triplet
    (SRC, TGT, UNIXTS) describing one edge:
        SRC: id of the source node (a user)
        TGT: id of the target node (a user)
        UNIXTS: Unix timestamp (seconds since the epoch)

    Node ids are remapped to consecutive integers in order of first
    appearance. Nodes are registered even when their edge falls outside the
    requested time window, so they still count towards the number of nodes.

    :param data_src: Specifies the location of the edge list file.

    :param start_time: Optional datetime object. Edges formed before
    start_time are not included.

    :param end_time: Optional datetime object. Edges formed at or after
    end_time are not included.

    :param name: Graph name. Defaults to 'Unnamed'.

    :param notes: Graph notes. Defaults to a description of the time span
    covered by the included edges ('timespan is empty' if there are none).

    :param kwargs: Extra keyword arguments forwarded to
    ``GraphDataset.from_edges``.
    """
    window_start = datetime.min if start_time is None else start_time
    window_end = datetime.max if end_time is None else end_time

    # Earliest / latest timestamp among the edges actually kept. The initial
    # values are inverted so that "no edge kept" is detectable afterwards.
    first_kept = datetime.max
    last_kept = datetime.min

    edges = []
    node_id_map = {}
    # Construct the edge list
    with open(data_src, 'r') as fin:
        for line in fin:
            fields = line.rstrip('\n').split()
            src = int(fields[0])
            tgt = int(fields[1])
            ts = datetime.fromtimestamp(int(fields[2]))
            for raw_id in (src, tgt):
                if raw_id not in node_id_map:
                    # The next free id equals the number of ids handed out.
                    node_id_map[raw_id] = len(node_id_map)
            if not (ts >= window_start and ts < window_end):
                continue
            edges.append((node_id_map[src], node_id_map[tgt]))
            first_kept = min(first_kept, ts)
            last_kept = max(last_kept, ts)

    # Name and notes
    if name is None:
        name = 'Unnamed'
    if notes is None:
        if last_kept < first_kept:
            span = 'empty'
        else:
            time_format = '%Y-%m-%d %H:%M:%S'
            span = '{} - {}'.format(first_kept.strftime(time_format),
                                    last_kept.strftime(time_format))
        notes = 'timespan is ' + span

    return GraphDataset.from_edges(n_nodes=len(node_id_map),
                                   edges=edges,
                                   name=name,
                                   notes=notes,
                                   **kwargs)
Ejemplo n.º 16
0
 def setUp(self):
     # Fixture: the same (a, b, weight) edge list is used to build one graph
     # with weighted=True and one with the default weighted setting.
     weighted_edges = [
         (0, 1, 2.0),
         (0, 2, 1.5),
         (1, 2, 0.4),
         (1, 3, 0.2),
     ]
     common = dict(n_nodes=4, edges=weighted_edges)
     self.g_weighted = GraphDataset.from_edges(weighted=True, **common)
     self.g_unweighted = GraphDataset.from_edges(**common)
Ejemplo n.º 17
0
 def setUp(self):
     # Fixture: the same edge list (including the self-loop (1, 1)) is used
     # to build one graph with the default direction setting and one with
     # directed=True. Node 4 has no edges.
     pairs = [(0, 1), (0, 2), (1, 1), (1, 2), (1, 3)]
     common = dict(n_nodes=5, edges=pairs)
     self.g_undirected = GraphDataset.from_edges(**common)
     self.g_directed = GraphDataset.from_edges(directed=True, **common)