def read_graph(graph_file, dataset_name, directed=False, weighted=False):
    """
    Reads the input network in networkx.

    The input is first tried as an EPGM graph; if loading it that way fails,
    ``graph_file`` is read as an edgelist with integer node ids instead.
    If the resulting graph is not connected, only its largest connected
    component is returned (weak connectivity is used for directed graphs).

    :param graph_file: The directory where graph in EPGM format is stored, or the path of an edgelist file
    :param dataset_name: The name of the graph selected out of all the graph heads in EPGM file;
        if None, the label of the first graph head is used
    :param directed: Forwarded to ``EPGM.to_nx``; for edgelist input the graph is converted
        to undirected unless this is True
    :param weighted: If True, a float ``weight`` is read for every edge of an edgelist file
        (not implemented for EPGM input); if False, every edge gets ``weight`` 1
    :return: The graph in networkx format
    :raises NotImplementedError: if ``weighted`` is True and the input was loaded as an EPGM graph
    """
    try:  # assume graph_file points to an EPGM graph
        G_epgm = EPGM(graph_file)
        graphs = G_epgm.G["graphs"]
        if dataset_name is None:
            # if dataset_name is not given, use the name of the 1st graph head
            dataset_name = graphs[0]["meta"]["label"]
            print(
                "WARNING: dataset name not specified, using dataset '{}' in the 1st graph head"
                .format(dataset_name))
        graph_id = None
        for graph_head in graphs:
            if graph_head["meta"]["label"] == dataset_name:
                graph_id = graph_head["id"]

        g = G_epgm.to_nx(graph_id, directed)
        if not weighted:
            # This is the correct way to set the edge weight in a MultiGraph.
            edge_weights = {e: 1 for e in g.edges(keys=True)}
            nx.set_edge_attributes(g, name="weight", values=edge_weights)

    except Exception:  # otherwise, assume graph_file points to an edgelist file
        if weighted:
            g = nx.read_edgelist(
                graph_file,
                nodetype=int,
                data=(("weight", float), ),
                create_using=nx.DiGraph(),
            )
        else:
            g = nx.read_edgelist(graph_file,
                                 nodetype=int,
                                 create_using=nx.DiGraph())
            for u, v in g.edges():
                g[u][v]["weight"] = 1

        if not directed:
            g = g.to_undirected()

    else:
        # Raised outside the ``try`` so that it reaches the caller instead of
        # being swallowed and retried as an edgelist read of an EPGM directory.
        if weighted:
            raise NotImplementedError(
                "weighted graphs are not supported for EPGM input")

    # nx.is_connected / nx.connected_components are not implemented for
    # directed graphs, so weak connectivity is used for those.
    if g.is_directed():
        is_connected = nx.is_weakly_connected
        components = nx.weakly_connected_components
    else:
        is_connected = nx.is_connected
        components = nx.connected_components

    if not is_connected(g):
        print("Graph is not connected")
        # take the largest connected component as the data; only that
        # component is copied, not every candidate
        largest_cc = max(components(g), key=len)
        g = g.subgraph(largest_cc).copy()
        print("Largest subgraph statistics: {} nodes, {} edges".format(
            g.number_of_nodes(), g.number_of_edges()))

    print("Graph statistics: {} nodes, {} edges".format(
        g.number_of_nodes(), g.number_of_edges()))
    return g
Beispiel #2
0
    def test_load_epgm(self):
        """Check that the EPGM at ``self.input_dir`` loads with the expected cora contents."""
        epgm = EPGM(self.input_dir)
        print(self.input_dir)

        # all three top-level EPGM collections must be present
        for section in ("graphs", "vertices", "edges"):
            assert section in epgm.G.keys()

        # there has to be at least one graph head
        graph_heads = epgm.G["graphs"]
        assert len(graph_heads) > 0

        # nodes of the first graph should expose the target (subject) attribute
        first_graph_id = graph_heads[0]["id"]
        assert self.target_attribute in epgm.node_attributes(
            first_graph_id, self.node_type)

        # cora is expected to contain 2708 vertices
        expected_node_count = 2708
        vertices = epgm.G["vertices"]
        assert len(vertices) == expected_node_count

        # every vertex carries a 'data' entry, and the target attribute takes
        # exactly 7 distinct values across them
        vertices_with_data = sum(1 for v in vertices if "data" in v)
        assert vertices_with_data == expected_node_count
        subject_values = [v["data"][self.target_attribute] for v in vertices]
        assert len(np.unique(subject_values)) == 7
Beispiel #3
0
    def test_load_epgm(self):
        """Check that the EPGM at ``self.input_dir`` loads with the expected contents."""
        epgm = EPGM(self.input_dir)

        # all three top-level EPGM collections must be present
        for section in ("graphs", "vertices", "edges"):
            assert section in epgm.G.keys()

        # there has to be at least one graph head
        graph_heads = epgm.G["graphs"]
        assert len(graph_heads) > 0

        # nodes of type self.node_type should expose self.target_attribute
        first_graph_id = graph_heads[0]["id"]
        assert self.target_attribute in epgm.node_attributes(
            first_graph_id, self.node_type)

        # the graph is expected to contain 260 vertices
        expected_node_count = 260
        vertices = epgm.G["vertices"]
        assert len(vertices) == expected_node_count

        # every vertex must carry a 'data' entry before labels are collected
        vertices_with_data = sum(1 for v in vertices if "data" in v)
        assert vertices_with_data == expected_node_count

        # vertices lacking the target attribute are ignored; the remaining
        # labels should take exactly 3 distinct values
        present_labels = []
        for vertex in vertices:
            label = vertex["data"].get(self.target_attribute)
            if label is not None:
                present_labels.append(label)
        assert len(np.unique(present_labels)) == 3
Beispiel #4
0
def load_data(path, dataset_name=None, node_type=None, target_attribute=None):
    """
    Loads the node data

     :param path: Input filename or directory where graph in EPGM format is stored
     :param dataset_name: For EPGM format, the label of the graph head to use;
        if None, the label of the first graph head is used
     :param node_type: For HINs, the node type to consider; if None, the graph must
        contain exactly one node type, which is then used
     :param target_attribute: For EPGM format, the target node attribute; if None, nodes of
        ``node_type`` must have exactly one attribute, which is then used
     :return: For EPGM input, the result of ``get_nodes`` wrapped in a numpy array; for file input,
        the rows of the space-delimited file (read as strings) sorted by the first column.
     :raises ValueError: if the EPGM data has no graph heads or none is labelled ``dataset_name``
     :raises Exception: if ``node_type`` or ``target_attribute`` is None and cannot be inferred
        unambiguously
    """
    if os.path.isdir(path):
        g_epgm = EPGM(path)
        graphs = g_epgm.G["graphs"]
        if not graphs:
            raise ValueError("No graph heads found in EPGM data at '{}'".format(path))

        # if dataset_name is not given, use the name of the 1st graph head
        if dataset_name is None:
            dataset_name = graphs[0]["meta"]["label"]

        matching_ids = [g["id"] for g in graphs if g["meta"]["label"] == dataset_name]
        if not matching_ids:
            raise ValueError(
                "Dataset '{}' not found among the graph heads in '{}'".format(
                    dataset_name, path
                )
            )
        # if several heads share the label, the last one wins
        g_id = matching_ids[-1]

        g_vertices = g_epgm.G["vertices"]  # retrieve all graph vertices

        if node_type is None:
            node_type = g_epgm.node_types(g_id)
            if len(node_type) == 1:
                node_type = node_type[0]
            else:
                raise Exception(
                    "Multiple node types detected in graph {}: {}.".format(
                        g_id, node_type
                    )
                )

        if target_attribute is None:
            target_attribute = g_epgm.node_attributes(g_id, node_type)
            if len(target_attribute) == 1:
                target_attribute = target_attribute[0]
            else:
                raise Exception(
                    "Multiple node attributes detected for nodes of type {} in graph {}: {}.".format(
                        node_type, g_id, target_attribute
                    )
                )

        y = np.array(
            get_nodes(
                g_vertices, node_type=node_type, target_attribute=target_attribute
            )
        )

    else:
        # Everything is read as str, so the sort on the first column is
        # lexicographic ("10" sorts before "2").
        y_df = pd.read_csv(path, delimiter=" ", header=None, dtype=str)
        y_df.sort_values(by=[0], inplace=True)

        y = y_df.values

    return y
Beispiel #5
0
    def test_node_types(self):
        """Exercise ``EPGM.node_types`` on the first graph head and on an invalid graph id."""
        epgm = EPGM(self.input_dir)
        first_graph_id = epgm.G["graphs"][0]["id"]

        # cora contains only one node type ('paper'), which must be the one
        # configured on the test class
        found_types = epgm.node_types(first_graph_id)
        assert len(found_types) == 1
        assert self.node_type in found_types

        # asking about an unknown graph id has to raise
        with pytest.raises(Exception):
            epgm.node_types("invalid_graph_id")
Beispiel #6
0
    def test_node_types(self):
        """Exercise ``EPGM.node_types`` on a graph that has several node types."""
        epgm = EPGM(self.input_dir)
        first_graph_id = epgm.G["graphs"][0]["id"]

        # the dataset is expected to hold exactly these three node types
        found_types = epgm.node_types(first_graph_id)
        assert len(found_types) == 3
        for expected_type in ("person", "paper", "venue"):
            assert expected_type in found_types

        # asking about an unknown graph id has to raise
        with pytest.raises(Exception):
            epgm.node_types("invalid_graph_id")
Beispiel #7
0
def from_epgm(epgm_location, dataset_name=None, directed=False):
    """
    Imports a graph stored in EPGM format to a NetworkX object

    Args:
        epgm_location (str): The directory containing the EPGM data
        dataset_name (str), optional: The name of the dataset to import.
            If empty or None, the label of the first graph head is used
            and a RuntimeWarning is issued.
        directed (bool): If True, load as a directed graph, otherwise
            load as an undirected graph

    Returns:
        A NetworkX graph containing the data for the EPGM-stored graph.

    Raises:
        ValueError: If the EPGM data contains no graph heads, or none of
            them is labelled ``dataset_name``.
    """
    G_epgm = EPGM(epgm_location)
    graphs = G_epgm.G["graphs"]
    if not graphs:
        raise ValueError(
            "No graph heads found in EPGM data at '{}'".format(epgm_location)
        )

    # if dataset_name is not given, use the name of the 1st graph head
    if not dataset_name:
        dataset_name = graphs[0]["meta"]["label"]
        warnings.warn(
            "dataset name not specified, using dataset '{}' in the 1st graph head".format(
                dataset_name
            ),
            RuntimeWarning,
            stacklevel=2,
        )

    # Select graph using dataset_name; fail with a clear message instead of an
    # UnboundLocalError when no graph head carries that label.
    matching_ids = [g["id"] for g in graphs if g["meta"]["label"] == dataset_name]
    if not matching_ids:
        raise ValueError(
            "Dataset '{}' not found among the graph heads in '{}'".format(
                dataset_name, epgm_location
            )
        )
    # if several heads share the label, the last one wins
    graph_id = matching_ids[-1]

    # Convert the selected EPGM graph to a NetworkX graph
    Gnx = G_epgm.to_nx(graph_id, directed=directed)

    print(
        "Graph statistics: {} nodes, {} edges".format(
            Gnx.number_of_nodes(), Gnx.number_of_edges()
        )
    )
    return Gnx
Beispiel #8
0
    def test_node_attributes(self):
        """Exercise ``EPGM.node_attributes`` for the configured node type on the first graph head."""
        epgm = EPGM(self.input_dir)
        first_graph_id = epgm.G["graphs"][0]["id"]

        # the target attribute ('subject' for cora) must be among the
        # attributes reported for the configured node type
        found_attributes = epgm.node_attributes(first_graph_id, self.node_type)
        assert self.target_attribute in found_attributes

        # the raw EPGM input has 1433 attributes; once predictions have been
        # written back there is one more (subject_PREDICTED), i.e. 1434
        if not self.epgm_input:
            assert (
                len(found_attributes) == 1434
            ), "There should be 1434 unique node attributes; found {}".format(
                len(found_attributes))
        else:
            assert (
                len(found_attributes) == 1433
            ), "There should be 1433 unique node attributes; found {}".format(
                len(found_attributes))

        # a node type that does not exist yields no attributes at all
        attributes_of_unknown_type = epgm.node_attributes(first_graph_id, "person")
        assert len(attributes_of_unknown_type) == 0

        # node_type is a required argument, so omitting it is a TypeError
        with pytest.raises(TypeError):
            epgm.node_attributes(first_graph_id)
Beispiel #9
0
    def test_node_attributes(self):
        """Exercise ``EPGM.node_attributes`` for the configured node type on the first graph head."""
        epgm = EPGM(self.input_dir)
        first_graph_id = epgm.G["graphs"][0]["id"]

        # 'user' nodes are expected to have a single attribute, 'elite'
        found_attributes = epgm.node_attributes(first_graph_id, self.node_type)
        attribute_count = len(found_attributes)

        assert self.target_attribute in found_attributes
        assert attribute_count == 1, (
            "There should be 1 unique node attribute; found {}".format(
                attribute_count))

        # a node type that does not exist yields no attributes at all
        attributes_of_unknown_type = epgm.node_attributes(first_graph_id, "business")
        assert len(attributes_of_unknown_type) == 0

        # node_type is a required argument, so omitting it is a TypeError
        with pytest.raises(TypeError):
            epgm.node_attributes(first_graph_id)
Beispiel #10
0
def read_graph(graph_file, dataset_name, is_directed=False, is_weighted=False):
    """
    Reads the input network in networkx.

    Three input kinds are handled, in this order: a ``gpickle`` file (chosen by
    file extension), an EPGM graph, and - if EPGM loading fails - an edgelist
    file with integer node ids.

    Args:
        graph_file: The directory where graph in EPGM format is stored, or the
            path of a gpickle or edgelist file.
        dataset_name: The name of the graph selected out of all the graph heads in EPGM file.
            If None, the label of the first graph head is used.
        is_directed: Forwarded to ``EPGM.to_nx``; for gpickle and edgelist input
            the graph is converted to undirected unless this is True.
        is_weighted: If True, a float ``weight`` is read for every edge of an
            edgelist file (not implemented for EPGM input, ignored for gpickle
            input). If False, every edge gets ``weight`` 1.

    Returns:
        The graph in networkx format

    Raises:
        NotImplementedError: If ``is_weighted`` is True and the input was
            loaded as an EPGM graph.
    """

    if graph_file.split('.')[-1] == 'gpickle':
        # NOTE(review): gpickle files are pickle-based - only load files from
        # a trusted source. read_gpickle was also removed in networkx 3.0.
        g = nx.read_gpickle(graph_file)
        # Every edge gets weight 1 regardless of is_weighted.
        # set_edge_attributes is used so that a pickled MultiGraph is handled
        # correctly too; item assignment on g[u][v] would corrupt its key dict.
        nx.set_edge_attributes(g, name="weight", values=1)

        if not is_directed:
            g = g.to_undirected()

        return g

    try:  # assume graph_file points to an EPGM graph
        G_epgm = EPGM(graph_file)
        graphs = G_epgm.G["graphs"]
        if dataset_name is None:
            # if dataset_name is not given, use the name of the 1st graph head
            dataset_name = graphs[0]["meta"]["label"]
            print(
                "WARNING: dataset name not specified, using dataset '{}' in the 1st graph head"
                .format(dataset_name))
        graph_id = None
        for graph_head in graphs:
            if graph_head["meta"]["label"] == dataset_name:
                graph_id = graph_head["id"]

        g = G_epgm.to_nx(graph_id, is_directed)
        if not is_weighted:
            # This is the correct way to set the edge weight in a MultiGraph.
            edge_weights = {e: 1 for e in g.edges(keys=True)}
            nx.set_edge_attributes(g, name="weight", values=edge_weights)
    except Exception:  # otherwise, assume graph_file points to an edgelist file
        if is_weighted:
            g = nx.read_edgelist(
                graph_file,
                nodetype=int,
                data=(("weight", float), ),
                create_using=nx.DiGraph(),
            )
        else:
            g = nx.read_edgelist(graph_file,
                                 nodetype=int,
                                 create_using=nx.DiGraph())
            for u, v in g.edges():
                g[u][v]["weight"] = 1  # {'weight': 1}

        if not is_directed:
            g = g.to_undirected()
    else:
        # Raised outside the ``try`` so that it reaches the caller instead of
        # being swallowed and retried as an edgelist read of an EPGM directory.
        if is_weighted:
            raise NotImplementedError(
                "weighted graphs are not supported for EPGM input")

    print("Graph statistics: {} nodes, {} edges".format(
        g.number_of_nodes(), g.number_of_edges()))
    return g