Beispiel #1
0
    def test_walk_generation_single_root_node_self_loner(self):
        """Walks rooted at the "self loner" node must contain only that node.

        The sampler is run for several hop configurations, first with one
        walk per root node and then with three. In every case the first
        returned walk must have the expected length and hold a single
        distinct node, namely the root.
        """
        walker = SampledBreadthFirstWalk(create_test_graph())
        nodes = ["self loner"]

        def check_first_walk(walks, n_size):
            first = walks[0]
            assert len(first) == expected_bfw_size(n_size=n_size)
            # every sampled position must hold the same node ...
            assert len(set(first)) == 1
            # ... and that node must be the root
            assert nodes[0] in set(first)

        # One walk per root node.
        n = 1
        for n_size in ([0], [1], [2, 2], [3, 2]):
            check_first_walk(walker.run(nodes=nodes, n=n, n_size=n_size), n_size)

        # Several walks per root node: the number of walks is checked as well.
        n = 3
        for n_size in ([0], [1], [2, 2], [3, 2]):
            walks = walker.run(nodes=nodes, n=n, n_size=n_size)
            assert len(walks) == n * len(nodes)
            check_first_walk(walks, n_size)
Beispiel #2
0
    def test_benchmark_bfs_walk(self, benchmark):
        """Benchmark a two-hop sampled walk from one root node of the test graph."""
        walker = SampledBreadthFirstWalk(create_test_graph())
        run_kwargs = dict(nodes=["0"], n=5, n_size=[5, 5])

        def sample():
            # The same argument objects are reused on every benchmark round.
            return walker.run(**run_kwargs)

        benchmark(sample)
    def test_benchmark_bfs_walk(self, benchmark):
        """Benchmark a two-hop sampled walk from 50 root nodes of a random graph."""
        random_graph = example_graph_random(n_nodes=100, n_edges=500)
        walker = SampledBreadthFirstWalk(random_graph)
        run_kwargs = dict(nodes=np.arange(0, 50), n=5, n_size=[5, 5])

        def sample():
            # The same argument objects are reused on every benchmark round.
            return walker.run(**run_kwargs)

        benchmark(sample)
Beispiel #4
0
    def test_fixed_random_seed(self):
        """Check that the ``seed`` argument of ``run`` controls the sampled walks."""
        walker = SampledBreadthFirstWalk(create_test_graph())

        # Two different seeds: same number of walks, different content.
        first = walker.run(nodes=[1], n=1, n_size=[7], seed=42)
        second = walker.run(nodes=[1], n=1, n_size=[7], seed=1010)
        assert len(first) == len(second)
        assert first != second

        # The same seed used twice must reproduce the walks exactly.
        # Each case is (root nodes, walks per root, first-hop size, seed).
        repeatable_cases = (
            ((1,), 1, 7, 42),
            ((1,), 5, 12, 101),
            ((9, "self loner"), 1, 12, 101),
            ((1, "self loner", 4), 5, 12, 101),
        )
        for roots, n, hop, seed in repeatable_cases:
            # Fresh argument lists are built for each call.
            first = walker.run(nodes=list(roots), n=n, n_size=[hop], seed=seed)
            second = walker.run(nodes=list(roots), n=n, n_size=[hop], seed=seed)
            assert len(first) == len(second)
            assert first == second
Beispiel #5
0
    def test_walk_generation_single_root_node(self):
        """Check the walk length for a single root node over several hop configurations."""
        walker = SampledBreadthFirstWalk(create_test_graph())
        nodes = ["0"]
        n = 1

        # No neighbours requested.
        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks[0]) == expected_bfw_size(n_size=n_size)

        # One, two and three hops with varying sample counts per hop.
        for n_size in ([2], [3], [1, 1], [2, 2], [2, 2, 2], [2, 3], [2, 3, 2]):
            walks = walker.run(nodes=nodes, n=n, n_size=n_size)
            expected_length = len(nodes) * n * expected_bfw_size(n_size=n_size)
            assert len(walks[0]) == expected_length
Beispiel #6
0
    def __init__(self, G, batch_size, num_samples, seed=None, name=None):
        """Validate the graph, store the settings and build the neighbour sampler.

        Args:
            G: Graph to sample from; must be a ``StellarGraphBase`` instance.
            batch_size: Stored unchanged as ``self.batch_size``.
            num_samples: Stored unchanged as ``self.num_samples``.
            seed: Forwarded to ``SampledBreadthFirstWalk`` as its ``seed``.
            name: Stored unchanged as ``self.name``.

        Raises:
            TypeError: If ``G`` is not a ``StellarGraphBase`` instance.
        """
        if isinstance(G, StellarGraphBase):
            G.check_graph_for_ml(features=True)
        else:
            raise TypeError("Graph must be a StellarGraph object.")

        self.name = name
        self.batch_size = batch_size
        self.num_samples = num_samples
        self.graph = G

        # The schema is kept for compatibility with HinSAGE and is also
        # handed to the sampler below.
        graph_schema = G.create_graph_schema(create_type_maps=True)
        self.schema = graph_schema

        # The sampler used to generate random samples of neighbours
        self.sampler = SampledBreadthFirstWalk(
            G, graph_schema=graph_schema, seed=seed
        )
Beispiel #7
0
    def test_weighted_all_zero(self):
        """With all edge weights zero, weighted walks hold the root followed by ``-1`` entries."""
        zero_weight_edges = pd.DataFrame(
            {"source": [0, 0], "target": [1, 2], "weight": [0.0, 0]}
        )
        walker = SampledBreadthFirstWalk(StellarGraph(edges=zero_weight_edges))

        num_walks = 10
        walks = walker.run(nodes=[0], n=num_walks, n_size=[20, 20], weighted=True)

        # root + first-hop samples + second-hop samples
        expected_length = 1 + 20 + 20 * 20

        assert len(walks) == num_walks
        for walk in walks:
            assert len(walk) == expected_length
            assert walk[0] == 0
            # nothing can be sampled, so every non-root position is -1
            np.testing.assert_array_equal(walk[1:], -1)
Beispiel #8
0
    def test_walk_generation_single_root_node_loner(self):
        """Walks rooted at the "loner" node start at it and have the expected length.

        For the single-walk cases, selected positions after the root are
        additionally checked to be ``None``.
        """
        walker = SampledBreadthFirstWalk(create_test_graph())
        nodes = ["loner"]

        # ---- one walk per root node ----
        n = 1

        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        assert len(walks[0]) == 1  # only the root node itself
        assert walks[0][0] == "loner"

        n_size = [1]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "loner" plus None
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == "loner"
        assert walks[0][1] is None

        n_size = [2, 2]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "loner" plus 2 * None + 2 * 2 * None
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == "loner"
        for position in (1, 2, 6):
            assert walks[0][position] is None

        n_size = [3, 2]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "loner" plus 3 * None + 3 * 2 * None
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == "loner"

        # ---- several walks per root node ----
        n = 3
        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        for walk in walks:
            assert len(walk) == 1  # root node only
            assert walk[0] == "loner"

        # (walks per root, samples per hop)
        for n, n_size in ((3, [1]), (99, [2, 2]), (17, [3, 2])):
            walks = walker.run(nodes=nodes, n=n, n_size=n_size)
            assert len(walks) == n
            for walk in walks:
                assert len(walk) == expected_bfw_size(n_size)
                assert walk[0] == "loner"
Beispiel #9
0
    def test_walk_generation_number_of_walks_per_root_nodes(self):
        """``run`` must return ``n`` walks of the expected length per root node."""
        walker = SampledBreadthFirstWalk(create_test_graph())

        # No neighbours requested: each walk is checked to start at the root.
        nodes = [1]
        n = 2
        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == len(nodes) * n
        for walk in walks:
            assert len(walk) == expected_bfw_size(n_size=n_size)
            assert walk[0] == nodes[0]  # should equal the root node

        # (root nodes, walks per root, hop configurations to try)
        scenarios = (
            ([1], 2, ([1], [2], [3])),
            ([1, 5], 2, ([1], [2], [3])),
            ([1, 5], 3, ([2, 2], [3, 3], [4, 4])),
        )
        for nodes, n, hop_configs in scenarios:
            for n_size in hop_configs:
                walks = walker.run(nodes=nodes, n=n, n_size=n_size)
                assert len(walks) == n * len(nodes)
                for walk in walks:
                    assert len(walk) == expected_bfw_size(n_size=n_size)
Beispiel #10
0
    def test_parameter_checking(self):
        """Check that ``run`` rejects invalid arguments with ``ValueError``.

        Every failing call changes exactly one argument away from a valid
        baseline (``nodes``, ``n``, ``n_size``), so the error can only be
        caused by the argument under test.
        """
        g = create_test_graph()
        bfw = SampledBreadthFirstWalk(g)

        # Valid baseline arguments.
        nodes = ["0", 1]
        n = 1
        n_size = [1]

        # nodes should be a list of node ids even for a single node
        for bad_nodes in (None, 0):
            with pytest.raises(ValueError):
                bfw.run(nodes=bad_nodes, n=n, n_size=n_size)

        # n has to be a positive integer
        for bad_n in (-1, 10.1, 0):
            with pytest.raises(ValueError):
                bfw.run(nodes=nodes, n=bad_n, n_size=n_size)

        # n_size has to be a list of integers: scalars, negative or fractional
        # entries and tuples are rejected. ``n`` is kept valid here so that the
        # error cannot be triggered by ``n`` instead of ``n_size``.
        for bad_n_size in (0, [-5], [2.4], (1, 2)):
            with pytest.raises(ValueError):
                bfw.run(nodes=nodes, n=n, n_size=bad_n_size)

        # seed must be a positive integer or 0
        for bad_seed in (-1235, 10.987665, -982.4746, "don't be random"):
            with pytest.raises(ValueError):
                bfw.run(nodes=nodes, n=n, n_size=n_size, seed=bad_seed)

        # An empty n_size could be read as "return just the start node",
        # but by consensus it is treated as an error.
        with pytest.raises(ValueError):
            bfw.run(nodes=["0"], n=1, n_size=[])

        # No root nodes is not an error: an empty result is returned.
        nodes = []
        subgraph = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraph) == 0
Beispiel #11
0
    def test_walk_generation_many_root_nodes(self):
        """With two root nodes, one walk of the expected length is returned per root."""
        walker = SampledBreadthFirstWalk(create_test_graph())
        nodes = ["0", 2]
        n = 1

        # No neighbours requested: each walk is just its own root node.
        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == len(nodes) * n
        for root, walk in zip(nodes, walks):
            assert len(walk) == 1
            assert walk[0] == root  # should equal the root node

        hop_configs = ([1], [2], [3], [1, 1], [2, 2], [3, 3], [2, 3], [2, 3, 2])
        for n_size in hop_configs:
            walks = walker.run(nodes=nodes, n=n, n_size=n_size)
            assert len(walks) == len(nodes) * n
            for walk in walks:
                assert len(walk) == expected_bfw_size(n_size=n_size)
Beispiel #12
0
    def test_directed_walk_generation_single_root_node(self):
        """Check sampled walks that start at the root of a small directed tree.

        The graph has the edges root -> 2, 1, "0"; 2 -> "c2.1", "c2.2" and
        1 -> "c1.1". Walk counts, walk lengths and the first node are checked
        for several ``n`` / ``n_size`` combinations; for most two-hop
        configurations the second-hop samples are validated as well.
        """
        g = nx.DiGraph()
        edges = [
            ("root", 2),
            ("root", 1),
            ("root", "0"),
            (2, "c2.1"),
            (2, "c2.2"),
            (1, "c1.1"),
        ]
        g.add_edges_from(edges)
        g = StellarDiGraph(g)

        def _check_directed_walk(walk, n_size):
            """Assert that the second-hop samples fit the first-hop node they belong to."""
            if len(n_size) > 1 and n_size[0] > 0 and n_size[1] > 0:
                for child_pos in range(n_size[0]):
                    child = walk[child_pos + 1]
                    # Second-hop samples follow the root and the n_size[0]
                    # first-hop samples, in blocks of n_size[1] per child.
                    grandchildren_start = 1 + n_size[0] + child_pos * n_size[1]
                    grandchildren_end = grandchildren_start + n_size[1]
                    grandchildren = walk[grandchildren_start:grandchildren_end]
                    if child == "root":  # node with three children
                        # The third child is the string "0", as in `edges`.
                        allowed = ["0", 1, 2]
                    elif child == "0":  # node without children
                        allowed = ["root"]
                    elif child == 1:  # node with single child
                        allowed = ["c1.1", "root"]
                    elif child == 2:  # node with two children
                        allowed = ["c2.1", "c2.2", "root"]
                    else:
                        raise AssertionError(
                            "unexpected first-hop node {!r}".format(child)
                        )
                    for grandchild in grandchildren:
                        assert grandchild in allowed

        bfw = SampledBreadthFirstWalk(g)

        nodes = ["root"]
        n = 1
        n_size = [0]

        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        assert len(subgraphs[0]) == 1  # root node only
        assert subgraphs[0][0] == "root"

        n_size = [1]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        # "root" plus child
        assert len(subgraphs[0]) == expected_bfw_size(n_size)
        assert subgraphs[0][0] == "root"

        n_size = [2, 2]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        # "root" plus 2 * child + 2 * 2 * grandchild or None
        assert len(subgraphs[0]) == expected_bfw_size(n_size)
        assert subgraphs[0][0] == "root"
        assert subgraphs[0][1] is not None
        assert subgraphs[0][2] is not None
        _check_directed_walk(subgraphs[0], n_size)

        n_size = [3, 2]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        # "root" plus 3 * child + 3 * 2 * grandchild or None
        assert len(subgraphs[0]) == expected_bfw_size(n_size)
        assert subgraphs[0][0] == "root"
        _check_directed_walk(subgraphs[0], n_size)

        n = 3
        n_size = [0]

        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        for subgraph in subgraphs:
            assert len(subgraph) == 1  # root node only
            assert subgraph[0] == "root"

        n_size = [1]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        for subgraph in subgraphs:
            # "root" plus child
            assert len(subgraph) == expected_bfw_size(n_size)
            assert subgraph[0] == "root"

        n = 99
        n_size = [2, 2]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        for subgraph in subgraphs:
            # "root" plus 2 * child + 2 * 2 * grandchild or None
            assert len(subgraph) == expected_bfw_size(n_size)
            assert subgraph[0] == "root"
            _check_directed_walk(subgraph, n_size)

        n = 17
        n_size = [3, 2]
        subgraphs = bfw.run(nodes=nodes, n=n, n_size=n_size)
        assert len(subgraphs) == n
        for subgraph in subgraphs:
            # "root" plus 3 * child + 3 * 2 * grandchild or None
            assert len(subgraph) == expected_bfw_size(n_size)
            assert subgraph[0] == "root"
Beispiel #13
0
class GraphSAGELinkGenerator:
    """A data generator for link prediction with Homogeneous GraphSAGE models

    At minimum, supply the StellarGraph, the batch size, and the number of
    node samples for each layer of the GraphSAGE model.

    The supplied graph should be a StellarGraph object that is ready for
    machine learning. Currently the model requires node features for all
    nodes in the graph.

    Use the :meth:`.flow` method supplying the links and (optionally) targets,
    or an UnsupervisedSampler instance that generates samples on demand,
    to get an object that can be used as a Keras data generator.

    Example::

        G_generator = GraphSAGELinkGenerator(G, 50, [10,10])
        train_data_gen = G_generator.flow(edge_ids)

    Args:
        G (StellarGraph): A machine-learning ready graph.
        batch_size (int): Size of batch of links to return.
        num_samples (list): List of number of neighbour node samples per GraphSAGE layer (hop) to take.
        seed (int or str), optional: Random seed for the sampling methods.
        name, optional: Name of generator
    """

    def __init__(self, G, batch_size, num_samples, seed=None, name=None):
        """Validate the graph, store the settings and build the neighbour sampler.

        Raises:
            TypeError: If ``G`` is not a ``StellarGraphBase`` instance.
        """
        if not isinstance(G, StellarGraphBase):
            raise TypeError("Graph must be a StellarGraph object.")

        G.check_graph_for_ml(features=True)

        self.graph = G
        self.num_samples = num_samples
        self.batch_size = batch_size
        self.name = name

        # We need a schema for compatibility with HinSAGE
        self.schema = G.create_graph_schema(create_type_maps=True)

        # The sampler used to generate random samples of neighbours
        self.sampler = SampledBreadthFirstWalk(G, graph_schema=self.schema, seed=seed)

    def sample_features(self, head_links, sampling_schema):
        """
        Sample neighbours recursively from the head nodes, collect the features of the
        sampled nodes, and return these as a list of feature arrays for the GraphSAGE
        algorithm.

        Args:
            head_links: An iterable of edges to perform sampling for. The code
                unpacks them with ``zip(*head_links)``, so each edge is expected
                to be a ``(source, destination)`` pair.
            sampling_schema: The sampling schema for the model; only
                ``sampling_schema[0][0][0]`` is read, as the node type.

        Returns:
            A list of feature arrays, each reshaped to
            ``(len(head_links), num_sampled_at_level, feature_size)``.
            There is one array per link end and per level, where level 0 is
            the head nodes themselves and each further level corresponds to
            one entry of ``num_samples``. The arrays of the two link ends
            alternate level by level (source level 0, destination level 0,
            source level 1, ...).
        """
        node_type = sampling_schema[0][0][0]
        head_size = len(head_links)

        # The number of samples for each head node (not including itself)
        num_full_samples = np.sum(np.cumprod(self.num_samples))

        # Reshape node samples to sensible format: split every walk into
        # consecutive slices, one per level, and concatenate the slices of all
        # walks level by level.
        def get_levels(loc, lsize, samples_per_hop, walks):
            end_loc = loc + lsize
            walks_at_level = list(
                it.chain.from_iterable(w[loc:end_loc] for w in walks)
            )
            if len(samples_per_hop) < 1:
                return [walks_at_level]
            return [walks_at_level] + get_levels(
                end_loc, lsize * samples_per_hop[0], samples_per_hop[1:], walks
            )

        # Get sampled nodes for the subgraphs for the edges where each edge is a tuple
        # of 2 nodes, so we are extracting 2 head nodes per edge
        batch_feats = []
        for hns in zip(*head_links):
            node_samples = self.sampler.run(nodes=hns, n=1, n_size=self.num_samples)

            # Isolated nodes will return only themselves in the sample list
            # let's correct for this by padding with None (the dummy node ID)
            node_samples = [
                ns + [None] * num_full_samples if len(ns) == 1 else ns
                for ns in node_samples
            ]

            nodes_per_hop = get_levels(0, 1, self.num_samples, node_samples)

            # Get features for the sampled nodes
            batch_feats.append(
                [
                    self.graph.get_feature_for_nodes(layer_nodes, node_type)
                    for layer_nodes in nodes_per_hop
                ]
            )

        # Resize features to (batch_size, n_neighbours, feature_size)
        # and re-pack features into a list where source, target feats alternate
        # This matches the GraphSAGE link model with (node_src, node_dst) input sockets:
        batch_feats = [
            np.reshape(feats, (head_size, -1, feats.shape[1]))
            for ab in zip(*batch_feats)
            for feats in ab
        ]
        return batch_feats

    def flow(self, link_ids, targets=None, shuffle=False):
        """
        Creates a generator/sequence object for training or evaluation
        with the supplied edge IDs and numeric targets.

        The edge IDs are the edges to train or inference on. They are
        expected to be tuples of (source_id, destination_id).

        The targets are an array of numeric targets corresponding to the
        supplied link_ids to be used by the downstream task. They should
        be given in the same order as the list of link IDs.
        If they are not specified (for example, for use in prediction),
        the targets will not be available to the downstream task.

        Note that the shuffle argument should be True for training and
        False for prediction.

        Args:
            link_ids (list or UnsupervisedSampler): an iterable of (src_id, dst_id) tuples
                specifying the edges or an UnsupervisedSampler object that has a generator
                method to generate samples on the fly.
            targets (optional, array): a 2D array of numeric targets with shape
                `(len(link_ids), target_size)`
            shuffle (optional, bool): If True the link_ids will be shuffled at each
                epoch, if False the link_ids will be processed in order.

        Returns:
            A LinkSequence or OnDemandLinkSequence object to use with the GraphSAGE model
            methods :meth:`fit_generator`, :meth:`evaluate_generator`, and :meth:`predict_generator`

        Raises:
            TypeError: If ``link_ids`` is neither an UnsupervisedSampler nor an iterable.
        """
        # The ABC aliases in ``collections`` (e.g. ``collections.Iterable``)
        # were removed in Python 3.10; ``collections.abc`` is the supported home.
        from collections.abc import Iterable

        # Pass sampler to on-demand link sequence generation
        if isinstance(link_ids, UnsupervisedSampler):
            return OnDemandLinkSequence(self, link_ids)

        # Otherwise pass iterable (check?) to standard LinkSequence
        if isinstance(link_ids, Iterable):
            return LinkSequence(self, link_ids, targets, shuffle)

        raise TypeError(
            "Argument to .flow not recognised. "
            "Please pass a list of samples or a UnsupervisedSampler object."
        )
    def test_directed_walk_generation_single_root_node(self, tree_graph):
        """Check sampled walks that start at the "root" node of ``tree_graph``.

        The sampler works on node ilocs, so the root is converted with
        ``node_ids_to_ilocs`` and walks are converted back with
        ``node_ilocs_to_ids`` before their second-hop samples are validated.
        """

        def root_iloc():
            return tree_graph.node_ids_to_ilocs(["root"])[0]

        def _check_directed_walk(walk, n_size):
            # Only two-hop walks with non-zero sample counts are checked.
            if not (len(n_size) > 1 and n_size[0] > 0 and n_size[1] > 0):
                return
            first_hop, second_hop = n_size[0], n_size[1]
            for position in range(first_hop):
                child = walk[position + 1]
                # Second-hop samples follow the root and the first-hop
                # samples, in blocks of `second_hop` per first-hop node.
                start = 1 + first_hop + position * second_hop
                grandchildren = walk[start:start + second_hop]
                if child == "root":  # node with three children
                    assert all(gc in [0, 1, 2] for gc in grandchildren)
                elif child == "0":  # node without children
                    assert all(gc == "root" for gc in grandchildren)
                elif child == 1:  # node with single child
                    assert all(gc in ["c1.1", "root"] for gc in grandchildren)
                elif child == 2:  # node with two children
                    assert all(
                        gc in ["c2.1", "c2.2", "root"] for gc in grandchildren
                    )
                else:
                    assert 1 == 0

        walker = SampledBreadthFirstWalk(tree_graph)
        nodes = tree_graph.node_ids_to_ilocs(["root"])

        # ---- one walk per root node ----
        n = 1

        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        assert len(walks[0]) == 1  # root node only
        assert walks[0][0] == root_iloc()

        n_size = [1]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "root" plus child
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == root_iloc()

        n_size = [2, 2]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "root" plus 2 * child + 2 * 2 * grandchild or None
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == root_iloc()
        for position in (1, 2):
            assert walks[0][position] != -1
        _check_directed_walk(tree_graph.node_ilocs_to_ids(walks[0]), n_size)

        n_size = [3, 2]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        # "root" plus 3 * child + 3 * 2 * grandchild or None
        assert len(walks[0]) == expected_bfw_size(n_size)
        assert walks[0][0] == root_iloc()
        _check_directed_walk(tree_graph.node_ilocs_to_ids(walks[0]), n_size)

        # ---- several walks per root node ----
        n = 3
        n_size = [0]
        walks = walker.run(nodes=nodes, n=n, n_size=n_size)
        assert len(walks) == n
        for walk in walks:
            assert len(walk) == 1  # root node only
            assert walk[0] == root_iloc()

        # (walks per root, samples per hop, whether second-hop samples are validated)
        multi_walk_cases = (
            (3, [1], False),
            (99, [2, 2], True),
            (17, [3, 2], False),
        )
        for n, n_size, validate_second_hop in multi_walk_cases:
            walks = walker.run(nodes=nodes, n=n, n_size=n_size)
            assert len(walks) == n
            for walk in walks:
                assert len(walk) == expected_bfw_size(n_size)
                assert walk[0] == root_iloc()
                if validate_second_hop:
                    _check_directed_walk(
                        tree_graph.node_ilocs_to_ids(walk), n_size
                    )
Beispiel #15
0
    def test_weighted(self):
        """Run weighted sampling on the weighted tree and hand every sampled node to its checker."""
        graph, check_nodes = weighted_tree()
        walker = SampledBreadthFirstWalk(graph)
        walks = walker.run(nodes=[0], n=10, n_size=[20, 20], weighted=True)

        def sampled_nodes():
            # Flatten all walks into a single stream of node ids.
            for walk in walks:
                yield from walk

        check_nodes(sampled_nodes())