Example #1
0
    def test_walk_generation_loner_root_node(self):
        """Walks rooted at an edgeless node must never grow past the root itself."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        # "loner" has no edges at all, not even a self loop
        roots = ["loner"]

        # (walks per root node, requested walk length)
        scenarios = [(1, 1), (10, 1), (10, 10)]
        for num_walks, walk_length in scenarios:
            walks = walker.run(
                nodes=roots, n=num_walks, length=walk_length, seed=None
            )
            assert len(walks) == num_walks
            # always 1 since only the root node can ever be added to the walk
            assert all(len(walk) == 1 for walk in walks)
Example #2
0
    def test_walk_generation_single_root_node(self):
        """Walks from the single root "0" have exactly the requested length."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        roots = ["0"]
        fixed_seed = 42

        # one walk of length 1: only the first walk is inspected
        walks = walker.run(nodes=roots, n=1, length=1, seed=fixed_seed)
        assert len(walks[0]) == 1

        # one walk of length 2: every returned walk is inspected
        walks = walker.run(nodes=roots, n=1, length=2, seed=fixed_seed)
        assert all(len(walk) == 2 for walk in walks)

        # several walks of length 2: check the walk count as well
        for num_walks in (2, 3):
            walks = walker.run(
                nodes=roots, n=num_walks, length=2, seed=fixed_seed
            )
            assert len(walks) == num_walks
            assert all(len(walk) == 2 for walk in walks)
Example #3
0
    def test_init_parameters(self):
        """Parameters given to the constructor must act like the same parameters given to run()."""
        graph = create_test_graph()
        roots = ["0", 2]
        walk_params = dict(n=1, length=2, seed=0)

        configured = UniformRandomWalk(graph, **walk_params)
        unconfigured = UniformRandomWalk(graph)

        walks_from_init = configured.run(nodes=roots)
        walks_from_call = unconfigured.run(nodes=roots, **walk_params)
        np.testing.assert_array_equal(walks_from_init, walks_from_call)
Example #4
0
    def test_init_parameters(self):
        """Parameters given to the constructor must act like the same parameters given to run()."""
        graph = create_test_graph()
        roots = ["0", 2]
        walk_params = dict(n=1, length=2, seed=0)

        configured = UniformRandomWalk(graph, **walk_params)
        unconfigured = UniformRandomWalk(graph)

        # the configured walker runs first, as in a left-to-right comparison
        walks_from_init = configured.run(nodes=roots)
        walks_from_call = unconfigured.run(nodes=roots, **walk_params)
        assert walks_from_init == walks_from_call
Example #5
0
    def test_benchmark_uniformrandomwalk(self, benchmark):
        """Benchmark UniformRandomWalk.run on a graph from example_graph_random.

        Five walks of length five are started from each of the root nodes 0..49.
        """
        g = example_graph_random(n_nodes=100, n_edges=500)
        urw = UniformRandomWalk(g)

        nodes = np.arange(0, 50)
        # the earlier `n = 2` was immediately overwritten and never used
        n = 5
        length = 5

        benchmark(lambda: urw.run(nodes=nodes, n=n, length=length))
Example #6
0
    def test_benchmark_uniformrandomwalk(self, benchmark):
        """Benchmark UniformRandomWalk.run on the shared test graph."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        # a single root node; five walks of length five are started from it
        roots = ["0"]
        num_walks = 5
        walk_length = 5

        def generate_walks():
            return walker.run(nodes=roots, n=num_walks, length=walk_length)

        benchmark(generate_walks)
Example #7
0
    def test_walk_generation_many_root_nodes(self):
        """With several root nodes, n walks are produced per root and none exceeds the length."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        roots = ["0", 2]

        # length-1 walks consist of exactly the root node, in root order
        walks = walker.run(nodes=roots, n=1, length=1, seed=None)
        assert len(walks) == 1 * len(roots)
        for root, walk in zip(roots, walks):
            assert len(walk) == 1
            assert walk[0] == root  # should equal the root node

        # (walks per root node, maximum walk length)
        scenarios = [(1, 2), (2, 2), (2, 3), (5, 10)]
        for num_walks, max_length in scenarios:
            walks = walker.run(
                nodes=roots, n=num_walks, length=max_length, seed=None
            )
            assert len(walks) == num_walks * len(roots)
            assert all(len(walk) <= max_length for walk in walks)
Example #8
0
    def test_walk_generation_self_lonely_root_node(self):
        """Walks from a node whose only edge is a self loop stay on that node."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        # this node has a link to itself but no other edges
        root = "self lonely"
        roots = [root]

        walks = walker.run(nodes=roots, n=1, length=1, seed=None)
        assert len(walks) == 1
        assert len(walks[0]) == 1

        # (walks per root node, requested walk length)
        scenarios = [(10, 1), (1, 99), (10, 10)]
        for num_walks, walk_length in scenarios:
            walks = walker.run(
                nodes=roots, n=num_walks, length=walk_length, seed=None
            )
            assert len(walks) == num_walks
            for walk in walks:
                assert len(walk) == walk_length
                # all nodes should be the same node
                assert all(visited == "self lonely" for visited in walk)
Example #9
0
    def test_parameter_checking(self):
        """Invalid arguments to run() raise the expected exception types."""
        graph = create_test_graph()
        walker = UniformRandomWalk(graph)

        valid_args = dict(nodes=["0"], n=1, length=2, seed=None)

        # each entry overrides exactly one valid argument with an invalid value
        invalid_cases = [
            # nodes should be a list of node ids even for a single node
            (ValueError, dict(nodes=None)),
            # can't just pass a node id, need list, e.g., ["0"]
            (ValueError, dict(nodes="0")),
            # n has to be positive integer
            (ValueError, dict(n=0)),
            (ValueError, dict(n=-121)),
            (TypeError, dict(n=21.4)),
            (TypeError, dict(n=-0.5)),
            (TypeError, dict(n=0.0001)),
            (TypeError, dict(n="2")),
            # length has to be positive integer
            (ValueError, dict(length=0)),
            (ValueError, dict(length=-5)),
            (TypeError, dict(length=11.9)),
            (TypeError, dict(length=-9.9)),
            (TypeError, dict(length="10")),
            # seed has to be None, 0, or positive integer
            (ValueError, dict(seed=-1)),
            (TypeError, dict(seed=1010.8)),
        ]
        for expected_error, override in invalid_cases:
            call_args = {**valid_args, **override}
            with pytest.raises(expected_error):
                walker.run(**call_args)

        # No root nodes is not an error: an empty collection of walks comes back.
        walks = walker.run(nodes=[], n=1, length=2, seed=None)
        assert len(walks) == 0
class UnsupervisedSampler:
    """
    The UnsupervisedSampler is responsible for sampling walks in the given graph
    and returning positive and negative samples w.r.t. those walks, on demand.

    The positive samples are all the (target, context) pairs from the walks and the negative
    samples are contexts generated for each target based on a sampling distribution.

    By default, a UniformRandomWalk is used, but a custom `walker` can be specified instead. An
    error will be raised if other parameters are specified along with a custom `walker`.

    Args:
        G (StellarGraph): A stellargraph with features.
        nodes (iterable, optional) The root nodes from which individual walks start.
            If not provided, all nodes in the graph are used.
        length (int): Length of the walks for the default UniformRandomWalk walker. Length must
            be at least 2.
        number_of_walks (int): Number of walks from each root node for the default
            UniformRandomWalk walker.
        seed (int, optional): Random seed for the default UniformRandomWalk walker.
        walker (RandomWalk, optional): A RandomWalk object to use instead of the default
            UniformRandomWalk walker.

    Raises:
        ValueError: If ``G`` is not a StellarGraph, ``nodes`` is not an iterable,
            ``length`` is below 2 or ``number_of_walks`` is below 1.
    """

    def __init__(
        self,
        G,
        nodes=None,
        length=2,
        number_of_walks=1,
        seed=None,
        walker=None,
    ):
        if not isinstance(G, StellarGraph):
            raise ValueError(
                "({}) Graph must be a StellarGraph or StellarDigraph object.".
                format(type(self).__name__))
        self.graph = G

        # Instantiate the walker class used to generate random walks in the graph
        if walker is not None:
            # walk parameters are compared against their defaults, because a
            # custom walker carries its own configuration
            _warn_if_ignored(length, 2, "length")
            _warn_if_ignored(number_of_walks, 1, "number_of_walks")
            _warn_if_ignored(seed, None, "seed")
            self.walker = walker
        else:
            self.walker = UniformRandomWalk(G,
                                            n=number_of_walks,
                                            length=length,
                                            seed=seed)

        # Define the root nodes for the walks; without explicit root nodes
        # every node of the graph is used.
        if nodes is None:
            self.nodes = list(G.nodes())
        elif is_real_iterable(nodes):
            self.nodes = list(nodes)
        else:
            raise ValueError(
                "nodes parameter should be an iterable of node IDs.")

        # Require walks of at least length two because a sample pair needs at least two nodes.
        if length < 2:
            raise ValueError(
                "({}) For generating (target,context) samples, walk length has to be at least 2"
                .format(type(self).__name__))
        self.length = length

        if number_of_walks < 1:
            raise ValueError(
                "({}) At least 1 walk from each head node has to be done".
                format(type(self).__name__))
        self.number_of_walks = number_of_walks

        # Set up an internal random state with the given seed
        _, self.np_random = random_state(seed)

    def run(self, batch_size):
        """
        This method returns a batch_size number of positive and negative samples from the graph.
        A random walk is generated from each root node, which are transformed into positive context
        pairs, and the same number of negative pairs are generated from a global node sampling
        distribution. The resulting list of context pairs are shuffled and converted to batches of
        size ``batch_size``.

        Currently the global node sampling distribution for the negative pairs is the degree
        distribution to the 3/4 power. This is the same used in node2vec
        (https://snap.stanford.edu/node2vec/).

        Args:
             batch_size (int): The number of samples to generate for each batch.
                This must be an even number.

        Returns:
            List of batches, where each batch is a tuple of (list context pairs, list of labels).
            The list is empty when the walks contain no (target, context) pair at all.

        Raises:
            ValueError, TypeError: If ``batch_size`` is not a positive even ``int``.
        """
        self._check_parameter_values(batch_size)

        all_nodes = list(self.graph.nodes(use_ilocs=True))
        # Use the sampling distribution as per node2vec
        degrees = self.graph.node_degrees(use_ilocs=True)
        sampling_distribution = np.array([degrees[n]**0.75 for n in all_nodes])
        sampling_distribution_norm = sampling_distribution / np.sum(
            sampling_distribution)

        walks = self.walker.run(nodes=self.nodes)

        # first item in each walk is the target/head node
        targets = [walk[0] for walk in walks]

        positive_pairs = np.array([(target, positive_context)
                                   for target, walk in zip(targets, walks)
                                   for positive_context in walk[1:]])

        # Without any pair the array above is 1-D and the column indexing
        # below would fail; there is simply nothing to batch in that case.
        if len(positive_pairs) == 0:
            return []

        positive_pairs = self.graph.node_ids_to_ilocs(
            positive_pairs.flatten()).reshape(positive_pairs.shape)

        # one negative context per positive pair, paired with the same target
        negative_samples = self.np_random.choice(all_nodes,
                                                 size=len(positive_pairs),
                                                 p=sampling_distribution_norm)
        negative_pairs = np.column_stack(
            (positive_pairs[:, 0], negative_samples))

        pairs = np.concatenate((positive_pairs, negative_pairs), axis=0)
        labels = np.repeat([1, 0], len(positive_pairs))

        # shuffle indices - note this doesn't ensure an equal number of positive/negative
        # examples in each batch, just an equal number overall
        indices = self.np_random.permutation(len(pairs))

        batch_indices = [
            indices[i:i + batch_size]
            for i in range(0, len(indices), batch_size)
        ]

        return [(pairs[i], labels[i]) for i in batch_indices]

    def _check_parameter_values(self, batch_size):
        """
        Checks that the parameter values are valid or raises an exception with a message
        indicating the parameter (the first one encountered in the checks) with invalid value.

        Args:
            batch_size: <int> number of samples to generate in each call of generator

        Raises:
            ValueError: If ``batch_size`` is None, smaller than 1, or odd.
            TypeError: If ``batch_size`` is not exactly of type ``int``.
        """
        # must provide a batch size since this is an indicator of how many samples to return
        if batch_size is None:
            raise ValueError(
                "({}) The batch_size must be provided to generate samples for each batch in the epoch"
                .format(type(self).__name__))

        # exact type check on purpose: subclasses of int (e.g. bool) are rejected too
        if type(batch_size) is not int:
            raise TypeError(
                "({}) The batch_size must be positive integer.".format(
                    type(self).__name__))

        if batch_size < 1:  # must be greater than 0
            raise ValueError(
                "({}) The batch_size must be positive integer.".format(
                    type(self).__name__))

        # should be even since we generate 1 negative sample for each positive one.
        if batch_size % 2 != 0:
            raise ValueError(
                "({}) The batch_size must be an even integer since equal number of positive and negative samples are generated in each batch."
                .format(type(self).__name__))
Example #11
0
class UnsupervisedSampler:
    """
    The UnsupervisedSampler is responsible for sampling walks in the given graph
    and returning positive and negative samples w.r.t. those walks, on demand.

    The positive samples are all the (target, context) pairs from the walks and the negative
    samples are contexts generated for each target based on a sampling distribution.

    Currently only uniform random walks are performed, other walk strategies (such as
    second order walks) will be enabled in the future.

    Args:
        G (StellarGraph): A stellargraph with features.
        nodes (optional, iterable) The root nodes from which individual walks start.
            If not provided, all nodes in the graph are used.
        length (int): An integer giving the length of the walks. Length must be at least 2.
        number_of_walks (int): Number of walks from each root node.
        seed (int, optional): Seed passed to the UniformRandomWalk walker and used for the
            sampler's own ``random.Random`` instance.

    Raises:
        ValueError: If ``G`` is not a StellarGraphBase, ``nodes`` is not an iterable,
            ``length`` is below 2 or ``number_of_walks`` is below 1.
    """

    def __init__(self, G, nodes=None, length=2, number_of_walks=1, seed=None):
        if not isinstance(G, StellarGraphBase):
            raise ValueError(
                "({}) Graph must be a StellarGraph object.".format(
                    type(self).__name__))
        self.graph = G

        # Instantiate the walker class used to generate random walks in the graph
        # TODO: allow alternative walker classes, once it is clear how to pass
        # the options they require to run()
        self.walker = UniformRandomWalk(G, seed=seed)

        # Define the root nodes for the walks; without explicit root nodes
        # every node of the graph is used.
        if nodes is None:
            self.nodes = list(G.nodes())
        elif is_real_iterable(nodes):
            self.nodes = list(nodes)
        else:
            raise ValueError(
                "nodes parameter should be an iterable of node IDs.")

        # Require walks of at least length two because a sample pair needs at least two nodes.
        if length < 2:
            raise ValueError(
                "({}) For generating (target,context) samples, walk length has to be at least 2"
                .format(type(self).__name__))
        self.length = length

        if number_of_walks < 1:
            raise ValueError(
                "({}) At least 1 walk from each head node has to be done".
                format(type(self).__name__))
        self.number_of_walks = number_of_walks

        # Set up an internal random state with the given seed
        self.random = random.Random(seed)

    def generator(self, batch_size):
        """
        This method yields a batch_size number of positive and negative samples from the graph.
        This method generates one walk at a time of a given length from each root node and yields
        the positive pairs from the walks and the same number of negative pairs from a global
        node sampling distribution. The generator never terminates on its own: the root nodes
        are reshuffled and walked again for as long as batches are requested.

        Currently the global node sampling distribution for the negative pairs is the degree
        distribution to the 3/4 power. This is the same used in node2vec
        (https://snap.stanford.edu/node2vec/).

        Args:
             batch_size (int): The number of samples to generate for each batch.
                This must be an even number.

        Yields:
            Tuple of lists of target/context pairs and labels – 0 for a negative and 1 for a
             positive pair: ([[target, context] ,... ], [label, ...])

        Raises:
            ValueError, TypeError: If ``batch_size`` is not a positive even ``int``.
            ValueError: If the sampler has no root nodes. Like the ``batch_size`` checks,
                this is raised when the first batch is requested.
        """
        from itertools import accumulate

        self._check_parameter_values(batch_size)

        # With no root nodes the endless loop below would spin without ever yielding.
        if not self.nodes:
            raise ValueError(
                "({}) At least one root node is required to generate samples".
                format(type(self).__name__))

        all_nodes = list(self.graph.nodes())

        # Use the sampling distribution as per node2vec. The cumulative weights are
        # built once here; handing ``weights=`` to random.choices would rebuild the
        # same running sum for every single negative sample.
        degrees = self.graph.degree()
        cumulative_weights = list(
            accumulate(degrees[n]**0.75 for n in all_nodes))

        positive_pairs = []
        negative_pairs = []
        sample_counter = 0

        while True:
            self.random.shuffle(self.nodes)
            for node in self.nodes:  # iterate over root nodes
                # Get 1 walk at a time. For now it is assumed to be a uniform random walker
                walk = self.walker.run(
                    nodes=[node],  # root nodes
                    length=self.length,  # maximum length of a random walk
                    n=1,  # number of random walks per root node
                )

                # (target, context) pair sampling - GraphSAGE way
                target = walk[0][0]
                for context in walk[0][1:]:
                    # Don't add self pairs
                    if context == target:
                        continue

                    positive_pairs.append((target, context))

                    # For each positive sample, add a negative sample.
                    random_sample = self.random.choices(
                        all_nodes, cum_weights=cumulative_weights, k=1)
                    negative_pairs.append((target, *random_sample))

                    # one positive plus one negative sample were just added
                    sample_counter += 2

                    # If the batch_size number of samples are accumulated, yield.
                    if sample_counter == batch_size:
                        all_pairs = positive_pairs + negative_pairs
                        all_targets = [1] * len(positive_pairs) + [0] * len(
                            negative_pairs)

                        positive_pairs.clear()
                        negative_pairs.clear()
                        sample_counter = 0

                        # shuffle pairs and labels together, then split them again
                        edge_ids_labels = list(zip(all_pairs, all_targets))
                        self.random.shuffle(edge_ids_labels)
                        edge_ids, edge_labels = [
                            [z[i] for z in edge_ids_labels] for i in (0, 1)
                        ]

                        yield edge_ids, edge_labels

    def _check_parameter_values(self, batch_size):
        """
        Checks that the parameter values are valid or raises an exception with a message
        indicating the parameter (the first one encountered in the checks) with invalid value.

        Args:
            batch_size: <int> number of samples to generate in each call of generator

        Raises:
            ValueError: If ``batch_size`` is None, smaller than 1, or odd.
            TypeError: If ``batch_size`` is not exactly of type ``int``.
        """
        # must provide a batch size since this is an indicator of how many samples to return
        if batch_size is None:
            raise ValueError(
                "({}) The batch_size must be provided to generate samples for each batch in the epoch"
                .format(type(self).__name__))

        # exact type check on purpose: subclasses of int (e.g. bool) are rejected too
        if type(batch_size) is not int:
            raise TypeError(
                "({}) The batch_size must be positive integer.".format(
                    type(self).__name__))

        if batch_size < 1:  # must be greater than 0
            raise ValueError(
                "({}) The batch_size must be positive integer.".format(
                    type(self).__name__))

        # should be even since we generate 1 negative sample for each positive one.
        if batch_size % 2 != 0:
            raise ValueError(
                "({}) The batch_size must be an even integer since equal number of positive and negative samples are generated in each batch."
                .format(type(self).__name__))