Example 1
    def fit(self, graph):
        """
        Fitting a Walklets model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        walker = RandomWalker(self.walk_length, self.walk_number)
        walker.do_walks(graph)
        num_of_nodes = graph.number_of_nodes()

        self._embedding = []
        for power in range(1, self.window_size + 1):
            walklets = self._select_walklets(walker.walks, power)
            model = Word2Vec(walklets,
                             hs=0,
                             alpha=self.learning_rate,
                             epochs=self.epochs,
                             vector_size=self.dimensions,
                             window=1,
                             min_count=self.min_count,
                             workers=self.workers)

            embedding = np.array(
                [model.wv[str(n)] for n in range(num_of_nodes)])
            self._embedding.append(embedding)
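
The _select_walklets helper called above is not shown in this example. A minimal sketch, assuming it mirrors the sub-sampling used by the SINE variant in Example 11 (keeping every power-th node of each walk, so that with window=1 the Word2Vec pass implicitly factorizes the power-th adjacency matrix):

    def _select_walklets(self, walks, power):
        # Sketch only: for each starting offset, keep every power-th node,
        # making consecutive tokens in a walklet exactly `power` hops apart.
        walklets = []
        for walk in walks:
            for step in range(power + 1):
                walklets.append(
                    [n for i, n in enumerate(walk[step:]) if i % power == 0])
        return walklets
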
Example 2
    def fit(self, graph: nx.classes.graph.Graph, X: Union[np.ndarray,
                                                          coo_matrix]):
        """
        Fitting a SINE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The matrix of node features.
        """
        self._set_seed()
        self._check_graph(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._features = self._feature_transform(graph, X)
        self._select_walklets()

        model = Word2Vec(self._walklets,
                         hs=0,
                         alpha=self.learning_rate,
                         vector_size=self.dimensions,
                         window=1,
                         min_count=self.min_count,
                         workers=self.workers,
                         seed=self.seed,
                         epochs=self.epochs)

        self.embedding = np.array(
            [model.wv[str(n)] for n in range(graph.number_of_nodes())])
Example 3
    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a DeepWalk model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        graph = self._check_graph(graph)
        walker = RandomWalker(self.walk_length, self.walk_number)
        walker.do_walks(graph)

        model = Word2Vec(
            walker.walks,
            hs=1,
            alpha=self.learning_rate,
            epochs=self.epochs,
            vector_size=self.dimensions,
            window=self.window_size,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
        )

        num_of_nodes = graph.number_of_nodes()
        self._embedding = [model.wv[str(n)] for n in range(num_of_nodes)]
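
A minimal usage sketch for this fit method (the graph is illustrative, and a get_embedding accessor like the ones in Examples 10-12 is assumed):

    import networkx as nx

    graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)
    model = DeepWalk()
    model.fit(graph)
    embedding = model.get_embedding()  # one row of `dimensions` values per node
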
Example 4
    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a Role2vec model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        self._check_graph(graph)
        walker = RandomWalker(self.walk_length, self.walk_number)
        walker.do_walks(graph)

        hasher = WeisfeilerLehmanHashing(
            graph=graph,
            wl_iterations=self.wl_iterations,
            attributed=False,
            erase_base_features=self.erase_base_features)

        node_features = hasher.get_node_features()
        documents = self._create_documents(walker.walks, node_features)

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        workers=self.workers,
                        sample=self.down_sampling,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [
            model.dv[str(i)] for i, _ in enumerate(documents)
        ]
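
The _create_documents helper is not shown here. A hedged sketch, modeled on the document construction MUSAE uses in Example 10 (the window_size attribute, the exact pairing rule, and matching key types between walk entries and the feature dictionary are assumptions): each node pools the structural features of nodes it co-occurs with in a walk window, and the pooled tokens become one TaggedDocument per node.

    def _create_documents(self, walks, features):
        # Sketch only; TaggedDocument comes from gensim.models.doc2vec.
        # Assumes walk entries and `features` keys use the same node type.
        pooled = {node: [] for node in features}
        for walk in walks:
            for i in range(len(walk) - self.window_size):
                for j in range(1, self.window_size + 1):
                    source, target = walk[i], walk[i + j]
                    pooled[source] += features[target]
                    pooled[target] += features[source]
        return [TaggedDocument(words=pooled[node], tags=[str(node)])
                for node in pooled]
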
Example 5
    def fit(self, graph):
        """
        Fitting a DeepWalk model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """

        if not self.check_if_walk_exists():
            self._check_graph(graph)
            # This creates the sentences and keeps them in memory
            walker = RandomWalker(self.walk_length, self.walk_number)
            walker.do_walks(graph)
            sentences = walker.walks
        else:
            # This returns an iterator
            sentences = self.load_prewalk()

        self.model = Word2Vec(sentences,
                              hs=1,
                              alpha=self.learning_rate,
                              epochs=self.epochs,
                              vector_size=self.dimensions,
                              window=self.window_size,
                              min_count=self.min_count,
                              workers=self.workers)
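
The load_prewalk helper referenced above is not shown. A plausible sketch, assuming the pre-computed walks sit in a text file with one walk per line as whitespace-separated node ids (the prewalk_path attribute is hypothetical): gensim's LineSentence gives a restartable iterator that Word2Vec can stream without holding the corpus in memory.

    def load_prewalk(self):
        # Assumed format: one walk per line, node ids separated by spaces.
        from gensim.models.word2vec import LineSentence
        return LineSentence(self.prewalk_path)
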
Example 6
    def fit(self, graph):
        """
        Fitting a GEMSEC model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._check_graph(graph)
        self._setup_sampling_weights(graph)
        self.walker = RandomWalker(self.walk_length, self.walk_number)
        self.walker.do_walks(graph)
        self._initialize_node_embeddings(graph)
        self._initialize_cluster_centers(graph)
        self._do_gradient_descent()
Example 7
    def fit(self, graph, X):
        """
        Fitting a MUSAE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The binary matrix of node features.
        """
        self.graph = graph
        self.walker = RandomWalker(self.walk_length, self.walk_number)
        self.walker.do_walks(graph)
        self.features = self._feature_transform(graph, X)
        self.base_docs = self._create_base_docs()
        self.embeddings = [self._create_single_embedding(self.base_docs)]
        self._learn_musae_embedding()
Example 8
    def fit(self, graph: nx.classes.graph.Graph, X: Union[np.ndarray, coo_matrix]):
        """
        Fitting an AE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The binary matrix of node features.
        """
        self._set_seed()
        self._check_graph(graph)
        self.graph = graph
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self.features = self._feature_transform(graph, X)
        self._base_docs = self._create_base_docs()
        self._embeddings = [self._create_single_embedding(self._base_docs)]
        self._learn_ae_embedding()
Example 9
    def fit(self, graph):
        """
        Fitting a DeepWalk model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        walker = RandomWalker(self.walk_length, self.walk_number)
        walker.do_walks(graph)

        model = Word2Vec(walker.walks,
                         hs=1,
                         alpha=self.learning_rate,
                         epochs=self.epochs,
                         vector_size=self.dimensions,
                         window=self.window_size,
                         min_count=self.min_count,
                         workers=self.workers)

        num_of_nodes = graph.number_of_nodes()
        self._embedding = [model.wv[str(n)] for n in range(num_of_nodes)]
Example 10
class MUSAE(Estimator):
    r"""An implementation of `"MUSAE" <https://arxiv.org/abs/1909.13021>`_
    from the Arxiv '19 paper "MUSAE: Multi-Scale Attributed Node Embedding". The
    procedure uses attributed random walks to approximate the products of adjacency
    matrix powers with the node feature matrix. These products are decomposed
    implicitly by a Skip-Gram style optimizer. The per-scale embeddings are
    concatenated to form a multi-scale attributed node embedding in which the
    feature distributions at different scales remain separable.

    Args:
        walk_number (int): Number of random walks. Default is 5.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 32.
        workers (int): Number of cores. Default is 4.
        window_size (int): Matrix power order. Default is 3.
        epochs (int): Number of epochs. Default is 5.
        learning_rate (float): HogWild! learning rate. Default is 0.05.
        down_sampling (float): Down sampling rate in the corpus. Default is 0.0001.
        min_count (int): Minimal count of node occurrences. Default is 1.
    """
    def __init__(self,
                 walk_number=5,
                 walk_length=80,
                 dimensions=32,
                 workers=4,
                 window_size=3,
                 epochs=5,
                 learning_rate=0.05,
                 down_sampling=0.0001,
                 min_count=1):

        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.down_sampling = down_sampling
        self.min_count = min_count

    def _feature_transform(self, graph, X):
        features = {str(node): [] for node in graph.nodes()}
        nodes = X.row
        for i, node in enumerate(nodes):
            features[str(node)].append("feature_" + str(X.col[i]))
        return features

    def _create_single_embedding(self, document_collections):
        model = Doc2Vec(document_collections,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        alpha=self.learning_rate,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs)

        emb = np.array([
            model.dv[str(n)] for n in range(self.graph.number_of_nodes())
        ])
        return emb

    def _create_documents(self, features):
        features_out = [
            TaggedDocument(words=[
                str(feat) for feat_elems in feature_set for feat in feat_elems
            ],
                           tags=[str(node)])
            for node, feature_set in features.items()
        ]
        return features_out

    def _setup_musae_features(self, approximation):
        features = {str(node): [] for node in self.graph.nodes()}
        # Pair nodes that are exactly `approximation` steps apart in a walk;
        # each side collects the other's feature tokens plus its node id.
        for walk in self.walker.walks:
            for i in range(len(walk) - approximation):
                source = walk[i]
                target = walk[i + approximation]
                features[str(source)].append(self.features[str(target)] +
                                             [str(target)])
                features[str(target)].append(self.features[str(source)] +
                                             [str(source)])

        return self._create_documents(features)

    def _learn_musae_embedding(self):
        for approximation in range(self.window_size):
            features = self._setup_musae_features(approximation + 1)
            embedding = self._create_single_embedding(features)
            self.embeddings.append(embedding)

    def _create_base_docs(self):
        features_out = [
            TaggedDocument(words=[str(feature) for feature in features],
                           tags=[str(node)])
            for node, features in self.features.items()
        ]
        return features_out

    def fit(self, graph, X):
        """
        Fitting a MUSAE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The binary matrix of node features.
        """
        self._check_graph(graph)
        self.graph = graph
        self.walker = RandomWalker(self.walk_length, self.walk_number)
        self.walker.do_walks(graph)
        self.features = self._feature_transform(graph, X)
        self.base_docs = self._create_base_docs()
        self.embeddings = [self._create_single_embedding(self.base_docs)]
        self._learn_musae_embedding()

    def get_embedding(self):
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        embedding = np.concatenate(self.embeddings, axis=1)
        return embedding
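
A minimal usage sketch for MUSAE (the random graph and the random binary feature matrix are illustrative):

    import networkx as nx
    import numpy as np
    from scipy.sparse import coo_matrix

    graph = nx.newman_watts_strogatz_graph(50, 5, 0.3)
    X = coo_matrix(np.random.binomial(1, 0.2, size=(50, 16)))

    model = MUSAE()
    model.fit(graph, X)
    embedding = model.get_embedding()
    # (window_size + 1) blocks of `dimensions` columns each: the base
    # feature embedding plus one block per approximated matrix power.
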
Example 11
class SINE(Estimator):
    r"""An implementation of `"SINE" <https://arxiv.org/pdf/1810.06768.pdf>`_
    from the ICDM '18 paper "SINE: Scalable Incomplete Network Embedding". The
    procedure implicitly factorizes a joint matrix of adjacency matrix powers and
    node features. The decomposition operates on truncated random walks, and the
    adjacency matrix powers are pooled together.

    Args:
        walk_number (int): Number of random walks. Default is 10.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 128.
        workers (int): Number of cores. Default is 4.
        window_size (int): Matrix power order. Default is 5.
        epochs (int): Number of epochs. Default is 1.
        learning_rate (float): HogWild! learning rate. Default is 0.05.
        min_count (int): Minimal count of node occurrences. Default is 1.
        seed (int): Random seed value. Default is 42.
    """
    def __init__(self,
                 walk_number: int = 10,
                 walk_length: int = 80,
                 dimensions: int = 128,
                 workers: int = 4,
                 window_size: int = 5,
                 epochs: int = 1,
                 learning_rate: float = 0.05,
                 min_count: int = 1,
                 seed: int = 42):

        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed

    def _feature_transform(self, graph, X):
        features = {str(node): [] for node in graph.nodes()}
        nodes = X.row
        for i, node in enumerate(nodes):
            features[str(node)].append("feature_" + str(X.col[i]))
        return features

    def _select_walklets(self):
        self._walklets = []
        for walk in self._walker.walks:
            for power in range(1, self.window_size + 1):
                for step in range(power + 1):
                    # Keep every power-th node so that consecutive tokens
                    # are exactly `power` hops apart in the original walk.
                    neighbors = [
                        n for i, n in enumerate(walk[step:]) if i % power == 0
                    ]
                    # Triple every node and replace each middle copy with a
                    # random feature token of that node (when it has any), so
                    # that with window=1 nodes are trained against both their
                    # own features and nodes `power` hops away.
                    neighbors = [n for n in neighbors for _ in range(0, 3)]
                    neighbors = [
                        random.choice(self._features[val])
                        if i % 3 == 1 and self._features[val] else val
                        for i, val in enumerate(neighbors)
                    ]
                    self._walklets.append(neighbors)
        del self._walker

    def fit(self, graph: nx.classes.graph.Graph, X: Union[np.ndarray,
                                                          coo_matrix]):
        """
        Fitting a SINE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The matrix of node features.
        """
        self._set_seed()
        self._check_graph(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._features = self._feature_transform(graph, X)
        self._select_walklets()

        model = Word2Vec(self._walklets,
                         hs=0,
                         alpha=self.learning_rate,
                         vector_size=self.dimensions,
                         window=1,
                         min_count=self.min_count,
                         workers=self.workers,
                         seed=self.seed,
                         epochs=self.epochs)

        self.embedding = np.array(
            [model.wv[str(n)] for n in range(graph.number_of_nodes())])

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        return self.embedding
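
To make the tripling logic in _select_walklets concrete: for the walk ['0', '1', '2', '3', '4'] with power=2 and step=0, the sub-sampled nodes are ['0', '2', '4']; tripling them and substituting each middle copy (index i % 3 == 1) with a random feature token of that node yields, for example,

    ['0', 'feature_7', '0', '2', 'feature_3', '2', '4', 'feature_9', '4']

so with window=1 every node is trained against both one of its own feature tokens and the nodes exactly `power` hops away. The concrete feature ids above are illustrative.
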
Example 12
class GEMSEC(Estimator):
    r"""An implementation of `"GEMSEC" <https://arxiv.org/abs/1802.03997>`_
    from the ASONAM '19 paper "GEMSEC: Graph Embedding with Self Clustering".
    The procedure uses random walks to approximate the pointwise mutual information
    matrix obtained by pooling normalized adjacency matrix powers. This matrix
    is decomposed by an approximate factorization technique which is combined
    with a k-means like clustering cost. A node embedding and clustering are
    learned jointly.

    Args:
        walk_number (int): Number of random walks. Default is 5.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 32.
        negative_samples (int): Number of negative samples. Default is 5.
        window_size (int): Matrix power order. Default is 5.
        learning_rate (float): Gradient descent learning rate. Default is 0.1.
        clusters (int): Number of cluster centers. Default is 10.
        gamma (float): Clustering cost weight coefficient. Default is 0.1.
        seed (int): Random seed value. Default is 42.
    """
    def __init__(self,
                 walk_number: int = 5,
                 walk_length: int = 80,
                 dimensions: int = 32,
                 negative_samples: int = 5,
                 window_size: int = 5,
                 learning_rate: float = 0.1,
                 clusters: int = 10,
                 gamma: float = 0.1,
                 seed: int = 42):

        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.negative_samples = negative_samples
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.clusters = clusters
        self.gamma = gamma
        self.seed = seed

    def _setup_sampling_weights(self, graph):
        """
        Creating a negative sampling table.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph for negative sampling.
        """
        self._sampler = {}
        index = 0
        for node in graph.nodes():
            for _ in range(graph.degree(node)):
                self._sampler[index] = node
                index = index + 1
        self._global_index = index - 1

    def _initialize_node_embeddings(self, graph):
        """
        Creating a node embedding array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (graph.number_of_nodes(), self.dimensions)
        self._base_embedding = np.random.normal(0, 1.0 / self.dimensions,
                                                shape)

    def _initialize_cluster_centers(self, graph):
        """
        Creating a cluster center array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (self.dimensions, self.clusters)
        self._cluster_centers = np.random.normal(0, 1.0 / self.dimensions,
                                                 shape)

    def _sample_negative_samples(self):
        """
        Sampling a batch of nodes as negative samples.

        Return types:
            * **negative_samples** *(list)*: List of negative sampled nodes.
        """
        negative_samples = [
            self._sampler[random.randint(0, self._global_index)]
            for _ in range(self.negative_samples)
        ]
        return negative_samples

    def _calculate_noise_vector(self, negative_samples, source_node):
        """
        Getting the noise vector for the weight update.

        Arg types:
            * **negative_samples** *(list)*: List of negative sampled nodes.
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **noise_vector** *(NumPy array)* - Noise update vector.
        """
        noise_vectors = self._base_embedding[negative_samples, :]
        source_vector = self._base_embedding[int(source_node), :]
        raw_scores = noise_vectors.dot(source_vector.T)
        raw_scores = np.exp(np.clip(raw_scores, -15, 15))
        scores = raw_scores / np.sum(raw_scores)
        scores = scores.reshape(-1, 1)
        noise_vector = np.sum(scores * noise_vectors, axis=0)
        return noise_vector

    def _calculate_cluster_vector(self, source_node):
        """
        Getting the cluster vector for the weight update.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **cluster_vector** *(NumPy array)* - Cluster update vector.
            * **cluster_index** *(int)*: Node cluster membership index.
        """
        distances = self._base_embedding[int(source_node), :].reshape(
            -1, 1) - self._cluster_centers
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        cluster_vector = distances[:, cluster_index] / scores[cluster_index]
        return cluster_vector, cluster_index

    def _do_descent_for_pair(self, negative_samples, source_node, target_node):
        """
        Updating the cluster center and the node embedding.

        Arg types:
            * **negative_samples** *(list)* - Negative samples.
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        noise_vector = self._calculate_noise_vector(negative_samples,
                                                    source_node)
        target_vector = self._base_embedding[int(target_node), :]
        cluster_vector, cluster_index = self._calculate_cluster_vector(
            source_node)
        node_gradient = (noise_vector - target_vector +
                         self.gamma * cluster_vector)
        node_gradient = node_gradient / np.linalg.norm(node_gradient)
        self._base_embedding[int(source_node), :] -= \
            self.learning_rate * node_gradient
        self._cluster_centers[:, cluster_index] += \
            self.learning_rate * self.gamma * cluster_vector

    def _update_a_weight(self, source_node, target_node):
        """
        Updating the weights for a pair of nodes.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        negative_samples = self._sample_negative_samples()
        self._do_descent_for_pair(negative_samples, source_node, target_node)
        self._do_descent_for_pair(negative_samples, target_node, source_node)

    def _do_gradient_descent(self):
        """
        Updating the embedding weights and cluster centers with gradient descent.
        """
        random.shuffle(self._walker.walks)
        for walk in self._walker.walks:
            for i, source_node in enumerate(walk[:self.walk_length -
                                                 self.window_size]):
                for step in range(1, self.window_size + 1):
                    target_node = walk[i + step]
                    self._update_a_weight(source_node, target_node)

    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a GEMSEC model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        self._check_graph(graph)
        self._setup_sampling_weights(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._initialize_node_embeddings(graph)
        self._initialize_cluster_centers(graph)
        self._do_gradient_descent()

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)*: The embedding of nodes.
        """
        return np.array(self._base_embedding)

    def _get_membership(self, node):
        """Getting the cluster membership of a node.

        Arg types:
            * **node** *(int)* - The node whose cluster membership is queried.

        Return types:
            * **cluster_index** *(int)*: Node cluster membership index.
        """
        distances = self._base_embedding[node, :].reshape(
            -1, 1) - self._cluster_centers
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        return cluster_index

    def get_memberships(self) -> Dict[int, int]:
        r"""Getting the cluster membership of nodes.

        Return types:
            * **memberships** *(dict)*: Node cluster memberships.
        """
        memberships = {
            node: self._get_membership(node)
            for node in range(self._base_embedding.shape[0])
        }
        return memberships
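
A minimal usage sketch for GEMSEC (the graph is illustrative):

    import networkx as nx

    graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)

    model = GEMSEC(clusters=5)
    model.fit(graph)
    embedding = model.get_embedding()      # (num_nodes, dimensions) array
    memberships = model.get_memberships()  # {node: cluster index}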