def fit(self, graph): """ Fitting a Walklets model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. """ walker = RandomWalker(self.walk_number, self.walk_length) walker.do_walks(graph) num_of_nodes = graph.number_of_nodes() self._embedding = [] for power in range(1, self.window_size + 1): walklets = self._select_walklets(walker.walks, power) model = Word2Vec(walklets, hs=0, alpha=self.learning_rate, iter=self.epochs, size=self.dimensions, window=1, min_count=self.min_count, workers=self.workers) embedding = np.array([model[str(n)] for n in range(num_of_nodes)]) self._embedding.append(embedding)
def fit(self, graph: nx.classes.graph.Graph):
    """
    Fitting a DeepWalk model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    graph = self._check_graph(graph)
    walker = RandomWalker(self.walk_length, self.walk_number)
    walker.do_walks(graph)

    model = Word2Vec(
        walker.walks,
        hs=1,
        alpha=self.learning_rate,
        epochs=self.epochs,
        vector_size=self.dimensions,
        window=self.window_size,
        min_count=self.min_count,
        workers=self.workers,
        seed=self.seed,
    )

    num_of_nodes = graph.number_of_nodes()
    self._embedding = [model.wv[str(n)] for n in range(num_of_nodes)]
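# A usage sketch for the estimator above, assuming the karateclub-style API
# (an importable DeepWalk class and a `get_embedding` accessor like the ones
# shown for SINE and GEMSEC below):
import networkx as nx

from karateclub import DeepWalk

graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)

model = DeepWalk(dimensions=32)
model.fit(graph)
embedding = model.get_embedding()  # NumPy array of shape (100, 32)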
def fit(self, graph: nx.classes.graph.Graph):
    """
    Fitting a Role2vec model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    self._check_graph(graph)
    walker = RandomWalker(self.walk_length, self.walk_number)
    walker.do_walks(graph)

    hasher = WeisfeilerLehmanHashing(
        graph=graph,
        wl_iterations=self.wl_iterations,
        attributed=False,
        erase_base_features=self.erase_base_features,
    )

    node_features = hasher.get_node_features()
    documents = self._create_documents(walker.walks, node_features)

    model = Doc2Vec(
        documents,
        vector_size=self.dimensions,
        window=0,
        min_count=self.min_count,
        dm=0,
        workers=self.workers,
        sample=self.down_sampling,
        epochs=self.epochs,
        alpha=self.learning_rate,
        seed=self.seed,
    )

    self._embedding = [model.dv[str(i)] for i, _ in enumerate(documents)]
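# `_create_documents` is not shown here. A hypothetical sketch of the idea:
# pool the Weisfeiler-Lehman structural features of nodes that co-occur
# within a small walk window, then tag each pooled bag of features with the
# node id so Doc2Vec (dm=0, window=0) learns one vector per node. The
# `window_size` parameter and the exact pooling rule are assumptions, and
# `features` is assumed to be keyed by the same node ids used in the walks.
from gensim.models.doc2vec import TaggedDocument


def create_documents(walks, features, window_size=2):
    pooled = {node: [] for node in features}
    for walk in walks:
        for i, source in enumerate(walk):
            for target in walk[i : i + window_size + 1]:
                pooled[source].extend(features[target])
                pooled[target].extend(features[source])
    return [TaggedDocument(words=words, tags=[str(node)]) for node, words in pooled.items()]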
def fit(self, graph): """ Fitting a DeepWalk model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. """ if not self.check_if_walk_exists(): self._check_graph(graph) # This creates the sentences and keeps it in memory walker = RandomWalker(self.walk_length, self.walk_number) walker.do_walks(graph) sentences = walker.walks else: # This returns a iterator sentences = self.load_prewalk() self.model = Word2Vec(sentences, hs=1, alpha=self.learning_rate, iter=self.epochs, size=self.dimensions, window=self.window_size, min_count=self.min_count, workers=self.workers)
def fit(self, graph): """ Fitting a GEMSEC model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. """ self._check_graph(graph) self._setup_sampling_weights(graph) self.walker = RandomWalker(self.walk_length, self.walk_number) self.walker.do_walks(graph) self._initialize_node_embeddings(graph) self._initialize_cluster_centers(graph) self._do_gradient_descent()
def fit(self, graph, X):
    """
    Fitting a MUSAE model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
        * **X** *(Scipy COO array)* - The binary matrix of node features.
    """
    self.graph = graph
    self.walker = RandomWalker(self.walk_length, self.walk_number)
    self.walker.do_walks(graph)
    self.features = self._feature_transform(graph, X)
    self.base_docs = self._create_base_docs()
    self.embeddings = [self._create_single_embedding(self.base_docs)]
    self._learn_musae_embedding()
def fit(self, graph: nx.classes.graph.Graph, X: Union[np.ndarray, coo_matrix]):
    """
    Fitting an AE model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
        * **X** *(Scipy COO array)* - The binary matrix of node features.
    """
    self._set_seed()
    self._check_graph(graph)
    self.graph = graph
    self._walker = RandomWalker(self.walk_length, self.walk_number)
    self._walker.do_walks(graph)
    self.features = self._feature_transform(graph, X)
    self._base_docs = self._create_base_docs()
    self._embeddings = [self._create_single_embedding(self._base_docs)]
    self._learn_ae_embedding()
def fit(self, graph): """ Fitting a DeepWalk model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. """ walker = RandomWalker(self.walk_number, self.walk_length) walker.do_walks(graph) model = Word2Vec(walker.walks, hs=1, alpha=self.learning_rate, iter=self.epochs, size=self.dimensions, window=self.window_size, min_count=self.min_count, workers=self.workers) num_of_nodes = graph.number_of_nodes() self._embedding = [model[str(n)] for n in range(num_of_nodes)]
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from karateclub.estimator import Estimator
from karateclub.utils.walker import RandomWalker


class MUSAE(Estimator):
    r"""An implementation of `"MUSAE" <https://arxiv.org/abs/1909.13021>`_
    from the Arxiv '19 paper "MUSAE: Multi-Scale Attributed Node Embedding".
    The procedure does attributed random walks to approximate the adjacency
    matrix power node feature matrix products. The matrices are decomposed
    implicitly by a Skip-Gram style optimizer. The individual embeddings are
    concatenated together to form a multi-scale attributed node embedding.
    This way the feature distributions at different scales are separable.

    Args:
        walk_number (int): Number of random walks. Default is 5.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 32.
        workers (int): Number of cores. Default is 4.
        window_size (int): Matrix power order. Default is 3.
        epochs (int): Number of epochs. Default is 5.
        learning_rate (float): HogWild! learning rate. Default is 0.05.
        down_sampling (float): Down sampling rate in the corpus. Default is 0.0001.
        min_count (int): Minimal count of node occurrences. Default is 1.
    """

    def __init__(self, walk_number=5, walk_length=80, dimensions=32, workers=4,
                 window_size=3, epochs=5, learning_rate=0.05,
                 down_sampling=0.0001, min_count=1):
        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.down_sampling = down_sampling
        self.min_count = min_count

    def _feature_transform(self, graph, X):
        features = {str(node): [] for node in graph.nodes()}
        nodes = X.row
        for i, node in enumerate(nodes):
            features[str(node)].append("feature_" + str(X.col[i]))
        return features

    def _create_single_embedding(self, document_collections):
        model = Doc2Vec(
            document_collections,
            vector_size=self.dimensions,
            window=0,
            min_count=self.min_count,
            alpha=self.learning_rate,
            dm=0,
            sample=self.down_sampling,
            workers=self.workers,
            epochs=self.epochs,
        )
        emb = np.array([model.dv[str(n)] for n in range(self.graph.number_of_nodes())])
        return emb

    def _create_documents(self, features):
        features_out = [
            TaggedDocument(
                words=[str(feat) for feat_elems in feature_set for feat in feat_elems],
                tags=[str(node)],
            )
            for node, feature_set in features.items()
        ]
        return features_out

    def _setup_musae_features(self, approximation):
        features = {str(node): [] for node in self.graph.nodes()}
        for walk in self.walker.walks:
            for i in range(len(walk) - approximation):
                source = walk[i]
                target = walk[i + approximation]
                features[str(source)].append(self.features[str(target)] + [str(target)])
                features[str(target)].append(self.features[str(source)] + [str(source)])
        return self._create_documents(features)

    def _learn_musae_embedding(self):
        for approximation in range(self.window_size):
            features = self._setup_musae_features(approximation + 1)
            embedding = self._create_single_embedding(features)
            self.embeddings.append(embedding)

    def _create_base_docs(self):
        features_out = [
            TaggedDocument(words=[str(feature) for feature in features], tags=[str(node)])
            for node, features in self.features.items()
        ]
        return features_out

    def fit(self, graph, X):
        """
        Fitting a MUSAE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The binary matrix of node features.
        """
        self._check_graph(graph)
        self.graph = graph
        self.walker = RandomWalker(self.walk_length, self.walk_number)
        self.walker.do_walks(graph)
        self.features = self._feature_transform(graph, X)
        self.base_docs = self._create_base_docs()
        self.embeddings = [self._create_single_embedding(self.base_docs)]
        self._learn_musae_embedding()

    def get_embedding(self):
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        embedding = np.concatenate(self.embeddings, axis=1)
        return embedding
import random
from typing import Union

import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix
from gensim.models.word2vec import Word2Vec

from karateclub.estimator import Estimator
from karateclub.utils.walker import RandomWalker


class SINE(Estimator):
    r"""An implementation of `"SINE" <https://arxiv.org/pdf/1810.06768.pdf>`_
    from the ICDM '18 paper "SINE: Scalable Incomplete Network Embedding".
    The procedure implicitly factorizes a joint adjacency matrix power and
    feature matrix. The decomposition happens on truncated random walks and
    the adjacency matrix powers are pooled together.

    Args:
        walk_number (int): Number of random walks. Default is 10.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 128.
        workers (int): Number of cores. Default is 4.
        window_size (int): Matrix power order. Default is 5.
        epochs (int): Number of epochs. Default is 1.
        learning_rate (float): HogWild! learning rate. Default is 0.05.
        min_count (int): Minimal count of node occurrences. Default is 1.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(self, walk_number: int = 10, walk_length: int = 80,
                 dimensions: int = 128, workers: int = 4, window_size: int = 5,
                 epochs: int = 1, learning_rate: float = 0.05,
                 min_count: int = 1, seed: int = 42):
        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed

    def _feature_transform(self, graph, X):
        features = {str(node): [] for node in graph.nodes()}
        nodes = X.row
        for i, node in enumerate(nodes):
            features[str(node)].append("feature_" + str(X.col[i]))
        return features

    def _select_walklets(self):
        self._walklets = []
        for walk in self._walker.walks:
            for power in range(1, self.window_size + 1):
                for step in range(power + 1):
                    # Keep every `power`-th node of the walk.
                    neighbors = [n for i, n in enumerate(walk[step:]) if i % power == 0]
                    # Triple each node, then replace the middle copy with one
                    # of its features (if it has any), so node-feature pairs
                    # co-occur inside the Word2Vec window.
                    neighbors = [n for n in neighbors for _ in range(0, 3)]
                    neighbors = [
                        random.choice(self._features[val])
                        if i % 3 == 1 and self._features[val] else val
                        for i, val in enumerate(neighbors)
                    ]
                    self._walklets.append(neighbors)
        del self._walker

    def fit(self, graph: nx.classes.graph.Graph, X: Union[np.ndarray, coo_matrix]):
        """
        Fitting a SINE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
            * **X** *(Scipy COO array)* - The matrix of node features.
        """
        self._set_seed()
        self._check_graph(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._features = self._feature_transform(graph, X)
        self._select_walklets()

        model = Word2Vec(
            self._walklets,
            hs=0,
            alpha=self.learning_rate,
            vector_size=self.dimensions,
            window=1,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
            epochs=self.epochs,
        )

        self.embedding = np.array(
            [model.wv[str(n)] for n in range(graph.number_of_nodes())]
        )

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        embedding = self.embedding
        return embedding
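# A usage sketch for SINE, assuming a karateclub-style workflow. The sparse
# feature matrix is random and purely illustrative; incomplete features are
# tolerated, since nodes without any features stay plain walk tokens.
import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix

graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)
X = coo_matrix(np.random.binomial(1, 0.1, size=(100, 50)))

model = SINE()
model.fit(graph, X)
embedding = model.get_embedding()  # shape: (100, 128) with default dimensions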
import random
from typing import Dict

import networkx as nx
import numpy as np

from karateclub.estimator import Estimator
from karateclub.utils.walker import RandomWalker


class GEMSEC(Estimator):
    r"""An implementation of `"GEMSEC" <https://arxiv.org/abs/1802.03997>`_
    from the ASONAM '19 paper "GEMSEC: Graph Embedding with Self Clustering".
    The procedure uses random walks to approximate the pointwise mutual
    information matrix obtained by pooling normalized adjacency matrix powers.
    This matrix is decomposed by an approximate factorization technique which
    is combined with a k-means like clustering cost. A node embedding and
    clustering are learned jointly.

    Args:
        walk_number (int): Number of random walks. Default is 5.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 32.
        negative_samples (int): Number of negative samples. Default is 5.
        window_size (int): Matrix power order. Default is 5.
        learning_rate (float): Gradient descent learning rate. Default is 0.1.
        clusters (int): Number of cluster centers. Default is 10.
        gamma (float): Clustering cost weight coefficient. Default is 0.1.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(self, walk_number: int = 5, walk_length: int = 80,
                 dimensions: int = 32, negative_samples: int = 5,
                 window_size: int = 5, learning_rate: float = 0.1,
                 clusters: int = 10, gamma: float = 0.1, seed: int = 42):
        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.negative_samples = negative_samples
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.clusters = clusters
        self.gamma = gamma
        self.seed = seed

    def _setup_sampling_weights(self, graph):
        """
        Creating a negative sampling table proportional to node degrees.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph for negative sampling.
        """
        self._sampler = {}
        index = 0
        for node in graph.nodes():
            for _ in range(graph.degree(node)):
                self._sampler[index] = node
                index = index + 1
        self._global_index = index - 1

    def _initialize_node_embeddings(self, graph):
        """
        Creating a node embedding array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (graph.number_of_nodes(), self.dimensions)
        self._base_embedding = np.random.normal(0, 1.0 / self.dimensions, shape)

    def _initialize_cluster_centers(self, graph):
        """
        Creating a cluster center array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (self.dimensions, self.clusters)
        self._cluster_centers = np.random.normal(0, 1.0 / self.dimensions, shape)

    def _sample_negative_samples(self):
        """
        Sampling a batch of nodes as negative samples.

        Return types:
            * **negative_samples** *(list)* - List of negative sampled nodes.
        """
        negative_samples = [
            self._sampler[random.randint(0, self._global_index)]
            for _ in range(self.negative_samples)
        ]
        return negative_samples

    def _calculate_noise_vector(self, negative_samples, source_node):
        """
        Getting the noise vector for the weight update.

        Arg types:
            * **negative_samples** *(list)* - List of negative sampled nodes.
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **noise_vector** *(NumPy array)* - Noise update vector.
        """
        noise_vectors = self._base_embedding[negative_samples, :]
        source_vector = self._base_embedding[int(source_node), :]
        raw_scores = noise_vectors.dot(source_vector.T)
        raw_scores = np.exp(np.clip(raw_scores, -15, 15))
        scores = raw_scores / np.sum(raw_scores)
        scores = scores.reshape(-1, 1)
        noise_vector = np.sum(scores * noise_vectors, axis=0)
        return noise_vector

    def _calculate_cluster_vector(self, source_node):
        """
        Getting the cluster vector for the weight update.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **cluster_vector** *(NumPy array)* - Cluster update vector.
            * **cluster_index** *(int)* - Node cluster membership index.
        """
        distances = self._base_embedding[int(source_node), :].reshape(-1, 1) - self._cluster_centers
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        cluster_vector = distances[:, cluster_index] / scores[cluster_index]
        return cluster_vector, cluster_index

    def _do_descent_for_pair(self, negative_samples, source_node, target_node):
        """
        Updating the cluster center and the node embedding.

        Arg types:
            * **negative_samples** *(list)* - Negative samples.
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        noise_vector = self._calculate_noise_vector(negative_samples, source_node)
        target_vector = self._base_embedding[int(target_node), :]
        cluster_vector, cluster_index = self._calculate_cluster_vector(source_node)
        node_gradient = noise_vector - target_vector + self.gamma * cluster_vector
        node_gradient = node_gradient / np.linalg.norm(node_gradient)
        self._base_embedding[int(source_node), :] += -self.learning_rate * node_gradient
        self._cluster_centers[:, cluster_index] += self.learning_rate * self.gamma * cluster_vector

    def _update_a_weight(self, source_node, target_node):
        """
        Updating the weights for a pair of nodes.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        negative_samples = self._sample_negative_samples()
        self._do_descent_for_pair(negative_samples, source_node, target_node)
        self._do_descent_for_pair(negative_samples, target_node, source_node)

    def _do_gradient_descent(self):
        """
        Updating the embedding weights and cluster centers with gradient descent.
        """
        random.shuffle(self._walker.walks)
        for walk in self._walker.walks:
            for i, source_node in enumerate(walk[:self.walk_length - self.window_size]):
                for step in range(1, self.window_size + 1):
                    target_node = walk[i + step]
                    self._update_a_weight(source_node, target_node)

    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a GEMSEC model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        self._check_graph(graph)
        self._setup_sampling_weights(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._initialize_node_embeddings(graph)
        self._initialize_cluster_centers(graph)
        self._do_gradient_descent()

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        return np.array(self._base_embedding)

    def _get_membership(self, node):
        """Getting the cluster membership of a node.

        Arg types:
            * **node** *(int)* - The node whose cluster membership is returned.

        Return types:
            * **cluster_index** *(int)* - Node cluster membership index.
        """
        distances = self._base_embedding[node, :].reshape(-1, 1) - self._cluster_centers
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        return cluster_index

    def get_memberships(self) -> Dict[int, int]:
        r"""Getting the cluster membership of nodes.

        Return types:
            * **memberships** *(dict)* - Node cluster memberships.
        """
        memberships = {
            node: self._get_membership(node)
            for node in range(self._base_embedding.shape[0])
        }
        return memberships
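# A usage sketch for GEMSEC, assuming a karateclub-style workflow: fit learns
# the embedding and the cluster centers jointly, so both an embedding and a
# node-to-cluster assignment can be read off the fitted model.
import networkx as nx

graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)

model = GEMSEC(clusters=4)
model.fit(graph)
embedding = model.get_embedding()      # shape: (100, 32) with default dimensions
memberships = model.get_memberships()  # {node_id: cluster_index}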