Esempio n. 1
0
    def forward(self, graph, return_dict=False):
        """Train LINE embeddings on ``graph`` and return them.

        The graph is converted with ``graph.to_networkx()``; alias tables for
        edge sampling and for degree-based node sampling are built from it, then
        ``self._train_line`` is run for the order(s) selected by ``self.order``
        (1, 2, or 3 = first-order and second-order concatenated).

        Args:
            graph: Object exposing ``to_networkx()`` and ``num_nodes``.
            return_dict: When true, return ``{node: embedding_row}`` keyed by
                the networkx node objects; otherwise return a dense array.

        Returns:
            Either a dict mapping each node to its embedding row, or an array
            of shape ``(graph.num_nodes, embedding_width)`` whose rows are
            indexed by node id.
        """
        # run LINE algorithm, 1-order, 2-order or 3(1-order + 2-order)
        nx_g = graph.to_networkx()
        self.G = nx_g
        self.is_directed = nx.is_directed(self.G)
        self.num_node = nx_g.number_of_nodes()
        self.num_edge = nx_g.number_of_edges()
        self.num_sampling_edge = self.walk_length * self.walk_num * self.num_node

        node2id = {node: vid for vid, node in enumerate(nx_g.nodes())}
        self.edges = [[node2id[u], node2id[v]] for u, v in nx_g.edges()]

        # Force a float array: with all-integer weights an integer array would
        # make the in-place division below fail.
        self.edges_prob = np.asarray(
            [nx_g[u][v].get("weight", 1.0) for u, v in nx_g.edges()], dtype=float
        )
        self.edges_prob /= np.sum(self.edges_prob)
        self.edges_table, self.edges_prob = alias_setup(self.edges_prob)

        # Float accumulator: an integer array would silently truncate
        # fractional edge weights on every `+=`.
        degree_weight = np.zeros(self.num_node, dtype=float)
        for u, v in nx_g.edges():
            weight = nx_g[u][v].get("weight", 1.0)
            degree_weight[node2id[u]] += weight
            if not self.is_directed:
                degree_weight[node2id[v]] += weight
        self.node_prob = np.power(degree_weight, 0.75)
        self.node_prob /= np.sum(self.node_prob)
        self.node_table, self.node_prob = alias_setup(self.node_prob)

        if self.order == 3:
            # Each half of the concatenated embedding gets half the width.
            # NOTE(review): this halves self.dimension in place, so a second
            # call with order == 3 halves it again -- confirm this method is
            # only invoked once per instance.
            self.dimension = int(self.dimension / 2)
        if self.order == 1 or self.order == 3:
            print("train line with 1-order")
            print(type(self.dimension))
            self.emb_vertex = (np.random.random((self.num_node, self.dimension)) - 0.5) / self.dimension
            self._train_line(order=1)
            embedding1 = preprocessing.normalize(self.emb_vertex, "l2")

        if self.order == 2 or self.order == 3:
            print("train line with 2-order")
            self.emb_vertex = (np.random.random((self.num_node, self.dimension)) - 0.5) / self.dimension
            # NOTE(review): emb_context is bound to the same array object as
            # emb_vertex (no copy) -- confirm the aliasing is intended.
            self.emb_context = self.emb_vertex
            self._train_line(order=2)
            embedding2 = preprocessing.normalize(self.emb_vertex, "l2")

        if self.order == 1:
            embeddings = embedding1
        elif self.order == 2:
            embeddings = embedding2
        else:
            print("concatenate two embedding...")
            embeddings = np.hstack((embedding1, embedding2))

        if return_dict:
            return {node: embeddings[vid] for vid, node in enumerate(nx_g.nodes())}

        features_matrix = np.zeros((graph.num_nodes, embeddings.shape[1]))
        # Row i of `embeddings` belongs to the i-th node in iteration order;
        # scatter it to the row named by that node's id.
        node_ids = list(nx_g.nodes())
        features_matrix[node_ids] = embeddings[np.arange(graph.num_nodes)]
        return features_matrix
Esempio n. 2
0
    def forward(self, data):
        """Train second-order embeddings over per-type-pair subgraphs.

        A directed graph is built from ``data.edge_index``; for every pair of
        node types ``(i, j)`` with ``i < j`` the subgraph induced by nodes of
        those two types is kept if it has at least one edge. Alias tables for
        edge and node sampling are built per kept subgraph, then
        ``self._train_line`` is run.

        Args:
            data: Object exposing ``edge_index`` (unpacked into two objects
                that provide ``.numpy()``) and ``pos`` (provides ``.tolist()``;
                read here as the per-node type label).
                NOTE(review): type labels are compared against
                ``range(num_node_type)``, so they are presumably 0..k-1 --
                confirm against the caller.

        Returns:
            The L2-normalised ``self.emb_vertex`` array.
        """
        G = nx.DiGraph()
        row, col = data.edge_index
        G.add_edges_from(list(zip(row.numpy(), col.numpy())))
        self.G = G
        self.node_type = data.pos.tolist()
        self.num_node = G.number_of_nodes()
        self.num_edge = G.number_of_edges()
        self.num_sampling_edge = self.walk_length * self.walk_num * self.num_node

        self.num_node_type = len(set(self.node_type))

        subgraphs = []
        for i in range(self.num_node_type):
            for j in range(i + 1, self.num_node_type):
                context_node = [nid for nid, ntype in enumerate(self.node_type) if ntype == i or ntype == j]
                sub_graph = self.G.subgraph(context_node)
                if sub_graph.number_of_edges() != 0:
                    subgraphs.append(sub_graph)
        self.num_graph = len(subgraphs)
        print("number of subgraph", self.num_graph)

        # Up to k*(k-1)/2 subgraphs can exist for k node types, which exceeds k
        # once k >= 4. Size the per-graph slots accordingly, but never below
        # num_node_type so the lists keep their previous length otherwise.
        num_slots = max(self.num_node_type, self.num_graph)
        self.edges = [[] for _ in range(num_slots)]
        self.id2node = [dict() for _ in range(num_slots)]
        self.edges_prob = []
        self.node_prob = []

        for i, sub_graph in enumerate(subgraphs):
            self.edges[i] = [[u, v] for u, v in sub_graph.edges()]
            edges_prob = np.asarray(
                [sub_graph[u][v].get("weight", 1.0) for u, v in self.edges[i]], dtype=float
            )
            edges_prob /= np.sum(edges_prob)
            # alias_setup's return value is stored as-is (not unpacked).
            self.edges_prob.append(alias_setup(edges_prob))

            context_node = list(sub_graph.nodes())
            self.id2node[i] = dict(enumerate(context_node))
            node2id = {node: idx for idx, node in enumerate(context_node)}

            # Float accumulator so fractional weights are not truncated.
            degree_weight = np.zeros(len(context_node), dtype=float)
            for u in context_node:
                for v in sub_graph.neighbors(u):
                    degree_weight[node2id[u]] += sub_graph[u][v].get("weight", 1.0)

            node_prob = np.power(degree_weight, 0.75)
            node_prob /= np.sum(node_prob)
            self.node_prob.append(alias_setup(node_prob))

        print("train pte with 2-order")
        self.emb_vertex = (np.random.random((self.num_node, self.dimension)) - 0.5) / self.dimension
        # NOTE(review): emb_context is bound to the same array object as
        # emb_vertex (no copy) -- confirm the aliasing is intended.
        self.emb_context = self.emb_vertex
        self._train_line()
        embedding = preprocessing.normalize(self.emb_vertex, "l2")
        return embedding
Esempio n. 3
0
    def train(self, G):
        """Train LINE embeddings on the networkx graph ``G``.

        Builds alias tables for edge sampling and for degree-based node
        sampling, then runs ``self._train_line`` for the order(s) selected by
        ``self.order`` (1, 2, or 3 = first-order and second-order
        concatenated).

        Args:
            G: A networkx graph; edge attribute ``"weight"`` is used when
                present and defaults to 1.0 otherwise.

        Returns:
            The L2-normalised embedding array, also stored on
            ``self.embeddings``. Row ``i`` corresponds to the ``i``-th node in
            ``G.nodes()`` iteration order.
        """
        # run LINE algorithm, 1-order, 2-order or 3(1-order + 2-order)
        self.G = G
        self.is_directed = nx.is_directed(self.G)
        self.num_node = G.number_of_nodes()
        self.num_edge = G.number_of_edges()
        self.num_sampling_edge = self.walk_length * self.walk_num * self.num_node

        node2id = {node: vid for vid, node in enumerate(G.nodes())}
        self.edges = [[node2id[u], node2id[v]] for u, v in G.edges()]

        # Force a float array: with all-integer weights an integer array would
        # make the in-place division below fail.
        self.edges_prob = np.asarray(
            [G[u][v].get("weight", 1.0) for u, v in G.edges()], dtype=float
        )
        self.edges_prob /= np.sum(self.edges_prob)
        self.edges_table, self.edges_prob = alias_setup(self.edges_prob)

        # Float accumulator: an integer array would silently truncate
        # fractional edge weights on every `+=`.
        degree_weight = np.zeros(self.num_node, dtype=float)
        for u, v in G.edges():
            weight = G[u][v].get("weight", 1.0)
            degree_weight[node2id[u]] += weight
            if not self.is_directed:
                degree_weight[node2id[v]] += weight
        self.node_prob = np.power(degree_weight, 0.75)
        self.node_prob /= np.sum(self.node_prob)
        self.node_table, self.node_prob = alias_setup(self.node_prob)

        if self.order == 3:
            # Each half of the concatenated embedding gets half the width.
            # NOTE(review): this halves self.dimension in place, so a second
            # call with order == 3 halves it again -- confirm this method is
            # only invoked once per instance.
            self.dimension = int(self.dimension / 2)
        if self.order == 1 or self.order == 3:
            print("train line with 1-order")
            print(type(self.dimension))
            self.emb_vertex = (np.random.random((self.num_node, self.dimension)) - 0.5) / self.dimension
            self._train_line(order=1)
            embedding1 = preprocessing.normalize(self.emb_vertex, "l2")

        if self.order == 2 or self.order == 3:
            print("train line with 2-order")
            self.emb_vertex = (np.random.random((self.num_node, self.dimension)) - 0.5) / self.dimension
            # NOTE(review): emb_context is bound to the same array object as
            # emb_vertex (no copy) -- confirm the aliasing is intended.
            self.emb_context = self.emb_vertex
            self._train_line(order=2)
            embedding2 = preprocessing.normalize(self.emb_vertex, "l2")

        if self.order == 1:
            self.embeddings = embedding1
        elif self.order == 2:
            self.embeddings = embedding2
        else:
            print("concatenate two embedding...")
            self.embeddings = np.hstack((embedding1, embedding2))
        return self.embeddings
Esempio n. 4
0
    def _get_alias_edge(self, src, dst):
        """Return the alias tables for leaving ``dst`` after arriving from ``src``.

        Each neighbour of ``dst`` gets the weight of the ``dst -> neighbour``
        edge, scaled as follows:

        * divided by ``self.p`` when the neighbour is ``src`` itself;
        * left unchanged when the neighbour also has an edge to ``src``;
        * divided by ``self.q`` otherwise.

        The scaled weights are normalised to sum to one and handed to
        ``alias_setup``, whose result is returned unchanged.
        """
        graph = self.G
        biased_weights = []
        for nbr in graph.neighbors(dst):
            edge_weight = graph[dst][nbr]["weight"]
            if nbr == src:
                biased_weights.append(edge_weight / self.p)
            elif graph.has_edge(nbr, src):
                biased_weights.append(edge_weight)
            else:
                biased_weights.append(edge_weight / self.q)

        total = sum(biased_weights)
        return alias_setup([float(w) / total for w in biased_weights])
Esempio n. 5
0
    def _preprocess_transition_probs(self):
        """Precompute the alias tables that drive the random walks.

        Populates two attributes:

        * ``self.alias_nodes`` -- for every node, the alias tables over its
          neighbours, weighted by the normalised ``"weight"`` edge attribute.
        * ``self.alias_edges`` -- for every edge ``(u, v)``, the tables from
          ``self._get_alias_edge(u, v)``; for undirected graphs the reversed
          pair ``(v, u)`` is stored as well.

        The node and edge counts and the elapsed time of each phase are
        printed. Returns ``None``.
        """
        G = self.G
        directed = nx.is_directed(self.G)

        print(G.number_of_nodes())
        print(G.number_of_edges())

        # Phase 1: per-node tables.
        started = time.time()
        alias_nodes = {}
        for node in G.nodes():
            weights = [G[node][nbr]["weight"] for nbr in G.neighbors(node)]
            total = sum(weights)
            alias_nodes[node] = alias_setup([float(w) / total for w in weights])
        print("alias_nodes", time.time() - started)

        # Phase 2: per-edge tables (both orientations when undirected).
        alias_edges = {}
        started = time.time()
        for src, dst in G.edges():
            alias_edges[(src, dst)] = self._get_alias_edge(src, dst)
            if not directed:
                alias_edges[(dst, src)] = self._get_alias_edge(dst, src)
        print("alias_edges", time.time() - started)

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges
Esempio n. 6
0
    def train(self, G):
        """Run the NetSMF pipeline on the networkx graph ``G``.

        Steps, in order:

        1. Index the nodes and cache neighbour lists, neighbour counts,
           per-node alias tables and per-node neighbour weights on ``self``.
        2. Launch ``self.worker`` processes, each running
           ``self._random_walk_matrix(i)``, and sum the matrices they return.
        3. Build the sparsified matrix from the graph Laplacian of that sum and
           the degree matrix, scale by ``A.sum() / self.negative``, clamp
           entries to at least 1 and take the element-wise log.
        4. Pass the result to ``self._get_embedding_rand`` and return its
           output.

        Args:
            G: A networkx graph; edge attribute ``"weight"`` is used when
                present and defaults to 1.0 otherwise.

        Returns:
            Whatever ``self._get_embedding_rand`` returns for the sparsified
            matrix.
        """
        self.G = G
        nodes = list(G.nodes())  # position in this list is the node's integer id
        node2id = {node: vid for vid, node in enumerate(nodes)}
        self.is_directed = nx.is_directed(self.G)
        self.num_node = self.G.number_of_nodes()
        self.num_edge = G.number_of_edges()
        self.edges = [[node2id[u], node2id[v]] for u, v in self.G.edges()]

        self.neighbors = [[node2id[v] for v in G.neighbors(node)] for node in nodes]
        self.num_neigh = np.asarray([len(nbr_ids) for nbr_ids in self.neighbors])

        s = time.time()
        self.alias_nodes = {}
        self.node_weight = {}
        for i, node in enumerate(nodes):
            unnormalized_probs = [G[node][nbr].get("weight", 1.0) for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
            self.alias_nodes[i] = alias_setup(normalized_probs)
            # self.neighbors[i] is in the same order as unnormalized_probs.
            self.node_weight[i] = dict(zip(self.neighbors[i], unnormalized_probs))

        t = time.time()
        print("alias_nodes", t - s)

        # run netsmf algorithm with multiprocessing and apply randomized svd
        print("number of sample edges ", self.num_round * self.num_edge * self.window_size)
        print("random walk start...")
        t0 = time.time()
        results = []
        # The pool is created only after the attributes above are set, since
        # the workers run a bound method of this instance.
        pool = Pool(processes=self.worker)
        for i in range(self.worker):
            results.append(pool.apply_async(func=self._random_walk_matrix, args=(i,)))
        pool.close()
        pool.join()
        print("random walk time", time.time() - t0)

        matrix = sp.csr_matrix((self.num_node, self.num_node))
        # Cast to float: for an unweighted graph the adjacency matrix is
        # integer-typed, and raising integer data to the power -1 below is
        # rejected by NumPy.
        A = sp.csr_matrix(nx.adjacency_matrix(self.G)).astype(float)
        degree = sp.diags(np.array(A.sum(axis=0))[0], format="csr")
        degree_inv = degree.power(-1)

        t1 = time.time()
        for res in results:
            matrix += res.get()
        t2 = time.time()
        print("construct random walk matrix time", time.time() - t1)

        L = sp.csgraph.laplacian(matrix, normed=False, return_diag=False)
        M = degree_inv.dot(degree - L).dot(degree_inv)
        M = M * A.sum() / self.negative
        # Clamp to >= 1 so the log is non-negative; entries that become
        # log(1) == 0 are then dropped from the sparse structure.
        M.data[M.data <= 1] = 1
        M.data = np.log(M.data)
        M.eliminate_zeros()
        print("number of nzz", M.nnz)
        print("construct matrix sparsifier time", time.time() - t2)

        embedding = self._get_embedding_rand(M)
        return embedding