def test_distance_matrix_symmetric(data):
    # The fitted model exposes a condensed distance matrix; expanding it with
    # `squareform` must yield a symmetric square matrix.
    X, y = data
    model = SimilarityForestCluster()
    model.fit(X)
    distance_matrix = model.distance_matrix_
    check_symmetric(squareform(distance_matrix))
def _scipy_to_igraph(self, matrix):
    # drop explicit zeros so they do not become zero-weight edges
    matrix.eliminate_zeros()
    sources, targets = matrix.nonzero()
    weights = matrix[sources, targets]
    graph = ig.Graph(n=matrix.shape[0],
                     edges=list(zip(sources, targets)),
                     directed=True,
                     edge_attrs={'weight': weights})
    # a symmetric matrix describes an undirected graph; if `check_symmetric`
    # rejects it, keep the directed representation
    try:
        check_symmetric(matrix, raise_exception=True)
        graph = graph.as_undirected()
    except ValueError:
        pass
    return graph
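# A minimal, illustrative sketch of the conversion above (not part of the
# original module): `obj` stands in for any instance exposing
# `_scipy_to_igraph`, and the matrix values are made up.
def _demo_scipy_to_igraph(obj):
    import numpy as np
    import scipy.sparse as sp

    A = sp.csr_matrix(np.array([[0.0, 1.0, 0.0],
                                [1.0, 0.0, 2.0],
                                [0.0, 2.0, 0.0]]))
    # A is symmetric, so the graph should come back undirected with weighted
    # edges; an asymmetric matrix would stay directed.
    g = obj._scipy_to_igraph(A)
    return g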
def _create_graph(self):
    # threshold pairwise distances at `alpha`: entries below the threshold
    # become edges (1), everything else stays 0
    distance_to_adjacent = np.vectorize(lambda x: 1 if x < self.alpha else 0)
    if self.data_type == 'cloud':
        self.adjacent_matrix = distance_to_adjacent(
            distance_matrix(self.data, self.data, p=2))
    else:
        check_symmetric(self.data)
        self.adjacent_matrix = distance_to_adjacent(self.data)
    # remove self-loops: the zero diagonal of the distance matrix would
    # otherwise produce an all-ones diagonal after thresholding
    self.adjacent_matrix = self.adjacent_matrix - np.identity(
        self.data.shape[0])
    return nx.from_numpy_matrix(self.adjacent_matrix)
def snf4st(*aff, K=20, t=20, alpha=1.0, beta=0.4, gamma=0.3):  # ckm 0.4 0.3
    '''An improved variant of the original SNF fusion function.'''
    aff = _check_SNF_inputs(aff)
    Wk = [0] * len(aff)
    Wsum = np.zeros(aff[0].shape)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff) - np.sum([np.isnan(a) for a in aff], axis=0)

    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges; dividing each
        # row by its row sum leaves the matrix asymmetric ...
        mat = mat / np.nansum(mat, axis=1, keepdims=True)
        # ... so `check_symmetric` re-symmetrizes it
        aff[n] = check_symmetric(mat, raise_warning=False)
        # apply KNN threshold to normalized affinity matrix; the resulting
        # weighted adjacency matrix is not symmetric (treat as directed?)
        Wk[n] = _find_dominate_set(aff[n], int(K))

    # take sum of all normalized (not thresholded) affinity matrices
    Wsum = np.nansum(aff, axis=0)

    for iteration in range(t):  # `t` is the number of fusion iterations
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(Wk[n])  # masked (KNN) adjacency of network n
            aw = np.nan_to_num(mat)     # sanitize the weighted matrix itself

            # compute weight similarity over second-order neighbours
            # dia_index = range(len(nzW))
            # nzW[dia_index, dia_index] = 1
            nzwdig = nzW - np.diag(np.diag(nzW))  # zero the diagonal: no self-loops
            nzw_clo = np.nansum(nzwdig, axis=0)   # per-node total weight (denominator)
            ps = nzwdig / np.array([nzw_clo]).T
            ps = np.nan_to_num(ps)
            nzthird = ps @ nzwdig  # numerator of the enhanced-network update

            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = (1 - gamma - beta) * (
                nzW @ (Wsum - aw) / (n_aff - 1)) + beta * (
                nzW @ (Wsum - aw) @ nzW.T / (n_aff - 1)) + gamma * (
                nzthird @ (Wsum - aw) @ nzthird.T / (n_aff - 1))  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff0 = np.nan_to_num(aff0)
            aff[n] = _B0_normalized(aff0, alpha=alpha)

        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff, axis=0)

    # all entries of `aff` should be identical after the fusion procedure
    # dividing by len(aff) is hypothetically equivalent to selecting one
    # however, if fusion didn't converge then this is just average of `aff`
    W = Wsum / len(aff)

    # normalize fused matrix and update diagonal similarity
    W = W / np.nansum(W, axis=1, keepdims=True)  # TODO: / by NaN
    W = (W + W.T + np.eye(len(W))) / 2

    return np.nan_to_num(W)
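# Reading the update in `snf4st` off the code above (a restatement, not part
# of the original source): with S^(v) the KNN-masked matrix `Wk[n]`, Q^(v)
# the two-step second-order-neighbour matrix `nzthird` built from S^(v) with
# its diagonal zeroed, and Pbar^(v) the average of the other networks
# (n_aff - 1 plays the role of m - 1 when no entries are missing),
#
# .. math::
#
#    \mathbf{P}^{(v)} \leftarrow
#        (1-\beta-\gamma)\,\mathbf{S}^{(v)}\overline{\mathbf{P}}^{(v)}
#        + \beta\,\mathbf{S}^{(v)}\overline{\mathbf{P}}^{(v)}
#          (\mathbf{S}^{(v)})^{T}
#        + \gamma\,\mathbf{Q}^{(v)}\overline{\mathbf{P}}^{(v)}
#          (\mathbf{Q}^{(v)})^{T},
#    \qquad
#    \overline{\mathbf{P}}^{(v)} = \frac{\sum_{k\neq v}\mathbf{P}^{(k)}}{m-1}
#
# i.e. the classic SNF propagation (the beta term) blended with a one-sided
# first-order term and a second-order-neighbour term weighted by gamma.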
def _check_SNF2_inputs(aff):
    """
    Confirms inputs to SNF2 are appropriate

    Parameters
    ----------
    aff : `m`-list of (N x N) array_like
        Input similarity arrays. All arrays should be square but need not be
        of equal size.
    """
    prep = []
    for a in _flatten(aff):
        ac = check_array(a, force_all_finite=True, copy=True)
        prep.append(check_symmetric(ac, raise_warning=False))

    return prep
def _B0_normalized(W, alpha=1.0):
    """
    Adds `alpha` to the diagonal of `W` and re-symmetrizes

    Parameters
    ----------
    W : (N, N) array_like
        Similarity array from SNF
    alpha : (0, 1) float, optional
        Factor to add to diagonal of `W` to increase subject self-affinity.
        Default: 1.0

    Returns
    -------
    W : (N, N) np.ndarray
        Normalized similarity array
    """
    # add `alpha` to the diagonal and symmetrize `W`
    W = W + (alpha * np.eye(len(W)))
    W = check_symmetric(W, raise_warning=False)

    return W
def _check_SNF_inputs(aff):
    """
    Confirms inputs to SNF are appropriate

    Parameters
    ----------
    aff : `m`-list of (N x N) array_like
        Input similarity arrays. All arrays should be square and of equal
        size.
    """
    prep = []
    for a in _flatten(aff):
        ac = check_array(a, force_all_finite=True, copy=True)
        prep.append(check_symmetric(ac, raise_warning=False))
    check_consistent_length(*prep)

    # TODO: actually do this check for missing data
    nanaff = len(prep) - np.sum([np.isnan(a) for a in prep], axis=0)
    if np.any(nanaff == 0):
        pass

    return prep
def split(self, reads):
    if self.aggressive:
        maxReads = len(reads) - 1
    else:
        maxReads = len(reads) - self.minCount
    # vTable = self.sigVar.loc[reads]
    vTable = self.sigVar.reindex(reads)
    counts = vTable.apply(pd.Series.value_counts).fillna(0)
    # drop reference ('.') and missing ('*') calls before ranking positions
    for ch in '.*':
        if ch in counts.index:
            counts.drop(ch, inplace=True)
    counts.drop(columns=counts.columns[counts.sum() == 0], inplace=True)
    ent = self._rankEntropy(counts)
    useCols = ent.index[:self.maxFeatures]
    # ent = counts.apply(lambda p: p.sum()*entropy(p.dropna()))\
    #           .sort_values(ascending=False)
    # useCols = ent[ent>=np.percentile(ent,80)].index[:self.maxFeatures]
    if self.log:
        self.log.debug(f'Checking for groups using pos {tuple(useCols)}')
    features = self.sigVar.reindex(reads)[useCols]
    similarity = self._similarity(features, ent.entropy)
    spectral = SpectralClustering(n_clusters=2, affinity='precomputed')
    scaled = check_symmetric(MinMaxScaler().fit_transform(similarity),
                             raise_warning=False)
    clustv = spectral.fit_predict(scaled)
    # use group with most non-ref calls
    useClust = features.groupby(clustv).apply(
        lambda d: ((d != '.').sum() / len(d)).mean()).idxmax()
    size = sum(clustv == useClust)
    if size >= self.minCount and size <= maxReads:
        subset = features.index[clustv == useClust]
        var = features.reindex(subset).apply(
            pd.Series.value_counts).idxmax().values
        return subset, tuple(useCols), tuple(var)
    else:
        return None, None, None
def _stable_normalized(W):
    """
    Performs a stable normalization of `W`, fixing the diagonal at 1/2

    Parameters
    ----------
    W : (N, N) array_like
        Similarity array from SNF

    Returns
    -------
    W : (N, N) np.ndarray
        Stable-normalized similarity array
    """
    # scale by off-diagonal row sums, pin the diagonal at 0.5, and symmetrize
    rowSum = np.sum(W, 1) - np.diag(W)
    rowSum[rowSum == 0] = 1
    W = W / (2 * rowSum)
    np.fill_diagonal(W, 0.5)
    W = check_symmetric(W, raise_warning=False)

    return W
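# A minimal sanity sketch for `_stable_normalized` (illustrative, not part of
# the original module; assumes numpy is imported as `np`):
def _demo_stable_normalized():
    W = np.array([[1.0, 0.4, 0.2],
                  [0.4, 1.0, 0.6],
                  [0.2, 0.6, 1.0]])
    P = _stable_normalized(W)
    # the output is symmetric and its self-similarity is pinned at 1/2
    assert np.allclose(P, P.T)
    assert np.allclose(np.diag(P), 0.5)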
def remove_distance(coexpression, atlas, atlas_info=None, labels=None):
    """
    Corrects for distance-dependent correlation effects in `coexpression`

    Regresses Euclidean distance between regions in `atlas` from correlated
    gene expression array `coexpression`. If `atlas_info` is provided
    different connection types (e.g., cortex-cortex, cortex-subcortex,
    subcortex-subcortex) will be residualized independently.

    Parameters
    ----------
    coexpression : (R x R) array_like
        Correlated gene expression array, where `R` is the number of regions,
        as generated with e.g., `numpy.corrcoef(expression)`.
    atlas : niimg-like object
        A parcellation image in MNI space, where each parcel is identified by
        a unique integer ID
    atlas_info : str or pandas.DataFrame, optional
        Filepath to or pre-loaded dataframe containing information about
        `atlas`. Must have at least columns 'id', 'hemisphere', and
        'structure' containing information mapping atlas IDs to hemisphere
        (i.e., "L", "R") and broad structural class (i.e., "cortex",
        "subcortex", "cerebellum"). Default: None
    labels : (N,) array_like, optional
        If only a subset `N` of the ROIs in `atlas` were used to generate the
        `coexpression` array this array should specify which to consider. Not
        specifying this may cause a ValueError if `atlas` and `atlas_info` do
        not match. Default: None

    Returns
    -------
    residualized : (R, R) numpy.ndarray
        Provided `coexpression` data residualized against spatial distance
        between region pairs
    """
    # load atlas_info, if provided
    atlas = check_niimg_3d(atlas)
    if atlas_info is not None:
        atlas_info = utils.check_atlas_info(atlas, atlas_info, labels=labels)

    if labels is not None and len(labels) != len(coexpression):
        raise ValueError('Provided labels {} are a different length than '
                         'provided coexpression matrix of size {}. Please '
                         'confirm inputs and try again.'.format(
                             labels, coexpression.shape))

    # check that provided coexpression array is symmetric
    check_symmetric(coexpression, raise_exception=True)

    # we'll do basic Euclidean distance correction for now
    # TODO: implement gray matter volume / cortical surface path distance
    centroids = utils.get_centroids(atlas, labels=labels)
    dist = cdist(centroids, centroids, metric='euclidean')

    corr_resid = np.zeros_like(coexpression)
    triu_inds = np.triu_indices_from(coexpression, k=1)
    # if no atlas_info, just residualize all correlations against distance
    if atlas_info is None:
        corr_resid[triu_inds] = _resid_dist(coexpression[triu_inds],
                                            dist[triu_inds])
    # otherwise, we can residualize the different connection types separately
    else:
        triu_inds = np.ravel_multi_index(triu_inds, corr_resid.shape)
        coexpression, dist = coexpression.ravel(), dist.ravel()
        types = ['cortex', 'subcortex']
        for src, tar in itertools.combinations_with_replacement(types, 2):
            # get indices of sources and targets
            sources = np.where(atlas_info.structure == src)[0]
            targets = np.where(atlas_info.structure == tar)[0]
            inds = np.ravel_multi_index(np.ix_(sources, targets),
                                        corr_resid.shape)
            if src != tar:  # e.g., cortex + subcortex
                rev = np.ravel_multi_index(np.ix_(targets, sources),
                                           corr_resid.shape)
                inds = np.append(inds.ravel(), rev.ravel())
            # find intersection of source / target indices + upper triangle
            inds = np.intersect1d(triu_inds, inds)
            back = np.unravel_index(inds, corr_resid.shape)
            # residualize
            corr_resid[back] = _resid_dist(coexpression[inds], dist[inds])

    corr_resid = (corr_resid + corr_resid.T + np.eye(len(corr_resid)))

    return corr_resid
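# A minimal usage sketch based on the docstring above (illustrative, not part
# of the original module): `expression` is a regions-by-genes matrix, `atlas`
# a niimg-like parcellation, and `atlas_info` an optional dataframe or path.
def _demo_remove_distance(expression, atlas, atlas_info=None):
    coexpression = np.corrcoef(expression)
    return remove_distance(coexpression, atlas, atlas_info=atlas_info)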
def snf(*aff, K=20, t=20, alpha=1.0):
    r"""
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    *aff : (N, N) array_like
        Input similarity arrays; all arrays should be square and of equal
        size.
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20
    t : int, optional
        Number of iterations to perform information swapping. Default: 20
    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W : (N, N) np.ndarray
        Fused similarity network of input arrays

    Notes
    -----
    In order to fuse the supplied :math:`m` arrays, each must be normalized. A
    traditional normalization on an affinity matrix would suffer from
    numerical instabilities due to the self-similarity along the diagonal;
    thus, a modified normalization is used:

    .. math::

       \mathbf{P}(i,j) =
         \left\{\begin{array}{rr}
           \frac{\mathbf{W}(i,j)}
                {2 \sum_{k\neq i} \mathbf{W}(i,k)} ,& j \neq i \\
                                               1/2 ,& j = i
         \end{array}\right.

    Under the assumption that local similarities are more important than
    distant ones, a more sparse weight matrix is calculated based on a KNN
    framework:

    .. math::

       \mathbf{S}(i,j) =
         \left\{\begin{array}{rr}
           \frac{\mathbf{W}(i,j)}
                {\sum_{k\in N_{i}} \mathbf{W}(i,k)} ,& j \in N_{i} \\
                                                  0 ,& \text{otherwise}
         \end{array}\right.

    The two weight matrices :math:`\mathbf{P}` and :math:`\mathbf{S}` thus
    provide information about a given patient's similarity to all other
    patients and the `K` most similar patients, respectively.

    These :math:`m` matrices are then iteratively fused. At each iteration,
    the matrices are made more similar to each other via:

    .. math::

       \mathbf{P}^{(v)} = \mathbf{S}^{(v)}
                          \times
                          \frac{\sum_{k\neq v}\mathbf{P}^{(k)}}{m-1}
                          \times
                          (\mathbf{S}^{(v)})^{T},
                          v = 1, 2, ..., m

    After each iteration, the resultant matrices are normalized via the
    equation above. Fusion stops after `t` iterations, or when the matrices
    :math:`\mathbf{P}^{(v)}, v = 1, 2, ..., m` converge.

    The output fused matrix is full rank and can be subjected to clustering
    and classification.
    """
    aff = _check_SNF_inputs(aff)
    Wk = [0] * len(aff)
    Wsum = np.zeros(aff[0].shape)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff) - np.sum([np.isnan(a) for a in aff], axis=0)

    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        mat = mat / np.nansum(mat, axis=1, keepdims=True)
        aff[n] = check_symmetric(mat, raise_warning=False)
        # apply KNN threshold to normalized affinity matrix
        Wk[n] = _find_dominate_set(aff[n], int(K))

    # take sum of all normalized (not thresholded) affinity matrices
    Wsum = np.nansum(aff, axis=0)

    for iteration in range(t):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(Wk[n])
            aw = np.nan_to_num(mat)
            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = nzW @ (Wsum - aw) @ nzW.T / (n_aff - 1)  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff[n] = _B0_normalized(aff0, alpha=alpha)

        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff, axis=0)

    # all entries of `aff` should be identical after the fusion procedure
    # dividing by len(aff) is hypothetically equivalent to selecting one
    # however, if fusion didn't converge then this is just average of `aff`
    W = Wsum / len(aff)

    # normalize fused matrix and update diagonal similarity
    W = W / np.nansum(W, axis=1, keepdims=True)  # TODO: / by NaN
    W = (W + W.T + np.eye(len(W))) / 2

    return W
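# A minimal end-to-end sketch of `snf` (illustrative, not part of the
# original module; it mirrors the doctest of `affinity_matrix` below and
# assumes the package's `datasets`/`compute` layout):
def _demo_snf():
    from scipy.spatial.distance import cdist
    from snf import datasets, compute

    simdata = datasets.load_simdata()
    # one affinity matrix per data modality
    affinities = [compute.affinity_matrix(cdist(d, d)) for d in simdata.data]
    # fuse the per-modality networks into a single (N, N) similarity matrix
    fused = compute.snf(*affinities, K=20, t=20, alpha=1.0)
    return fused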
def affinity_matrix(dist, *, K=20, mu=0.5):
    r"""
    Calculates affinity matrix given distance matrix `dist`

    Uses a scaled exponential similarity kernel to determine the weight of
    each edge based on `dist`. Optional hyperparameters `K` and `mu` determine
    the extent of the scaling (see `Notes`).

    You'd probably be best to use :py:func:`snf.compute.make_affinity` instead
    of this, as that command also handles normalizing the inputs and creating
    the distance matrix.

    Parameters
    ----------
    dist : (N, N) array_like
        Distance matrix
    K : (0, N) int, optional
        Number of neighbors to consider. Default: 20
    mu : (0, 1) float, optional
        Normalization factor to scale similarity kernel. Default: 0.5

    Returns
    -------
    W : (N, N) np.ndarray
        Affinity matrix

    Notes
    -----
    The scaled exponential similarity kernel, based on the probability density
    function of the normal distribution, takes the form:

    .. math::

       \mathbf{W}(i, j) = \frac{1}{\sqrt{2\pi\sigma^2}}
                          \exp\left(-\frac{\rho^2(x_{i},x_{j})}
                                          {2\sigma^2}\right)

    where :math:`\rho(x_{i},x_{j})` is the Euclidean distance (or other
    distance metric, as appropriate) between patients :math:`x_{i}` and
    :math:`x_{j}`. The value for :math:`\sigma` is calculated as:

    .. math::

       \sigma = \mu\ \frac{\overline{\rho}(x_{i},N_{i}) +
                           \overline{\rho}(x_{j},N_{j}) +
                           \rho(x_{i},x_{j})}
                          {3}

    where :math:`\overline{\rho}(x_{i},N_{i})` represents the average value
    of distances between :math:`x_{i}` and its neighbors :math:`N_{1..K}`,
    and :math:`\mu\in(0, 1)\subset\mathbb{R}`.

    Examples
    --------
    >>> from snf import datasets
    >>> simdata = datasets.load_simdata()

    We need to construct a distance matrix before we can create a similarity
    matrix using :py:func:`snf.compute.affinity_matrix`:

    >>> from scipy.spatial.distance import cdist
    >>> dist = cdist(simdata.data[0], simdata.data[0])

    >>> from snf import compute
    >>> aff = compute.affinity_matrix(dist)
    >>> aff.shape
    (200, 200)
    """
    # check inputs
    dist = check_array(dist, force_all_finite=False)
    dist = check_symmetric(dist, raise_warning=False)

    # get mask for potential NaN values and set diagonal to zero
    mask = np.isnan(dist)
    dist[np.diag_indices_from(dist)] = 0

    # sort array and get average distance to K nearest neighbors
    T = np.sort(dist, axis=1)
    TT = np.vstack(T[:, 1:K + 1].mean(axis=1) + np.spacing(1))

    # compute sigma (see equation in Notes)
    sigma = (TT + TT.T + dist) / 3
    msigma = np.ma.array(sigma, mask=mask)  # mask for NaN
    sigma = sigma * np.ma.greater(msigma, np.spacing(1)).data + np.spacing(1)

    # get probability density function with scale = mu * sigma and symmetrize
    scale = (mu * np.nan_to_num(sigma)) + mask
    W = stats.norm.pdf(np.nan_to_num(dist), loc=0, scale=scale)
    W[mask] = np.nan
    W = check_symmetric(W, raise_warning=False)

    return W
pairwise_dist_mat = np.array(df_pairwise_distances.values)
# print(pairwise_dist_mat)
# print(pairwise_dist_mat.shape)
# dists = squareform(pairwise_dist_mat)
# print(dists)

# def check_symmetric(a, rtol=1e-05, atol=1e-08):
#     return np.allclose(a, a.T, rtol=rtol, atol=atol)

df_pairwise_distances.fillna(value=np.nan, inplace=True)
print(df_pairwise_distances)
# print(np.around(df_pairwise_distances.values, 3))
# check_symmetric(np.around(df_pairwise_distances.values, 3),
#                 rtol=1e-05, atol=1e-08)

path_todir = os.getcwd()
pairwise_dist_mat = df_pairwise_distances.values
# print(np.where(~np.allclose(pairwise_dist_mat, pairwise_dist_mat.T,
#                             rtol=1e-05, atol=1e-08)))

pairwise_distmat_repaired = check_symmetric(pairwise_dist_mat)
print('max error: ',
      np.amax(np.abs(pairwise_dist_mat - pairwise_dist_mat.T)))
print('max error repaired: ',
      np.amax(pairwise_distmat_repaired - pairwise_distmat_repaired.T))
print(np.around(pairwise_distmat_repaired))
# t = np.around(pairwise_distmat_repaired)
# pd.DataFrame(t).to_excel("round_dist.xlsx")
# pd.DataFrame(pairwise_distmat_repaired).to_excel(pltname + "_repaired.xlsx")
# pd.DataFrame(pairwise_dist_mat).to_excel(pltname + ".xlsx")
# df_pairwise_distances.to_excel(pltname + ".xlsx")
# os.system("pause")

plt.figure(figsize=(25, 25))
sns.heatmap(df_pairwise_distances, cmap='Blues', linewidth=1)
# plt.show()
plt.savefig('allmondays_distancematrix.png')
def Main_function():
    # -- data loading --
    graphs_path = './Code/'
    graph_datasets = RMN.read_graph_pickle(graphs_path)

    # -- representation-learning parameters --
    p = Parameter["p"]
    q = Parameter["q"]
    num_walks = Parameter["num_walks"]
    walk_length = Parameter["walk_length"]
    dimensions = Parameter["dimensions"]
    knei = [10, 15, 20, 25]
    mu = [0.4, 0.5, 0.6]

    for name, dets in graph_datasets.items():
        print("---------------%s---------------" % name)
        wvecs = []
        # load the training networks
        nx_graph = dets['train_ng']
        merge_graph = dets['train_mg']
        # load the held-out test edges and labels
        train_edges = []
        ground_truth = []
        test_edges = dets["test_edges"]
        test_labels = dets["test_labels"]

        # relabel the nodes so that numbering starts at 0; sorting is required
        nodes = sorted(list(merge_graph.nodes()))
        if nodes[0] > 0:
            train_edges.extend([[i, e[0] - 1, e[1] - 1, 1]
                                for i in range(len(nx_graph))
                                for e in nx_graph[i].edges()])
            train_merge = nx.relabel_nodes(merge_graph, lambda x: int(x) - 1)
            train_nxgraph = [nx.relabel_nodes(g, lambda x: int(x) - 1)
                             for g in nx_graph]
            test_edges = [[e[0] - 1, e[1] - 1] for i in test_edges for e in i]
            nodes = list(train_merge.nodes())
        else:
            train_edges.extend([[i, e[0], e[1], 1]
                                for i in range(len(nx_graph))
                                for e in nx_graph[i].edges()])
            train_nxgraph = copy.deepcopy(nx_graph)
            train_merge = copy.deepcopy(merge_graph)

        # node ids are not always consecutive; re-index the test edges so
        # that they are
        restru_test_edges = []
        for i in test_edges:
            restru_test_edges.append([[nodes.index(e[0]), nodes.index(e[1])]
                                      for e in i])

        str_graph = nx.relabel_nodes(train_merge, lambda x: str(x))
        # build the OpenNE-style graphs so that existing single-layer network
        # algorithms can be used for comparison
        G = opgraph.Graph()
        DG = str_graph.to_directed()
        G.read_g(DG)
        nx_para_graph = []
        for g in train_nxgraph:
            str_graph = nx.relabel_nodes(g, lambda x: str(x))
            G = opgraph.Graph()
            DG = str_graph.to_directed()
            G.read_g(DG)
            nx_para_graph.append(G)

        # ---------------------- baseline experiments ----------------------
        # 1. merged network + Adamic-Adar
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                if e[0] in train_merge.nodes() and e[1] in train_merge.nodes():
                    y_pred.append(
                        list(nx.adamic_adar_index(train_merge, [e]))[0][2])
                else:
                    # when a node does not exist, the probability should be 0
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("merge-network:%f" % (sum(auc) / len(auc)))

        # 2. Ohmnet: multilayer network embedding, Bioinformatics'2017
        ohmnet_walks = []
        orignal_walks = []
        LG = copy.deepcopy(train_nxgraph)
        on = ohmnet.OhmNet(LG, p=p, q=q, num_walks=num_walks,
                           walk_length=walk_length, dimension=dimensions,
                           window_size=10, n_workers=8, n_iter=5, out_dir='.')
        for ns in on.embed_multilayer():
            orignal_walks.append(ns)
            on_walks = [n.split("_")[2] for n in ns]
            ohmnet_walks.append([str(step) for step in on_walks])
        Ohmnet_model = Node2vec.N2V.learn_embeddings(ohmnet_walks, dimensions,
                                                     workers=5,
                                                     window_size=10, niter=5)
        Ohmnet_wvecs = np.array(
            [Ohmnet_model.get_vector(str(i)) for i in nodes])
        y_pred = []
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                # if a key is missing from the vocabulary, default to 0
                if str(e[0]) in Ohmnet_model.index2entity and \
                        str(e[1]) in Ohmnet_model.index2entity:
                    y_pred.append(cosine_similarity([
                        Ohmnet_model.get_vector(str(e[0])),
                        Ohmnet_model.get_vector(str(e[1]))])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("ohmnet-network:%f" % (sum(auc) / len(auc)))

        # 3. MNE: scalable multiplex network embedding, IJCAI'2018
        edge_data_by_type = {}
        all_edges = list()
        all_nodes = list()
        for e in train_edges:
            if e[0] not in edge_data_by_type:
                edge_data_by_type[e[0]] = list()
            edge_data_by_type[e[0]].append((e[1], e[2]))
            all_edges.append((e[1], e[2]))
            all_nodes.append(e[1])
            all_nodes.append(e[2])
        all_nodes = list(set(all_nodes))
        all_edges = list(set(all_edges))
        edge_data_by_type['Base'] = all_edges
        MNE_model = MNE.train_model(edge_data_by_type)
        local_model = dict()
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for pos in range(len(MNE_model['index2word'])):
                local_model[MNE_model['index2word'][pos]] = \
                    MNE_model['base'][pos] + 0.5 * np.dot(
                        MNE_model['addition'][index][pos],
                        MNE_model['tran'][index])
            for e in layer:
                # if a key is missing from the vocabulary, default to 0
                if str(e[0]) in MNE_model['index2word'] and \
                        str(e[1]) in MNE_model['index2word']:
                    y_pred.append(cosine_similarity(
                        [local_model[str(e[0])],
                         local_model[str(e[1])]])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("MNE:%f" % (sum(auc) / len(auc)))

        # 4. the three PMNE algorithms
        merged_networks = dict()
        merged_networks['training'] = dict()
        merged_networks['test_true'] = dict()
        merged_networks['test_false'] = dict()
        for index, g in enumerate(train_nxgraph):
            merged_networks['training'][index] = set(g.edges())
            merged_networks['test_true'][index] = restru_test_edges[index]
            merged_networks['test_false'][index] = \
                test_edges[index][len(test_edges):]
        performance_1, performance_2, performance_3 = \
            main.Evaluate_PMNE_methods(merged_networks)
        print("PMNE(n):%f" % performance_1)
        print("PMNE(r):%f" % performance_2)
        print("MNE(c):%f" % performance_3)

        # 5. MELL: multilayer network node representation learning, WWW'2018
        L = len(nx_graph)
        N = max([int(n) for n in train_merge.nodes()]) + 1
        # the row dimension must be consistent to build the adjacency matrix
        N = max(N, train_merge.number_of_nodes())
        directed = True
        d = 128
        k = 3
        lamm = 10
        beta = 1
        gamma = 1
        MELL_wvecs = MELL_model(L, N, directed, train_edges, d, k, lamm,
                                beta, gamma)
        # previously 500 iterations, but some datasets error out at 500,
        # so it is set to 30
        MELL_wvecs.train(30)
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                # if a node is missing, default to 0
                if e[0] in all_nodes and e[1] in all_nodes:
                    y_pred.append(MELL_wvecs.predict((index, e[0], e[1])))
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("MELL:%f" % (sum(auc) / len(auc)))

        # 6. basic similarity baselines: CN, JC, AA
        auc1 = []
        auc2 = []
        auc3 = []
        for index, layer in enumerate(restru_test_edges):
            y_pred_cn = []
            y_pred_jc = []
            y_pred_AA = []
            for e in layer:
                if e[0] in train_nxgraph[index].nodes() and \
                        e[1] in train_nxgraph[index].nodes():
                    y_pred_cn.append(len(list(nx.common_neighbors(
                        train_nxgraph[index], e[0], e[1]))))
                    y_pred_jc.append(list(nx.jaccard_coefficient(
                        train_nxgraph[index], [e]))[0][2])
                    y_pred_AA.append(list(nx.adamic_adar_index(
                        train_nxgraph[index], [e]))[0][2])
                else:
                    # if a node is absent, it shares no neighbors
                    y_pred_cn.append(0)
                    y_pred_jc.append(0)
                    y_pred_AA.append(0)
            # compute the AUC for each baseline
            auc1.append(roc_auc_score(test_labels[index], y_pred_cn))
            auc2.append(roc_auc_score(test_labels[index], y_pred_jc))
            auc3.append(roc_auc_score(test_labels[index], y_pred_AA))
        print("CN-network:%f" % (sum(auc1) / len(auc1)))
        print("JC-network:%f" % (sum(auc2) / len(auc2)))
        print("AA-network:%f" % (sum(auc3) / len(auc3)))

        # 7. single-layer Node2vec
        auc = []
        for index, G in enumerate(nx_para_graph):
            model_nf = node2vec.Node2vec(G, walk_length, num_walks,
                                         dimensions, p=p, q=q, dw=True)
            index_num = sorted([int(i) for i in model_nf.vectors.keys()])
            g_embedding = [model_nf.vectors[str(i)] for i in index_num]
            y_pred = []
            for e in restru_test_edges[index]:
                # if a key is missing from the vocabulary, default to 0
                if str(e[0]) in G.G.nodes() and str(e[1]) in G.G.nodes():
                    y_pred.append(cosine_similarity([
                        model_nf.vectors[str(e[0])],
                        model_nf.vectors[str(e[1])]])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("Node2vec: %f" % (sum(auc) / len(auc)))

        # 8. Network + Embedding (N2V) + SNF4st network representation
        for k in knei:
            for m in mu:
                auc_final = []
                for i in range(2, 10):  # repeat to average the results
                    # arguments: the KNN K value, the mu value, the K used in
                    # the remaining steps, and the number of iterations
                    # (20 is usually enough for convergence)
                    network, groundtruth, best, second = NFC.cluster_E(
                        nx_para_graph, ground_truth, Parameter, nodes,
                        k, m, k, 30)  # CKM\V(20, 0.5, 20, 20)
                    # build the adjacency matrix from the fused similarity
                    # matrix; CKM(20), Vickers(15)
                    Network_Adj = _find_dominate_set(
                        check_symmetric(network, raise_warning=False), K=k)
                    # build a graph from the adjacency matrix
                    g = nx.from_numpy_matrix(Network_Adj)
                    auc = []
                    for index, layer in enumerate(restru_test_edges):
                        y_pred = []
                        for e in layer:
                            if e[0] in train_nxgraph[index].nodes() and \
                                    e[1] in train_nxgraph[index].nodes():
                                # score each test pair with Adamic-Adar on
                                # the fused graph
                                y_pred.append(list(nx.adamic_adar_index(
                                    g, [(nodes.index(e[0]),
                                         nodes.index(e[1]))]))[0][2])
                            else:
                                y_pred.append(0)
                        # compute the AUC
                        auc.append(roc_auc_score(test_labels[index], y_pred))
                    auc_final.append(sum(auc) / len(auc))
                value = max(auc_final)
                average = sum(auc_final) / len(auc_final)
                print("K=%d Mu=%.2f Max:index({%d})->%f" %
                      (k, m, auc_final.index(value), value))
                print("K=%d Mu=%f Ave:->%f" % (k, m, average))
def my_mds_plot():
    data = pd.io.parsers.read_csv(  # pandas handles this in a better way
        'pca_counts.csv',
        header="infer"  # the first row contains the recipients' names
    )
    if DEBUG:
        data = pd.io.parsers.read_csv(
            'pca_counts_min.csv',
            header="infer"
        )
    print("Data\n", data)
    dissM = data.values[:, 1:]  # the first column contains the suppliers' names
    countries = data.values[:, 0]
    # amax = np.amax(dissM)
    # dissM /= amax  # uncomment these lines to plot MDS in a normalized 2D
    dissM = np.array(dissM, dtype=float)

    # repair dissM, which is not symmetric
    dissM = check_symmetric(dissM)
    if DEBUG:
        print(dissM)
    with open("log.txt", "w") as out:
        out.write("".join([x for x in countries]))
        out.write("\n".join([str(x) for x in dissM]))

    mds = manifold.MDS(n_components=2, n_init=20, metric=True, max_iter=3000,
                       eps=1e-9, dissimilarity="precomputed", random_state=RS)
    fit = mds.fit(dissM)
    pos = fit.embedding_
    stress = fit.stress_

    plt.scatter(pos[:, 0], pos[:, 1], marker='o')
    for label, x, y in zip(countries, pos[:, 0], pos[:, 1]):
        plt.annotate(
            label,
            xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.1),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.show()
    print(stress)

    # from sklearn import datasets
    # digits = datasets.load_digits()
    # print(digits)
    # tsne = manifold.TSNE(n_components=2, random_state=2).fit_transform(dissM)
    # plt.scatter(tsne, tsne, 3, marker='o')
    # for label, x, y in zip(countries, pos[:, 0], pos[:, 1]):
    #     plt.annotate(
    #         label,
    #         xy=(x, y), xytext=(-20, 20),
    #         textcoords='offset points', ha='right', va='bottom',
    #         bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.1),
    #         arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.show()
def plot_partial_corrcoef(partial_corrcoef, ax=None, cbar=True, figsize=None,
                          filename=None, title='Partial correlation',
                          **kwargs):
    """Plot the partial correlation coefficient matrix.

    Parameters
    ----------
    partial_corrcoef : array-like of shape (n_features, n_features)
        Partial correlation coefficient matrix.

    ax : matplotlib Axes, default None
        Target axes instance.

    cbar : bool, default True
        If True, draw a colorbar.

    figsize : tuple, default None
        Tuple denoting figure size of the plot.

    filename : str, default None
        If provided, save the current figure.

    title : string, default 'Partial correlation'
        Axes title. To disable, pass None.

    **kwargs : dict
        Other keywords passed to ``ax.pcolormesh``.

    Returns
    -------
    ax : matplotlib Axes
        Axes on which the plot was drawn.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from kenchi.plotting import plot_partial_corrcoef
    >>> from sklearn.datasets import make_sparse_spd_matrix
    >>> A = make_sparse_spd_matrix(dim=20, norm_diag=True, random_state=0)
    >>> plot_partial_corrcoef(A) # doctest: +ELLIPSIS
    <matplotlib.axes._subplots.AxesSubplot object at 0x...>
    >>> plt.show() # doctest: +SKIP

    .. figure:: images/plot_partial_corrcoef.png
    """
    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    partial_corrcoef = check_array(partial_corrcoef)
    partial_corrcoef = check_symmetric(partial_corrcoef,
                                       raise_exception=True)

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title)

    # add the pcolormesh kwargs here
    kwargs.setdefault('cmap', 'RdBu')
    kwargs.setdefault('edgecolors', 'white')
    kwargs.setdefault('vmin', -1.)
    kwargs.setdefault('vmax', 1.)

    # draw the heatmap
    mesh = ax.pcolormesh(np.ma.masked_equal(partial_corrcoef, 0.), **kwargs)

    ax.set_aspect('equal')
    ax.set_facecolor('grey')

    # invert the y axis to show the plot in matrix form
    ax.invert_yaxis()

    if cbar:
        # create an axes on the right side of ax
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', '5%', pad=0.1)
        ax.get_figure().colorbar(mesh, cax=cax)

    if filename is not None:
        ax.get_figure().savefig(filename)

    return ax
def snf2_np(*aff, numofCom, K=20, t=20, alpha=1.0):
    """
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    *aff : (N, N) array_like
        Input similarity arrays; all arrays should be square but need not be
        of equal size. Note: the common samples must appear first in each
        matrix.
    numofCom : int, required
        Number of common samples across all the matrices
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20
    t : int, optional
        Number of iterations to perform information swapping. Default: 20
    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W : (N, N) array_like
        Output similarity arrays: fused similarity networks of the input
        arrays
    """
    print("Start applying diffusion!")

    aff = _check_SNF2_inputs(aff)
    newW = [0] * len(aff)
    aff_com = [0] * len(aff)

    # first, normalize the different networks to avoid scale problems
    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        # mat = mat / np.nansum(mat, axis=1, keepdims=True)
        mat = _stable_normalized(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)
        aff_com[n] = aff[n][0:numofCom, :][:, 0:numofCom]

        # apply KNN threshold to normalized affinity matrix;
        # we need to crop the intersecting samples from the `newW` matrices
        newW[n] = _find_dominate_set(aff[n], int(K))
        newW[n] = newW[n][:, 0:numofCom]

    # take sum of all normalized (not thresholded) affinity matrices of the
    # intersecting part
    Wsum = np.nansum(aff_com, axis=0)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff_com) - np.sum([np.isnan(a) for a in aff_com], axis=0)

    for iteration in range(t):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(newW[n])
            mat = mat[0:numofCom, :][:, 0:numofCom]
            aw = np.nan_to_num(mat)
            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = np.matmul(np.matmul(nzW, (Wsum - aw) / (n_aff - 1)),
                             nzW.T)  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff[n] = _B0_normalized(aff0, alpha=alpha)
            aff_com[n] = aff[n][0:numofCom, :][:, 0:numofCom]

        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff_com, axis=0)

    for n, mat in enumerate(aff):
        mat = _stable_normalized(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

    return aff
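# A hypothetical call sketch for `snf2_np` (names illustrative, not part of
# the original module): two views whose first 100 rows/columns are the shared
# samples, each ordered common-samples-first as the docstring requires.
#
#     fused_views = snf2_np(aff_view1, aff_view2, numofCom=100, K=20, t=20)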
def snf2(args, aff, dicts_common, dicts_unique, original_order):
    """
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    aff : (N, N) pandas dataframe
        Input similarity arrays; all arrays should be square but need not be
        of equal size.
    dicts_common : dictionaries, required
        Dictionaries for getting common samples from different views
        Example: dicts_common[(0, 1)] == dicts_common[(1, 0)], meaning the
        common patients between views 1 & 2
    dicts_unique : dictionaries, required
        Dictionaries for getting unique samples for different views
        Example: dicts_unique[(0, 1)], meaning the unique samples from view 1
        that are not in view 2; dicts_unique[(1, 0)], meaning the unique
        samples from view 2 that are not in view 1
    original_order : lists, required
        The original order of each view
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20
    t : int, optional
        Number of iterations to perform information swapping. Default: 20
    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W : (N, N) pandas dataframe
        Output similarity arrays: fused similarity networks of the input
        arrays
    """
    print("Start applying diffusion! with new method")
    start_time = time.time()

    newW = [0] * len(aff)

    # first, normalize the different networks to avoid scale problems;
    # this is compatible with pandas dataframes
    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        # mat = mat / np.nansum(mat, axis=1, keepdims=True)
        mat = _stable_normalized_pd(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

        # apply KNN threshold to normalized affinity matrix;
        # we need to crop the intersecting samples from the `newW` matrices
        neighbor_size = min(int(args.neighbor_size), aff[n].shape[0])
        newW[n] = _find_dominate_set(aff[n], neighbor_size)

    for iteration in range(args.fusing_iteration):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = newW[n]  # TODO: not sure whether this is a deep copy or not

            # the goal is to update aff[n] as the average of all the diffused
            # matrices; make a copy of aff[n] and zero it out
            aff0_copy = aff[n].copy()
            for col in aff0_copy.columns:
                aff0_copy[col].values[:] = 0

            for j, mat_tofuse in enumerate(aff):
                if n == j:
                    continue

                # reorder mat_tofuse to have the common samples first
                mat_tofuse = mat_tofuse.reindex(
                    (sorted(dicts_common[(j, n)]) +
                     sorted(dicts_unique[(j, n)])),
                    axis=1,
                )
                mat_tofuse = mat_tofuse.reindex(
                    (sorted(dicts_common[(j, n)]) +
                     sorted(dicts_unique[(j, n)])),
                    axis=0,
                )

                # next, crop mat_tofuse down to the common samples
                num_common = len(dicts_common[(n, j)])
                to_drop_mat = mat_tofuse.columns[
                    num_common:mat_tofuse.shape[1]].values.tolist()
                mat_tofuse_crop = mat_tofuse.drop(to_drop_mat, axis=1)
                mat_tofuse_crop = mat_tofuse_crop.drop(to_drop_mat, axis=0)

                # next, add the similarity from the view to be fused to the
                # current view's identity matrix
                nzW_identity = pd.DataFrame(
                    data=np.identity(nzW.shape[0]),
                    index=original_order[n],
                    columns=original_order[n],
                )
                mat_tofuse_union = nzW_identity + mat_tofuse_crop
                mat_tofuse_union.fillna(0.0, inplace=True)
                mat_tofuse_union = _stable_normalized_pd(mat_tofuse_union)
                mat_tofuse_union = mat_tofuse_union.reindex(
                    original_order[n], axis=1)
                mat_tofuse_union = mat_tofuse_union.reindex(
                    original_order[n], axis=0)

                # now we are ready to do the diffusion
                nzW_T = np.transpose(nzW)
                # np.matmul does not work here, but .dot() does
                aff0_temp = nzW.dot(mat_tofuse_union.dot(nzW_T))
                aff0_temp = _B0_normalized(
                    aff0_temp, alpha=args.normalization_factor)
                aff0_copy = np.add(aff0_temp, aff0_copy)

            aff[n] = np.divide(aff0_copy, len(aff) - 1)

    for n, mat in enumerate(aff):
        mat = _stable_normalized_pd(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

    end_time = time.time()
    print("Diffusion ends! Times: {}s".format(end_time - start_time))
    return aff