def test_distance_matrix_symmetric(data):
    X, y = data
    model = SimilarityForestCluster()

    model.fit(X)
    distance_matrix = model.distance_matrix_
    check_symmetric(squareform(distance_matrix))
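For reference, a minimal self-contained sketch (separate from the test above, assuming only numpy, scipy, and scikit-learn) of why `squareform` appears here: `pdist` returns a condensed distance vector, while `check_symmetric` expects a square matrix.

# Hedged sketch: convert a condensed distance vector to a square, symmetric matrix.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.utils.validation import check_symmetric

X = np.random.rand(10, 3)
condensed = pdist(X)               # shape (45,): condensed pairwise distances
square = squareform(condensed)     # shape (10, 10): symmetric with zero diagonal
check_symmetric(square, raise_exception=True)  # passes silently for symmetric input
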
 def _scipy_to_igraph(self, matrix):
     matrix.eliminate_zeros()
     sources, targets = matrix.nonzero()
     weights = matrix[sources, targets]
     graph = ig.Graph(n=matrix.shape[0],
                      edges=list(zip(sources, targets)),
                      directed=True,
                      edge_attrs={'weight': weights})
     
     try:
         check_symmetric(matrix, raise_exception=True)
         graph = graph.as_undirected()
     except ValueError:
         pass
     
     return graph
    def _create_graph(self):
        distance_to_adjacent = np.vectorize(lambda x:
                                            1 if x < self.alpha else 0)

        if self.data_type == 'cloud':
            self.adjacent_matrix = distance_to_adjacent(
                distance_matrix(self.data, self.data, p=2))
        else:
            check_symmetric(self.data)
            self.adjacent_matrix = distance_to_adjacent(self.data)

        self.adjacent_matrix = self.adjacent_matrix - np.identity(
            self.data.shape[0])
        return nx.from_numpy_matrix(self.adjacent_matrix)
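A hedged standalone sketch of the same idea outside the class context (the `alpha` threshold below is hypothetical): threshold a Euclidean distance matrix into a 0/1 adjacency matrix, drop self-loops, and build a networkx graph.

import networkx as nx
import numpy as np
from scipy.spatial import distance_matrix

points = np.random.rand(8, 2)
alpha = 0.5                                    # hypothetical distance threshold
D = distance_matrix(points, points, p=2)
adjacent = (D < alpha).astype(int)             # 1 where points are closer than alpha
np.fill_diagonal(adjacent, 0)                  # remove self-loops, as in the snippet
G = nx.from_numpy_array(adjacent)              # from_numpy_matrix in older networkx
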
Example #4
def snf4st(*aff, K=20, t=20, alpha=1.0, beta=0.4, gamma=0.3):  # ckm 0.4 0.3
    '''Improved variant of the original SNF function.'''

    aff = _check_SNF_inputs(aff)
    Wk = [0] * len(aff)
    Wsum = np.zeros(aff[0].shape)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff) - np.sum([np.isnan(a) for a in aff], axis=0)

    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        mat = mat / np.nansum(mat, axis=1,
                              keepdims=True)  # divide each row by its row sum, which makes the matrix asymmetric
        aff[n] = check_symmetric(mat, raise_warning=False)  # the returned matrix is symmetric again
        # apply KNN threshold to normalized affinity matrix
        Wk[n] = _find_dominate_set(aff[n], int(K))  # this weighted adjacency matrix is not symmetric; could be treated as directed?

    # take sum of all normalized (not thresholded) affinity matrices
    Wsum = np.nansum(aff, axis=0)

    for iteration in range(t):  # t is the number of iterations
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(Wk[n])  # thresholded adjacency matrix of one of the networks
            aw = np.nan_to_num(mat)  # sanitize: replace NaN entries of this network's weight matrix with 0

            # compute weighted similarity over second-order neighbors
            #dia_index = range(len(nzW))
            #nzW[dia_index, dia_index] = 1
            nzwdig = nzW - np.diag(np.diag(nzW))  # zero the diagonal so self-loops carry no weight
            nzw_clo = np.nansum(nzwdig, axis=0)  # per-node weight totals, used as the denominator below
            ps = nzwdig / np.array([nzw_clo]).T
            ps = np.nan_to_num(ps)
            nzthird = ps @ nzwdig  # numerator term for the enhanced-network update

            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = (1 - gamma - beta) * (
                nzW @ (Wsum - aw) /
                (n_aff - 1)) + beta * (nzW @ (Wsum - aw) @ nzW.T /
                                       (n_aff - 1)) + gamma * (
                                           nzthird @ (Wsum - aw) @ nzthird.T /
                                           (n_aff - 1))  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff0 = np.nan_to_num(aff0)
            aff[n] = _B0_normalized(aff0, alpha=alpha)
        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff, axis=0)

    # all entries of `aff` should be identical after the fusion procedure
    # dividing by len(aff) is hypothetically equivalent to selecting one
    # however, if fusion didn't converge then this is just average of `aff`
    W = Wsum / len(aff)

    # normalize fused matrix and update diagonal similarity
    W = W / np.nansum(W, axis=1, keepdims=True)  # TODO: / by NaN
    W = (W + W.T + np.eye(len(W))) / 2

    return np.nan_to_num(W)
Example #5
def _check_SNF2_inputs(aff):
    """
    Confirms inputs to SNF2 are appropriate

    Parameters
    ----------
    aff : `m`-list of (N x N) array_like
        Input similarity arrays. All arrays should be square but need not be of equal size.
    """

    prep = []
    for a in _flatten(aff):
        ac = check_array(a, force_all_finite=True, copy=True)
        prep.append(check_symmetric(ac, raise_warning=False))

    return prep
Example #6
def _B0_normalized(W, alpha=1.0):
    """
    Adds `alpha` to the diagonal of `W`
    Parameters
    ----------
    W : (N, N) array_like
        Similarity array from SNF
    alpha : (0, 1) float, optional
        Factor to add to diagonal of `W` to increase subject self-affinity.
        Default: 1.0
    Returns
    -------
    W : (N, N) np.ndarray
        Normalized similarity array
    """

    # add `alpha` to the diagonal and symmetrize `W`
    W = W + (alpha * np.eye(len(W)))
    W = check_symmetric(W, raise_warning=False)

    return W
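A hedged usage sketch of `_B0_normalized` on a toy array (not from the original source; it assumes the numpy and `check_symmetric` imports used by these snippets): the helper adds `alpha` to the diagonal and relies on `check_symmetric(..., raise_warning=False)` to average `W` with its transpose.

import numpy as np

W = np.array([[0.0, 0.2, 0.4],
              [0.2, 0.0, 0.1],
              [0.5, 0.1, 0.0]])          # deliberately slightly asymmetric
W_norm = _B0_normalized(W, alpha=1.0)
# diagonal is now 1.0 and W_norm equals its own transpose
print(np.diag(W_norm), np.allclose(W_norm, W_norm.T))
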
Example #7
def _check_SNF_inputs(aff):
    """
    Confirms inputs to SNF are appropriate
    Parameters
    ----------
    aff : `m`-list of (N x N) array_like
        Input similarity arrays. All arrays should be square and of equal size.
    """

    prep = []
    for a in _flatten(aff):
        ac = check_array(a, force_all_finite=True, copy=True)
        prep.append(check_symmetric(ac, raise_warning=False))
    check_consistent_length(*prep)

    # TODO: actually do this check for missing data
    nanaff = len(prep) - np.sum([np.isnan(a) for a in prep], axis=0)
    if np.any(nanaff == 0):
        pass

    return prep
Example #8
 def split(self, reads):
     if self.aggressive:
         maxReads = len(reads) - 1
     else:
         maxReads = len(reads) - self.minCount
     #vTable = self.sigVar.loc[reads]
     vTable = self.sigVar.reindex(reads)
     counts = vTable.apply(pd.Series.value_counts).fillna(0)
     for ch in '.*':
         if ch in counts.index:
             counts.drop(ch, inplace=True)
     counts.drop(columns=counts.columns[counts.sum() == 0], inplace=True)
     ent = self._rankEntropy(counts)
     useCols = ent.index[:self.maxFeatures]
     #ent = counts.apply(lambda p: p.sum()*entropy(p.dropna()))\
     #            .sort_values(ascending=False)
     #useCols    = ent[ent>=np.percentile(ent,80)].index[:self.maxFeatures]
     if self.log:
         self.log.debug(f'Checking for groups using pos {tuple(useCols)}')
     features = self.sigVar.reindex(reads)[useCols]
     similarity = self._similarity(features, ent.entropy)
     spectral = SpectralClustering(n_clusters=2, affinity='precomputed')
     scaled = check_symmetric(MinMaxScaler().fit_transform(similarity),
                              raise_warning=False)
     clustv = spectral.fit_predict(scaled)
     #use group with most non-ref calls
     useClust = features.groupby(clustv).apply(
         lambda d: ((d != '.').sum() / len(d)).mean()).idxmax()
     size = sum(clustv == useClust)
     if size >= self.minCount and size <= maxReads:
         subset = features.index[clustv == useClust]
         var = features.reindex(subset).apply(
             pd.Series.value_counts).idxmax().values
         return subset, tuple(useCols), tuple(var)
     else:
         return None, None, None
Example #9
def _stable_normalized(W):
    """
    Stably normalizes `W`, pinning the diagonal at 0.5

    Parameters
    ----------
    W : (N, N) array_like
        Similarity array from SNF

    Returns
    -------
    W : (N, N) np.ndarray
        Stable-normalized similarity array
    """

    # rescale off-diagonal weights, pin the diagonal at 0.5, and symmetrize `W`
    rowSum = np.sum(W, 1) - np.diag(W)
    rowSum[rowSum == 0] = 1

    W = W / (2 * rowSum)
    np.fill_diagonal(W, 0.5)
    W = check_symmetric(W, raise_warning=False)

    return W
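A hedged sketch of what `_stable_normalized` produces for a symmetric toy input (again assuming the numpy and `check_symmetric` imports used by these snippets): the output remains symmetric and its diagonal is pinned at 0.5.

import numpy as np

rng = np.random.default_rng(0)
W = rng.random((5, 5))
W = (W + W.T) / 2                      # symmetric toy similarity matrix
P = _stable_normalized(W)
print(np.allclose(P, P.T))             # True: output is symmetric
print(np.allclose(np.diag(P), 0.5))    # True: diagonal pinned at 0.5
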
Example #10
def remove_distance(coexpression, atlas, atlas_info=None, labels=None):
    """
    Corrects for distance-dependent correlation effects in `coexpression`

    Regresses Euclidean distance between regions in `atlas` from correlated
    gene expression array `coexpression`. If `atlas_info` is provided different
    connection types (e.g., cortex-cortex, cortex-subcortex, subcortex-
    subcortex) will be residualized independently.

    Parameters
    ----------
    coexpression : (R x R) array_like
        Correlated gene expression array, where `R` is the number of regions,
        as generated with e.g., `numpy.corrcoef(expression)`.
    atlas : niimg-like object
        A parcellation image in MNI space, where each parcel is identified by a
        unique integer ID
    atlas_info : str or pandas.DataFrame, optional
        Filepath to or pre-loaded dataframe containing information about
        `atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
        containing information mapping atlas IDs to hemisphere (i.e., "L", "R")
        and broad structural class (i.e., "cortex", "subcortex", "cerebellum").
        Default: None
    labels : (N,) array_like, optional
        If only a subset `N` of the ROIs in `atlas` were used to generate the
        `coexpression` array this array should specify which to consider. Not
        specifying this may cause a ValueError if `atlas` and `atlas_info` do
        not match. Default: None

    Returns
    -------
    residualized : (R, R) numpy.ndarray
        Provided `coexpression` data residualized against spatial distance
         between region pairs
    """

    # load atlas_info, if provided
    atlas = check_niimg_3d(atlas)
    if atlas_info is not None:
        atlas_info = utils.check_atlas_info(atlas, atlas_info, labels=labels)
        if labels is not None and len(labels) != len(coexpression):
            raise ValueError('Provided labels {} are a different length than '
                             'provided coexpression matrix of size {}. Please '
                             'confirm inputs and try again.'.format(
                                 labels, coexpression.shape))

    # check that provided coexpression array is symmetric
    check_symmetric(coexpression, raise_exception=True)

    # we'll do basic Euclidean distance correction for now
    # TODO: implement gray matter volume / cortical surface path distance
    centroids = utils.get_centroids(atlas, labels=labels)
    dist = cdist(centroids, centroids, metric='euclidean')

    corr_resid = np.zeros_like(coexpression)
    triu_inds = np.triu_indices_from(coexpression, k=1)
    # if no atlas_info, just residualize all correlations against distance
    if atlas_info is None:
        corr_resid[triu_inds] = _resid_dist(coexpression[triu_inds],
                                            dist[triu_inds])
    # otherwise, we can residualize the different connection types separately
    else:
        triu_inds = np.ravel_multi_index(triu_inds, corr_resid.shape)
        coexpression, dist = coexpression.ravel(), dist.ravel()
        types = ['cortex', 'subcortex']
        for src, tar in itertools.combinations_with_replacement(types, 2):
            # get indices of sources and targets
            sources = np.where(atlas_info.structure == src)[0]
            targets = np.where(atlas_info.structure == tar)[0]
            inds = np.ravel_multi_index(np.ix_(sources, targets),
                                        corr_resid.shape)
            if src != tar:  # e.g., cortex + subcortex
                rev = np.ravel_multi_index(np.ix_(targets, sources),
                                           corr_resid.shape)
                inds = np.append(inds.ravel(), rev.ravel())
            # find intersection of source / target indices + upper triangle
            inds = np.intersect1d(triu_inds, inds)
            back = np.unravel_index(inds, corr_resid.shape)
            # residualize
            corr_resid[back] = _resid_dist(coexpression[inds], dist[inds])

    corr_resid = (corr_resid + corr_resid.T + np.eye(len(corr_resid)))

    return corr_resid
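The `_resid_dist` helper is not shown in this snippet; as a hedged sketch of the idea, it presumably fits the coexpression values on distance and keeps the residuals, roughly like the stand-in below (the name `_resid_dist_sketch` is hypothetical).

import numpy as np

def _resid_dist_sketch(coexpr, dist):
    # fit a straight line of coexpression on distance and return the residuals
    slope, intercept = np.polyfit(dist, coexpr, deg=1)
    return coexpr - (slope * dist + intercept)

rng = np.random.default_rng(0)
dist = rng.random(100)
coexpr = -0.5 * dist + 0.1 * rng.standard_normal(100)
resid = _resid_dist_sketch(coexpr, dist)
print(np.corrcoef(resid, dist)[0, 1])   # approximately 0 after residualization
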
Example #11
def snf(*aff, K=20, t=20, alpha=1.0):
    r"""
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    *aff : (N, N) array_like
        Input similarity arrays; all arrays should be square and of equal size.
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20
    t : int, optional
        Number of iterations to perform information swapping. Default: 20
    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W: (N, N) np.ndarray
        Fused similarity network of input arrays

    Notes
    -----
    In order to fuse the supplied :math:`m` arrays, each must be normalized. A
    traditional normalization on an affinity matrix would suffer from numerical
    instabilities due to the self-similarity along the diagonal; thus, a
    modified normalization is used:

    .. math::

       \mathbf{P}(i,j) =
         \left\{\begin{array}{rr}
           \frac{\mathbf{W}(i,j)}
                 {2 \sum_{k\neq i}^{} \mathbf{W}(i,k)} ,& j \neq i \\
                                                       1/2 ,& j = i
         \end{array}\right.

    Under the assumption that local similarities are more important than
    distant ones, a more sparse weight matrix is calculated based on a KNN
    framework:

    .. math::

       \mathbf{S}(i,j) =
         \left\{\begin{array}{rr}
           \frac{\mathbf{W}(i,j)}
                 {\sum_{k\in N_{i}}^{}\mathbf{W}(i,k)} ,& j \in N_{i} \\
                                                         0 ,& \text{otherwise}
         \end{array}\right.

    The two weight matrices :math:`\mathbf{P}` and :math:`\mathbf{S}` thus
    provide information about a given patient's similarity to all other
    patients and the `K` most similar patients, respectively.

    These :math:`m` matrices are then iteratively fused. At each iteration, the
    matrices are made more similar to each other via:

    .. math::

       \mathbf{P}^{(v)} = \mathbf{S}^{(v)}
                          \times
                          \frac{\sum_{k\neq v}^{}\mathbf{P}^{(k)}}{m-1}
                          \times
                          (\mathbf{S}^{(v)})^{T},
                          v = 1, 2, ..., m

    After each iteration, the resultant matrices are normalized via the
    equation above. Fusion stops after `t` iterations, or when the matrices
    :math:`\mathbf{P}^{(v)}, v = 1, 2, ..., m` converge.

    The output fused matrix is full rank and can be subjected to clustering and
    classification.
    """

    aff = _check_SNF_inputs(aff)
    Wk = [0] * len(aff)
    Wsum = np.zeros(aff[0].shape)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff) - np.sum([np.isnan(a) for a in aff], axis=0)

    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        mat = mat / np.nansum(mat, axis=1, keepdims=True)
        aff[n] = check_symmetric(mat, raise_warning=False)
        # apply KNN threshold to normalized affinity matrix
        Wk[n] = _find_dominate_set(aff[n], int(K))

    # take sum of all normalized (not thresholded) affinity matrices
    Wsum = np.nansum(aff, axis=0)

    for iteration in range(t):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(Wk[n])
            aw = np.nan_to_num(mat)
            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = nzW @ (Wsum - aw) @ nzW.T / (n_aff - 1)  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff[n] = _B0_normalized(aff0, alpha=alpha)

        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff, axis=0)

    # all entries of `aff` should be identical after the fusion procedure
    # dividing by len(aff) is hypothetically equivalent to selecting one
    # however, if fusion didn't converge then this is just average of `aff`
    W = Wsum / len(aff)

    # normalize fused matrix and update diagonal similarity
    W = W / np.nansum(W, axis=1, keepdims=True)  # TODO: / by NaN
    W = (W + W.T + np.eye(len(W))) / 2

    return W
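A hedged numpy sketch of the modified normalization described in the Notes above (a toy stand-in, not the library's internal routine): off-diagonal entries are divided by twice the off-diagonal row sum and the diagonal is set to 1/2, so every row sums to 1.

import numpy as np

def notes_normalization(W):
    # P(i, j) = W(i, j) / (2 * sum_{k != i} W(i, k)) for j != i, and 1/2 on the diagonal
    off_row_sum = W.sum(axis=1) - np.diag(W)
    P = W / (2 * off_row_sum[:, None])
    np.fill_diagonal(P, 0.5)
    return P

rng = np.random.default_rng(0)
W = rng.random((4, 4))
W = (W + W.T) / 2
print(notes_normalization(W).sum(axis=1))   # each row sums to 1.0
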
Example #12
def affinity_matrix(dist, *, K=20, mu=0.5):
    r"""
    Calculates affinity matrix given distance matrix `dist`

    Uses a scaled exponential similarity kernel to determine the weight of each
    edge based on `dist`. Optional hyperparameters `K` and `mu` determine the
    extent of the scaling (see `Notes`).

    You'd probably be best to use :py:func:`snf.compute.make_affinity` instead
    of this, as that command also handles normalizing the inputs and creating
    the distance matrix.

    Parameters
    ----------
    dist : (N, N) array_like
        Distance matrix
    K : (0, N) int, optional
        Number of neighbors to consider. Default: 20
    mu : (0, 1) float, optional
        Normalization factor to scale similarity kernel. Default: 0.5

    Returns
    -------
    W : (N, N) np.ndarray
        Affinity matrix

    Notes
    -----
    The scaled exponential similarity kernel, based on the probability density
    function of the normal distribution, takes the form:

    .. math::

       \mathbf{W}(i, j) = \frac{1}{\sqrt{2\pi\sigma^2}}
                          \exp\left(-\frac{\rho^2(x_{i},x_{j})}{2\sigma^2}\right)

    where :math:`\rho(x_{i},x_{j})` is the Euclidean distance (or other
    distance metric, as appropriate) between patients :math:`x_{i}` and
    :math:`x_{j}`. The value for :math:`\sigma` is calculated as:

    .. math::

       \sigma = \mu\ \frac{\overline{\rho}(x_{i},N_{i}) +
                           \overline{\rho}(x_{j},N_{j}) +
                           \rho(x_{i},x_{j})}
                          {3}

    where :math:`\overline{\rho}(x_{i},N_{i})` represents the average value
    of distances between :math:`x_{i}` and its neighbors :math:`N_{1..K}`,
    and :math:`\mu\in(0, 1)\subset\mathbb{R}`.

    Examples
    --------
    >>> from snf import datasets
    >>> simdata = datasets.load_simdata()

    We need to construct a distance matrix before we can create a similarity
    matrix using :py:func:`snf.compute.affinity_matrix`:

    >>> from scipy.spatial.distance import cdist
    >>> dist = cdist(simdata.data[0], simdata.data[0])

    >>> from snf import compute
    >>> aff = compute.affinity_matrix(dist)
    >>> aff.shape
    (200, 200)
    """

    # check inputs
    dist = check_array(dist, force_all_finite=False)
    dist = check_symmetric(dist, raise_warning=False)

    # get mask for potential NaN values and set diagonals zero
    mask = np.isnan(dist)
    dist[np.diag_indices_from(dist)] = 0

    # sort array and get average distance to K nearest neighbors
    T = np.sort(dist, axis=1)
    TT = np.vstack(T[:, 1:K + 1].mean(axis=1) + np.spacing(1))

    # compute sigma (see equation in Notes)
    sigma = (TT + TT.T + dist) / 3
    msigma = np.ma.array(sigma, mask=mask)  # mask for NaN
    sigma = sigma * np.ma.greater(msigma, np.spacing(1)).data + np.spacing(1)

    # get probability density function with scale = mu*sigma and symmetrize
    scale = (mu * np.nan_to_num(sigma)) + mask
    W = stats.norm.pdf(np.nan_to_num(dist), loc=0, scale=scale)
    W[mask] = np.nan
    W = check_symmetric(W, raise_warning=False)

    return W
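A hedged sketch of the scaled exponential kernel from the Notes, written out directly with numpy/scipy for a toy distance matrix (this mirrors the steps above but skips the NaN handling):

import numpy as np
from scipy import stats
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
X = rng.random((6, 2))
dist = cdist(X, X)
K, mu = 2, 0.5

T = np.sort(dist, axis=1)
TT = T[:, 1:K + 1].mean(axis=1, keepdims=True)  # mean distance to the K nearest neighbors
sigma = (TT + TT.T + dist) / 3                  # per-pair sigma from the Notes
W = stats.norm.pdf(dist, loc=0, scale=mu * sigma)
W = (W + W.T) / 2                               # symmetrize
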
Example #13
pairwise_dist_mat = np.array(df_pairwise_distances.values)
#print (pairwise_dist_mat)
#print (pairwise_dist_mat.shape)
#dists = squareform(pairwise_dist_mat)
#print (dists)
# def check_symmetric(a, rtol=1e-05, atol=1e-08):
#     return np.allclose(a, a.T, rtol=rtol, atol=atol)

df_pairwise_distances.fillna(value=np.nan, inplace=True)
print(df_pairwise_distances)
#print (np.around(df_pairwise_distances.values, 3))
#check_symmetric(np.around(df_pairwise_distances.values, 3), rtol=1e-05, atol=1e-08)
path_todir = os.getcwd()
pairwise_dist_mat = df_pairwise_distances.values
#print(np.where(~np.allclose(pairwise_dist_mat, pairwise_dist_mat.T,rtol=1e-05, atol=1e-08)))
pairwise_distmat_repaired = check_symmetric(pairwise_dist_mat)
print('max error: ', np.amax(np.abs(pairwise_dist_mat - pairwise_dist_mat.T)))
print('max error repaired: ',
      np.amax(pairwise_distmat_repaired - pairwise_distmat_repaired.T))
print(np.around(pairwise_distmat_repaired))
#t = np.around(pairwise_distmat_repaired)
#pd.DataFrame(t).to_excel("round_dist.xlsx")
# pd.DataFrame(pairwise_distmat_repaired).to_excel(pltname+"_repaired.xlsx")
# pd.DataFrame(pairwise_dist_mat).to_excel(pltname+".xlsx")
#df_pairwise_distances.to_excel(pltname+".xlsx")
#os.system("pause")

plt.figure(figsize=(25, 25))
sns.heatmap(df_pairwise_distances, cmap='Blues', linewidth=1)
#plt.show()
plt.savefig('allmondays_distancematrix.png')
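For reference, a hedged sketch of what the "repair" above amounts to: when the input is not symmetric within tolerance, scikit-learn's check_symmetric warns (unless raise_warning=False) and returns the element-wise average of the array and its transpose.

import numpy as np
from sklearn.utils.validation import check_symmetric

A = np.array([[0.0, 1.0],
              [1.2, 0.0]])                     # asymmetric by 0.2
repaired = check_symmetric(A, raise_warning=False)
print(repaired)                                # [[0.  1.1] [1.1 0. ]]
print(np.amax(np.abs(repaired - repaired.T)))  # 0.0
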
Example #14
def Main_function():

    # Data loading stage
    graphs_path = './Code/'
    graph_datasets = RMN.read_graph_pickle(graphs_path)

    # Representation-learning parameter setup
    p = Parameter["p"]
    q = Parameter["q"]
    num_walks = Parameter["num_walks"]
    walk_length = Parameter["walk_length"]
    dimensions = Parameter["dimensions"]
    knei = [10, 15, 20, 25]
    mu = [0.4, 0.5, 0.6]
    for name, dets in graph_datasets.items():
        print("---------------%s---------------" % name)
        wvecs = []

        # Load the training graphs
        nx_graph = dets['train_ng']
        merge_graph = dets['train_mg']

        # Load the test/validation edges and labels
        train_edges = []
        ground_truth = []
        test_edges = dets["test_edges"]
        test_labels = dets["test_labels"]

        # Relabel the nodes in the network; they must be sorted first
        nodes = sorted(list(merge_graph.nodes()))
        if nodes[0] > 0:
            train_edges.extend([[i, e[0] - 1, e[1] - 1, 1]
                                for i in range(len(nx_graph))
                                for e in nx_graph[i].edges()])
            train_merge = nx.relabel_nodes(merge_graph, lambda x: int(x) - 1)
            train_nxgraph = [
                nx.relabel_nodes(g, lambda x: int(x) - 1) for g in nx_graph
            ]
            test_edges = [[e[0] - 1, e[1] - 1] for i in test_edges for e in i]
            nodes = list(train_merge.nodes())
        else:
            train_edges.extend([[i, e[0], e[1], 1]
                                for i in range(len(nx_graph))
                                for e in nx_graph[i].edges()])
            train_nxgraph = copy.deepcopy(nx_graph)
            train_merge = copy.deepcopy(merge_graph)

        # Some node IDs are not consecutive; the statements below make them consecutive
        restru_test_edges = []
        for i in test_edges:
            restru_test_edges.append([[nodes.index(e[0]),
                                       nodes.index(e[1])] for e in i])
        str_graph = nx.relabel_nodes(train_merge, lambda x: str(x))

        # The code below builds the graph type defined by opennet, so existing single-layer algorithms can be used for comparison
        G = opgraph.Graph()
        DG = str_graph.to_directed()
        G.read_g(DG)
        nx_para_graph = []
        for g in train_nxgraph:
            str_graph = nx.relabel_nodes(g, lambda x: str(x))
            G = opgraph.Graph()
            DG = str_graph.to_directed()
            G.read_g(DG)
            nx_para_graph.append(G)

        ############################ Comparison experiments ############################
        #1# merge_network
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                if e[0] in train_merge.nodes() and e[1] in train_merge.nodes():
                    y_pred.append(
                        list(nx.adamic_adar_index(train_merge, [e]))[0][2])
                else:
                    y_pred.append(0)  # if the node does not exist, the predicted probability should be 0
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("merge-network:%f" % (sum(auc) / len(auc)))

        #2# OhmNet: multi-layer network embedding, Bioinformatics 2017
        ohmnet_walks = []
        orignal_walks = []
        LG = copy.deepcopy(train_nxgraph)
        on = ohmnet.OhmNet(LG,
                           p=p,
                           q=q,
                           num_walks=num_walks,
                           walk_length=walk_length,
                           dimension=dimensions,
                           window_size=10,
                           n_workers=8,
                           n_iter=5,
                           out_dir='.')
        for ns in on.embed_multilayer():
            orignal_walks.append(ns)
            on_walks = [n.split("_")[2] for n in ns]
            ohmnet_walks.append([str(step) for step in on_walks])
        Ohmnet_model = Node2vec.N2V.learn_embeddings(ohmnet_walks,
                                                     dimensions,
                                                     workers=5,
                                                     window_size=10,
                                                     niter=5)
        Ohmnet_wvecs = np.array(
            [Ohmnet_model.get_vector(str(i)) for i in nodes])
        y_pred = []
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                if str(e[0]) in Ohmnet_model.index2entity and str(
                        e[1]
                ) in Ohmnet_model.index2entity:  # if either node is missing from the embedding, default the score to 0
                    y_pred.append(
                        cosine_similarity([
                            Ohmnet_model.get_vector(str(e[0])),
                            Ohmnet_model.get_vector(str(e[1]))
                        ])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("ohmnet-network:%f" % (sum(auc) / len(auc)))
        #
        # #3# MNE: scalable multiplex network embedding, IJCAI 2018
        edge_data_by_type = {}
        all_edges = list()
        all_nodes = list()
        for e in train_edges:
            if e[0] not in edge_data_by_type:
                edge_data_by_type[e[0]] = list()
            edge_data_by_type[e[0]].append((e[1], e[2]))
            all_edges.append((e[1], e[2]))
            all_nodes.append(e[1])
            all_nodes.append(e[2])
        all_nodes = list(set(all_nodes))
        all_edges = list(set(all_edges))
        edge_data_by_type['Base'] = all_edges
        MNE_model = MNE.train_model(edge_data_by_type)
        local_model = dict()

        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for pos in range(len(MNE_model['index2word'])):
                local_model[MNE_model['index2word']
                            [pos]] = MNE_model['base'][pos] + 0.5 * np.dot(
                                MNE_model['addition'][index][pos],
                                MNE_model['tran'][index])
            for e in layer:
                if str(e[0]) in MNE_model['index2word'] and str(
                        e[1]
                ) in MNE_model['index2word']:  # if either node is missing from the embedding, default the score to 0
                    y_pred.append(
                        cosine_similarity(
                            [local_model[str(e[0])],
                             local_model[str(e[1])]])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("MNE:%f" % (sum(auc) / len(auc)))

        #4# The three PMNE variants
        merged_networks = dict()
        merged_networks['training'] = dict()
        merged_networks['test_true'] = dict()
        merged_networks['test_false'] = dict()
        for index, g in enumerate(train_nxgraph):
            merged_networks['training'][index] = set(g.edges())
            merged_networks['test_true'][index] = restru_test_edges[index]
            merged_networks['test_false'][index] = test_edges[index][
                len(test_edges):]

        performance_1, performance_2, performance_3 = main.Evaluate_PMNE_methods(
            merged_networks)
        print("PMNE(n):%f" % (performance_1))
        print("PMNE(r):%f" % (performance_2))
        print("MNE(c):%f" % (performance_3))

        #5# MELL: node representation learning for multi-layer networks, WWW 2018
        L = len(nx_graph)
        N = max([int(n) for n in train_merge.nodes()]) + 1
        N = max(N, train_merge.number_of_nodes())  # the adjacency matrix needs a consistent row dimension
        directed = True
        d = 128
        k = 3
        lamm = 10
        beta = 1
        gamma = 1
        MELL_wvecs = MELL_model(L, N, directed, train_edges, d, k, lamm, beta,
                                gamma)
        MELL_wvecs.train(30)  # previously 500, but 500 raises errors on some datasets, so use 30
        auc = []
        for index, layer in enumerate(restru_test_edges):
            y_pred = []
            for e in layer:
                if e[0] in all_nodes and e[
                        1] in all_nodes:  # if either node is missing, default the score to 0
                    y_pred.append(MELL_wvecs.predict((index, e[0], e[1])))
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("MELL:%f" % (sum(auc) / len(auc)))

        #6# Basic similarity baselines: CN, JC, AA
        auc1 = []
        auc2 = []
        auc3 = []
        for index, layer in enumerate(restru_test_edges):
            y_pred_cn = []
            y_pred_jc = []
            y_pred_AA = []
            for e in layer:
                if e[0] in train_nxgraph[index].nodes(
                ) and e[1] in train_nxgraph[index].nodes():
                    y_pred_cn.append(
                        len(
                            list(
                                nx.common_neighbors(train_nxgraph[index], e[0],
                                                    e[1]))))
                    y_pred_jc.append(
                        list(nx.jaccard_coefficient(train_nxgraph[index],
                                                    [e]))[0][2])
                    y_pred_AA.append(
                        list(nx.adamic_adar_index(train_nxgraph[index],
                                                  [e]))[0][2])
                else:
                    y_pred_cn.append(0)  # if the node is absent, it has no common neighbors
                    y_pred_jc.append(0)
                    y_pred_AA.append(0)

            auc1.append(roc_auc_score(test_labels[index], y_pred_cn))  # compute the AUC
            auc2.append(roc_auc_score(test_labels[index], y_pred_jc))
            auc3.append(roc_auc_score(test_labels[index], y_pred_AA))
        print("CN-network:%f" % (sum(auc1) / len(auc1)))
        print("JC-network:%f" % (sum(auc2) / len(auc2)))
        print("AA-network:%f" % (sum(auc3) / len(auc3)))

        #7# Single-layer Node2vec
        auc = []
        for index, G in enumerate(nx_para_graph):
            model_nf = node2vec.Node2vec(G,
                                         walk_length,
                                         num_walks,
                                         dimensions,
                                         p=p,
                                         q=q,
                                         dw=True)
            index_num = sorted([int(i) for i in model_nf.vectors.keys()])
            g_embedding = [model_nf.vectors[str(i)] for i in index_num]
            y_pred = []
            for e in restru_test_edges[index]:
                if str(e[0]) in G.G.nodes() and str(
                        e[1]) in G.G.nodes():  # if either node is missing from the embedding, default the score to 0
                    y_pred.append(
                        cosine_similarity([
                            model_nf.vectors[str(e[0])],
                            model_nf.vectors[str(e[1])]
                        ])[0][1])
                else:
                    y_pred.append(0)
            auc.append(roc_auc_score(test_labels[index], y_pred))
        print("Node2vec: %f" % (sum(auc) / len(auc)))

        #7# Network + Embedding (N2V) + SNF4st network representation learning
        for k in knei:
            for m in mu:
                auc_final = []
                for i in range(2, 10):  # repeat to average over runs
                    # args: the KNN K value, mu, the K used in other steps, and the iteration count; 20 iterations usually suffice for convergence
                    network, groundtruth, best, second = NFC.cluster_E(
                        nx_para_graph, ground_truth, Parameter, nodes, k, m, k,
                        30)  # CKM\V(20, 0.5, 20, 20)
                    Network_Adj = _find_dominate_set(
                        check_symmetric(network, raise_warning=False),
                        K=k)  # build the adjacency matrix from the fused similarity matrix; CKM(20), Vickers(15)
                    g = nx.from_numpy_matrix(Network_Adj)  # build the graph from the adjacency matrix
                    auc = []
                    for index, layer in enumerate(restru_test_edges):
                        y_pred = []
                        for e in layer:
                            if e[0] in train_nxgraph[index].nodes(
                            ) and e[1] in train_nxgraph[index].nodes():
                                y_pred.append(
                                    list(
                                        nx.adamic_adar_index(
                                            g, [
                                                (nodes.index(
                                                    e[0]), nodes.index(e[1]))
                                            ]))[0][2])  # score the test pair with Adamic-Adar similarity
                            else:
                                y_pred.append(0)
                        auc.append(roc_auc_score(test_labels[index],
                                                 y_pred))  # compute the AUC
                    auc_final.append(sum(auc) / len(auc))
                value = max(auc_final)
                average = sum(auc_final) / len(auc_final)
                print("K=%d Mu=%.2f Max:index({%d})->%f" %
                      (k, m, auc_final.index(value), value))
                print("K=%d Mu=%f Ave:->%f" % (k, m, average))
Example #15
def my_mds_plot():

    data = pd.io.parsers.read_csv(  # pandas handles this more conveniently
        'pca_counts.csv',
        header="infer"  # the first row contains the recipients' names
    )
    if (DEBUG):
        data = pd.io.parsers.read_csv(  # pandas handles this more conveniently
            'pca_counts_min.csv',
            header="infer"  # the first row contains the recipients' names
        )
        print("Data\n", data)

    dissM = data.values[:, 1:]  #the first column contains the suppliers names
    countries = data.values[:, 0]

    #amax = np.amax(dissM)
    #dissM /= amax          #you can uncomment  these lines to plot MDS in a normalized 2D

    dissM = np.array(dissM, dtype=float)
    #Repair dissM which is not symmetric
    dissM = check_symmetric(dissM)

    if (DEBUG):
        print(dissM)
        with open("log.txt", "w") as out:
            out.write("".join([x for x in countries]))
            out.write("\n".join([str(x) for x in dissM]))

    mds = manifold.MDS(n_components=2,
                       n_init=20,
                       metric=True,
                       max_iter=3000,
                       eps=1e-9,
                       dissimilarity="precomputed",
                       random_state=RS)
    mds_fit = mds.fit(dissM)
    pos = mds_fit.embedding_
    stress = mds_fit.stress_

    plt.scatter(pos[:, 0], pos[:, 1], marker='o')
    for label, x, y in zip(countries, pos[:, 0], pos[:, 1]):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(-20, 20),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     bbox=dict(boxstyle='round,pad=0.3',
                               fc='yellow',
                               alpha=0.1),
                     arrowprops=dict(arrowstyle='->',
                                     connectionstyle='arc3,rad=0'))

    plt.show()
    print(stress)

    # from sklearn import datasets
    # digits = datasets.load_digits()
    # print(digits)

    # tsne = manifold.TSNE(n_components=2,random_state=2).fit_transform(dissM)
    # plt.scatter(tsne,tsne,3, marker = 'o')
    # for label, x, y in zip(countries, pos[:, 0], pos[:, 1]):
    #     plt.annotate(
    #         label,
    #         xy = (x, y), xytext = (-20, 20),
    #         textcoords = 'offset points', ha = 'right', va = 'bottom',
    #         bbox = dict(boxstyle = 'round,pad=0.3', fc = 'yellow', alpha = 0.1),
    #         arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

    plt.show()
Example #16
def plot_partial_corrcoef(partial_corrcoef,
                          ax=None,
                          cbar=True,
                          figsize=None,
                          filename=None,
                          title='Partial correlation',
                          **kwargs):
    """Plot the partial correlation coefficient matrix.

    Parameters
    ----------
    partial_corrcoef : array-like of shape (n_features, n_features)
        Partial correlation coefficient matrix.

    ax : matplotlib Axes, default None
        Target axes instance.

    cbar : bool, default True.
        If True, draw a colorbar.

    figsize : tuple, default None
        Tuple denoting figure size of the plot.

    filename : str, default None
        If provided, save the current figure.

    title : string, default 'Partial correlation'
        Axes title. To disable, pass None.

    **kwargs : dict
        Other keywords passed to ``ax.pcolormesh``.

    Returns
    -------
    ax : matplotlib Axes
        Axes on which the plot was drawn.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from kenchi.plotting import plot_partial_corrcoef
    >>> from sklearn.datasets import make_sparse_spd_matrix
    >>> A = make_sparse_spd_matrix(dim=20, norm_diag=True, random_state=0)
    >>> plot_partial_corrcoef(A) # doctest: +ELLIPSIS
    <matplotlib.axes._subplots.AxesSubplot object at 0x...>
    >>> plt.show() # doctest: +SKIP

    .. figure:: images/plot_partial_corrcoef.png
    """

    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    partial_corrcoef = check_array(partial_corrcoef)
    partial_corrcoef = check_symmetric(partial_corrcoef, raise_exception=True)

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title)

    # Set default keyword arguments for pcolormesh
    kwargs.setdefault('cmap', 'RdBu')
    kwargs.setdefault('edgecolors', 'white')
    kwargs.setdefault('vmin', -1.)
    kwargs.setdefault('vmax', 1.)

    # Draw the heatmap
    mesh = ax.pcolormesh(np.ma.masked_equal(partial_corrcoef, 0.), **kwargs)

    ax.set_aspect('equal')
    ax.set_facecolor('grey')

    # Invert the y axis to show the plot in matrix form
    ax.invert_yaxis()

    if cbar:
        # Create an axes on the right side of ax
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', '5%', pad=0.1)

        ax.get_figure().colorbar(mesh, cax=cax)

    if filename is not None:
        ax.get_figure().savefig(filename)

    return ax
Example #17
def snf2_np(*aff, numofCom, K=20, t=20, alpha=1.0):
    """
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    *aff : (N, N) array_like
        Input similarity arrays; all arrays should be square but need not be of equal size.
        Note: these arrays must have all the common samples appeared first in the matrix

    numofCom: int, required
        Number of common samples across all the matrices

    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20

    t : int, optional
        Number of iterations to perform information swapping. Default: 20

    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W: (N, N) Output similarity arrays
        Fused similarity networks of input arrays
    """
    print("Start applying diffusion!")
    aff = _check_SNF2_inputs(aff)
    newW = [0] * len(aff)
    aff_com = [0] * len(aff)

    # First, normalize different networks to avoid scale problems
    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        # mat = mat / np.nansum(mat, axis=1, keepdims=True)
        mat = _stable_normalized(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)
        aff_com[n] = aff[n][0:numofCom, :][:, 0:numofCom]
        # apply KNN threshold to normalized affinity matrix
        # We need to crop the intersecting samples from newW matrices
        newW[n] = _find_dominate_set(aff[n], int(K))
        newW[n] = newW[n][:, 0:numofCom]

    # take sum of all normalized (not thresholded) affinity matrices of the intersections part
    Wsum = np.nansum(aff_com, axis=0)

    # get number of modalities informing each subject x subject affinity
    n_aff = len(aff_com) - np.sum([np.isnan(a) for a in aff_com], axis=0)

    for iteration in range(t):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = np.nan_to_num(newW[n])
            mat = mat[0:numofCom, :][:, 0:numofCom]
            aw = np.nan_to_num(mat)
            # propagate `Wsum` through masked affinity matrix (`nzW`)
            aff0 = np.matmul(np.matmul(nzW, (Wsum - aw) / (n_aff - 1)),
                             nzW.T)  # TODO: / by 0
            # ensure diagonal retains highest similarity
            aff[n] = _B0_normalized(aff0, alpha=alpha)
            aff_com[n] = aff[n][0:numofCom, :][:, 0:numofCom]

        # compute updated sum of normalized affinity matrices
        Wsum = np.nansum(aff_com, axis=0)

    for n, mat in enumerate(aff):
        mat = _stable_normalized(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

    return aff
Example #18
def snf2(args, aff, dicts_common, dicts_unique, original_order):
    """
    Performs Similarity Network Fusion on `aff` matrices

    Parameters
    ----------
    aff : (N, N) pandas dataframe
        Input similarity arrays; all arrays should be square but need not be of equal size.

    dicts_common: dictionaries, required
        Dictionaries for getting common samples from different views
        Example: dicts_common[(0, 1)] == dicts_common[(1, 0)], meaning the common patients between view 1&2

    dicts_unique: dictionaries, required
        Dictionaries for getting unique samples for different views
        Example: dicts_unique[(0, 1)], meaning the unique samples from view 1 that are not in view 2
                 dicts_unique[(1, 0)], meaning the unique samples from view 2 that are not in view 1

    original_order: lists, required
        The original order of each view

    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. Default: 20

    t : int, optional
        Number of iterations to perform information swapping. Default: 20

    alpha : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. Default: 1.0

    Returns
    -------
    W: (N, N) Output similarity arrays
        Fused similarity networks of input arrays
    """

    print("Start applying diffusion! with new method")

    start_time = time.time()

    newW = [0] * len(aff)

    # First, normalize different networks to avoid scale problems, it is compatible with pandas dataframe
    for n, mat in enumerate(aff):
        # normalize affinity matrix based on strength of edges
        # mat = mat / np.nansum(mat, axis=1, keepdims=True)
        mat = _stable_normalized_pd(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

        # apply KNN threshold to normalized affinity matrix
        # We need to crop the intersecting samples from newW matrices
        neighbor_size = min(int(args.neighbor_size), aff[n].shape[0])
        newW[n] = _find_dominate_set(aff[n], neighbor_size)

    for iteration in range(args.fusing_iteration):
        for n, mat in enumerate(aff):
            # temporarily convert nans to 0 to avoid propagation errors
            nzW = newW[n]  # TODO: not sure this is a deep copy or not

            # Your goal is to update aff[n], but it is the average of all the diffused matrices.
            # Make a copy of aff[n], and set it to 0
            aff0_copy = aff[n].copy()
            for col in aff0_copy.columns:
                aff0_copy[col].values[:] = 0

            for j, mat_tofuse in enumerate(aff):
                if n == j:
                    continue

                # reorder mat_tofuse to have the common samples
                mat_tofuse = mat_tofuse.reindex(
                    (sorted(dicts_common[(j, n)]) +
                     sorted(dicts_unique[(j, n)])),
                    axis=1,
                )
                mat_tofuse = mat_tofuse.reindex(
                    (sorted(dicts_common[(j, n)]) +
                     sorted(dicts_unique[(j, n)])),
                    axis=0,
                )

                # Next, let's crop mat_tofuse
                num_common = len(dicts_common[(n, j)])
                to_drop_mat = mat_tofuse.columns[num_common:mat_tofuse.
                                                 shape[1]].values.tolist()
                mat_tofuse_crop = mat_tofuse.drop(to_drop_mat, axis=1)
                mat_tofuse_crop = mat_tofuse_crop.drop(to_drop_mat, axis=0)

                # Next, add the similarity from the view to fused to the current view identity matrix
                nzW_identity = pd.DataFrame(
                    data=np.identity(nzW.shape[0]),
                    index=original_order[n],
                    columns=original_order[n],
                )
                mat_tofuse_union = nzW_identity + mat_tofuse_crop
                mat_tofuse_union.fillna(0.0, inplace=True)
                mat_tofuse_union = _stable_normalized_pd(mat_tofuse_union)
                mat_tofuse_union = mat_tofuse_union.reindex(original_order[n],
                                                            axis=1)
                mat_tofuse_union = mat_tofuse_union.reindex(original_order[n],
                                                            axis=0)

                # Now we are ready to do the diffusion
                nzW_T = np.transpose(nzW)
                aff0_temp = nzW.dot(mat_tofuse_union.dot(
                    nzW_T))  # Matmul is not working, but .dot() is good

                aff0_temp = _B0_normalized(aff0_temp,
                                           alpha=args.normalization_factor)

                aff0_copy = np.add(aff0_temp, aff0_copy)

            aff[n] = np.divide(aff0_copy, len(aff) - 1)

    for n, mat in enumerate(aff):
        mat = _stable_normalized_pd(mat)
        aff[n] = check_symmetric(mat, raise_warning=False)

    end_time = time.time()
    print("Diffusion ends! Times: {}s".format(end_time - start_time))
    return aff