def test_neighbors_dexter(hubness_param, metric):
    hubness, param = hubness_param
    X, y = load_dexter()

    # Hubness in standard spaces
    hub = Hubness(k=10, metric=metric)
    hub.fit(X)
    k_skew_orig = hub.score()

    # Hubness in secondary distance spaces (after hubness reduction)
    graph = kneighbors_graph(X,
                             n_neighbors=10,
                             metric=metric,
                             hubness=hubness,
                             hubness_params=param)
    hub = Hubness(k=10, metric='precomputed')
    hub.fit(graph)
    k_skew_hr = hub.score(has_self_distances=True)

    assert k_skew_hr < k_skew_orig * 8 / 10, \
        f'k-occurrence skewness was not reduced by at least 20% for dexter with {hubness}'
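
# A minimal sketch of the parametrization this test expects (the fixture
# values are assumptions, mirroring the hubness methods used elsewhere in
# these examples):
#
# @pytest.mark.parametrize('metric', ['euclidean', 'cosine'])
# @pytest.mark.parametrize('hubness_param', [('mp', {'method': 'normal'}),
#                                            ('ls', None),
#                                            ('ls', {'method': 'nicdm'}),
#                                            ('dsl', None)])
# def test_neighbors_dexter(hubness_param, metric): ...
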
def hub_reduction(
        input_points,
        methods=None,
        k=k):
    # Avoid a mutable default argument: build the method dict per call.
    if methods is None:
        methods = {
            'nothing': (None, None),
            'mp_normal': ('mp', {'method': 'normal'}),
            'ls': ('ls', None),
            'ls_nicdm': ('ls', {'method': 'nicdm'}),
            'dsl': ('dsl', None),
        }
    samples_reduced = dict()
    for method_name, (hubness, hubness_params) in tqdm(methods.items()):
        samples_reduced[method_name] = kneighbors_graph(
            input_points,
            n_neighbors=k,
            hubness=hubness,
            hubness_params=hubness_params)
    return samples_reduced
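
# A minimal usage sketch (illustrative only; assumes numpy as np, tqdm, and
# the module-level `k` referenced by the signature's default):
# rng = np.random.RandomState(0)
# demo_points = rng.normal(size=(200, 50))
# reduced = hub_reduction(demo_points, k=10)
# reduced['mp_normal']  # sparse kNN graph after mutual-proximity reduction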

# Example #3

def generate_triplets(X,
                      n_inlier,
                      n_outlier,
                      n_random,
                      fast_trimap=True,
                      weight_adj=True,
                      verbose=True,
                      hub='mp'):
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 150), n)

    # if hub == 'mp_app':
    #     # D = euclidean_distance(X)
    #     n = X.shape[0]
    #     D_mp = SuQHR(n_samples=n-1).fit_transform(X)
    #     print("kjk", D_mp.shape)
    #
    #     # make knn graph
    #     distances, nbrs = KNN_Info(D_mp, n_extra)
    #
    #     if verbose:
    #         print("hubness reduction with {}".format(hub))

    if hub == 'mp1':  # use hubness reduction only for triplet selection
        neighbour_graph = kneighbors_graph(X,
                                           n_neighbors=n_extra,
                                           mode='distance',
                                           hubness='mutual_proximity',
                                           hubness_params={'method': 'normal'})
        nbrs = neighbour_graph.indices.astype(int).reshape(
            (X.shape[0], n_extra))
        # distances = neighbour_graph.data.reshape((X.shape[0], n_extra))

        nbr_idx = nbrs.tolist()

        # recover primary (euclidean) distances to the selected neighbours
        D = euclidean_distance(X)
        distances = np.array([D[i][nbr_idx[i]] for i in range(D.shape[0])])

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'mp2':  # use similarity P = 1 - D_mp

        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')

        # make knn graph
        distances, nbrs = KNN_Info(D_mp, n_extra)

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'mp3_gauss':  # compute similarities from secondary distances
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')

        del D
        gc.collect()

        # make knn graph
        distances, nbrs = KNN_Info(D_mp, n_extra)

        # neigbour_graph = sknn(X, n_neighbors=n_extra, mode='distance')
        # nbrs = neigbour_graph.indices.astype(int).reshape((X.shape[0], n_extra))
        # distances = neigbour_graph.data.reshape((X.shape[0], n_extra))

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'mp3_emp':  # compute similarities from secondary distances
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling._mutual_proximity_empiric_full(
            D=D, metric='distance')

        # make knn graph
        distances, nbrs = KNN_Info(D_mp, n_extra)

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'mp4':  # original comment: "mystery"; like mp1 but keeps the secondary distances
        neighbour_graph = kneighbors_graph(X,
                                           n_neighbors=n_extra,
                                           mode='distance',
                                           hubness='mutual_proximity',
                                           hubness_params={'method': 'normal'})
        nbrs = neighbour_graph.indices.astype(int).reshape(
            (X.shape[0], n_extra))
        distances = neighbour_graph.data.reshape((X.shape[0], n_extra))

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'ls1':
        neighbour_graph = kneighbors_graph(X,
                                           n_neighbors=n_extra,
                                           mode='distance',
                                           hubness='local_scaling')
        nbrs = neighbour_graph.indices.astype(int).reshape(
            (X.shape[0], n_extra))
        # distances = neighbour_graph.data.reshape((X.shape[0], n_extra))

        nbr_idx = nbrs.tolist()

        # recover primary (euclidean) distances to the selected neighbours
        D = euclidean_distance(X)
        distances = np.array([D[i][nbr_idx[i]] for i in range(D.shape[0])])

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'ls2':
        D = euclidean_distance(X)
        D_ls = hub_toolbox.local_scaling.local_scaling(D=D,
                                                       k=10,
                                                       metric='distance')

        # make knn graph
        distances, nbrs = KNN_Info(D_ls, n_extra)

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'dsl':
        neighbour_graph = kneighbors_graph(X,
                                           n_neighbors=n_extra,
                                           mode='connectivity',
                                           hubness='dsl')
        nbrs = neighbour_graph.indices.astype(int).reshape(
            (X.shape[0], n_extra))
        # nbr_idx = neighbour_graph.data.reshape((X.shape[0], n_extra))
        nbr_idx = nbrs.tolist()

        # recover primary (euclidean) distances to the selected neighbours
        D = euclidean_distance(X)
        distances = np.array([D[i][nbr_idx[i]] for i in range(D.shape[0])])

        # D = np.empty((X.shape[0], n_extra), dtype=np.float64)
        # for i in range(X.shape[0]):
        #     for j in range(n_extra):
        #         D[i, j] = euclid_dist(X[i, :], X[nbrs[i][j]])
        #         np.sqrt(np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :]) ** 2))

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif hub == 'mutual':
        # D = euclidean_distance(X)
        # # make knn graph
        # _, nbrs = KNN_Info(D_mp, n_extra)

        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)

        nbrs = make_mutual(nbrs)
        # a = nbrs == X.shape[0] + 1
        # print(a)

    elif hub in ('SNN1', 'SNN2'):
        D = euclidean_distance(X)
        D_snn = hub_toolbox.shared_neighbors.shared_nearest_neighbors(
            D=D, metric='distance')

        # snn = shared_neighbors(k=10, metric='euclidean')
        # D_snn = snn.fit_tr(X)

        # make knn graph
        distances, nbrs = KNN_Info(D_snn, n_extra)

        if verbose:
            print("hubness reduction with {}".format(hub))

    elif exact:  # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)

        # print(nbrs)
    elif fast_trimap:  # use annoy
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(10)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij)
            nbrs[i, :] = nbrs[i, sort_indices]
            # for j in range(n_extra):
            #     distances[i,j] = tree.get_distance(i, nbrs[i,j])
            distances[i, :] = dij[sort_indices]
    else:
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        nbrs[:, :n_bf] = nbrs_bf
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(100)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, n_bf:] = tree.get_nns_by_item(i, n_extra - n_bf)
            unique_nn = np.unique(nbrs[i, :])
            n_unique = len(unique_nn)
            nbrs[i, :n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i, :n_unique] = nbrs[i, sort_indices]
            distances[i, :n_unique] = dij[sort_indices]
    if verbose:
        print("found nearest neighbors")
    # if hub == 'ls':
    # #     sig = np.array([1.]*X.shape[0])
    # else:
    if hub == 'mp2':
        P = 1 - distances  # (n, k)

    # elif hub == 'mp3':
    #     sig = np.median(D_mp[np.triu_indices(D_mp.shape[0], k=1)])
    #     sig = np.array([sig] * D_mp.shape[0])
    #     P = find_p(distances, sig, nbrs)

    else:
        sig = np.maximum(np.mean(distances[:, 10:20], axis=1),
                         1e-20)  # scale parameter
        P = find_p(distances, sig, nbrs)
    # if hub == 'ls':
    #     P = -np.log(P)
    #     P = np.sqrt(P)
    #     P = 1 - P
    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    print("tri_shape", triplets[0], triplets[0][2])
    n_triplets = triplets.shape[0]
    # if hub == 'mp':
    #     outlier_dist

    # if not hub == 'mp':
    #
    outlier_dist = np.empty(n_triplets, dtype=np.float64)
    # if hub == 'mp':
    #     for t in range(n_triplets):
    #         outlier_dist[t] = D_mp[triplets[t][0], triplets[t][2]]
    # el

    if hub in ('mp2', 'SNN1', 'ls2'):
        pass

    elif hub == 'mp3_gauss' or hub == 'mp3_emp':
        for t in range(n_triplets):
            outlier_dist[t] = D_mp[triplets[t][0], triplets[t][2]]

    elif hub == 'SNN2':
        for t in range(n_triplets):
            outlier_dist[t] = D_snn[triplets[t][0], triplets[t][2]]

    elif exact or not fast_trimap:
        for t in range(n_triplets):
            outlier_dist[t] = np.sqrt(
                np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :])**2))
    else:
        for t in range(n_triplets):
            outlier_dist[t] = euclid_dist(X[triplets[t, 0], :],
                                          X[triplets[t, 2], :])
            # outlier_dist[t] = tree.get_distance(triplets[t,0], triplets[t,2])

    if hub in ('mp2', 'SNN1', 'ls2'):
        if hub == 'SNN1':
            D_mp = D_snn
        elif hub == 'ls2':
            D_mp = D_ls

        n_triplets = triplets.shape[0]
        weights = np.empty(n_triplets, dtype=np.float64)
        print("P and triplets' shape", triplets)
        P = 1 - D_mp  # (n, n)
        for t in range(n_triplets):
            i = triplets[t, 0]
            p_sim = P[i, triplets[t, 1]]
            p_out = P[i, triplets[t, 2]]
            if p_out < 1e-20:
                p_out = 1e-20
            weights[t] = p_sim / p_out
    else:
        weights = find_weights(triplets, P, nbrs, outlier_dist, sig)

    if hub == 'weight':
        deg, mean_deg, var_deg = calculate_deg(nbrs)
        var_deg = max(var_deg, 1e-20)
        # hubness_score = (deg - mean_deg) / var_deg
        # hs_med = np.mean(hubness_score)
        hs_med = np.median(deg)
        hub_weights = np.exp(-deg / hs_med)
        # hub_weights = np.exp(- hubness_score)

        # print(hubness_score)

        m = hub_weights.shape[0]
        n_per_anchor = n_inlier * n_outlier

        # scale each anchor's n_inlier * n_outlier triplet weights by its hub weight
        for i in range(m):
            start = i * n_per_anchor
            weights[start:start + n_per_anchor] *= hub_weights[i]

    print('out_dist: ', outlier_dist)

    if n_random > 0:
        if hub in ('mp2', 'SNN1', 'ls2'):
            rand_triplets = sample_random_triplets(X, n_random,
                                                   P=P)  # P: (n, n)

        else:
            rand_triplets = sample_random_triplets(X, n_random, sig=sig)

        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        if not isinstance(weight_adj, (int, float)):
            weight_adj = 400.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
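
# A minimal usage sketch (illustrative; assumes the helpers used above --
# euclidean_distance, KNN_Info, find_p, sample_knn_triplets, find_weights --
# are importable and X is an (n_samples, n_features) numpy array):
# triplets, weights = generate_triplets(X, n_inlier=10, n_outlier=5,
#                                       n_random=3, hub='mp1')
# triplets[:, 0] are anchors, triplets[:, 1] inliers, triplets[:, 2] outliers.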

# Example #4

def viz_analysis(
    adata,
    do_norm,
    norm_scale,
    do_log,
    do_pca,
    n_clusters,
    metric,
    weighted,  # use a weighted adjacency matrix for louvain/leiden clustering?
    seed,
    n_comps,
    clustering_algo,
):
    hubness_methods = {
        'nothing': (None, None),
        'mp_normal': ('mp', {
            'method': 'normal'
        }),
        'ls': ('ls', None),
        'ls_nicdm': ('ls', {
            'method': 'nicdm'
        }),
        'dsl': ('dsl', None)
    }
    start0 = time.time()
    ### preprocess, prepare clustering input ###
    if isinstance(do_norm, str):
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == 'seurat':
            recipe_seurat(adata, do_log, norm_scale)
            # print(f'\t\tseurat norm retained {adata.X.shape[1]} genes')
        elif do_norm == 'duo':
            recipe_duo(adata, do_log, renorm=norm_scale)
            # print(f'\t\tduo norm retained {adata.X.shape[1]} genes')
        else:
            raise ValueError("do_norm not in duo, seurat")
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
    if do_log and not isinstance(do_norm, str):
        # print('\t\tlog_transformed data')
        sc.pp.log1p(adata)
    if do_pca:
        use_rep = 'X_pca'
        sc.tl.pca(adata,
                  n_comps=min(adata.X.shape[1] - 1, min(len(adata.X) - 1,
                                                        500)))
        original1 = adata.obsm['X_pca']
        sc.tl.pca(adata,
                  n_comps=min(adata.X.shape[1] - 1,
                              min(len(adata.X) - 1, n_comps)))
        X = adata.obsm['X_pca']
    else:
        # print('pca not done!')
        use_rep = 'X'
        X = adata.X
    n_neighbors = int(np.sqrt(X.shape[0]))
    print('\t\t\tPreprocessing done:', round((time.time() - start0) / 60, 2),
          'mn')
    ### Hub reduction and clustering ###
    start = time.time()
    all_adata = dict()
    for kernel in ['umap', 'gauss']:
        all_adata[kernel] = adata.copy()
        try:
            sc.pp.neighbors(all_adata[kernel],
                            n_neighbors=n_neighbors + 1,
                            metric=metric,
                            use_rep=use_rep,
                            method=kernel)
        except Exception:
            # fall back to a dense (knn=False) neighbour computation
            sc.pp.neighbors(all_adata[kernel],
                            n_neighbors=n_neighbors + 1,
                            metric=metric,
                            use_rep=use_rep,
                            method=kernel,
                            knn=False)
        G, weights = generate_clustering_inputs(X=X,
                                                metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted,
                                                seed=seed,
                                                hubness=None,
                                                hubness_params=None)
        resol, weighted = getNclusters(all_adata[kernel],
                                       G,
                                       n_clusters=n_clusters,
                                       seed=seed,
                                       clustering_algo=clustering_algo,
                                       flavor='scanpy',
                                       weights=weights)
        if clustering_algo == "leiden":
            sc.tl.leiden(all_adata[kernel],
                         resolution=resol,
                         use_weights=weighted,
                         random_state=seed)
        elif clustering_algo == "louvain":
            sc.tl.louvain(all_adata[kernel],
                          resolution=resol,
                          use_weights=weighted,
                          random_state=seed)
            sc.pl.paga(all_adata[kernel],
                       show=False,
                       random_state=seed,
                       plot=False)
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_adata[method_name] = adata.copy()
        all_adata[method_name].obsp['connectivities'] = kneighbors_graph(
            X,
            n_neighbors=n_neighbors,
            hubness=hubness,
            hubness_params=hubness_params,
            metric=metric,
            mode="connectivity")
        all_adata[method_name].obsp['distances'] = kneighbors_graph(
            X,
            n_neighbors=n_neighbors,
            hubness=hubness,
            hubness_params=hubness_params,
            metric=metric,
            mode="distance")
        all_adata[method_name].uns['neighbors'] = {
            'connectivities_key': 'connectivities',
            'distances_key': 'distances',
            'params': {
                'n_neighbors': n_neighbors,
                'method': 'umap',
                'metric': metric
            }
        }
        G, weights = generate_clustering_inputs(X=X,
                                                metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted,
                                                seed=seed,
                                                hubness=hubness,
                                                hubness_params=hubness_params)
        resol, weighted = getNclusters(all_adata[method_name],
                                       G,
                                       n_clusters=n_clusters,
                                       seed=seed,
                                       clustering_algo=clustering_algo,
                                       flavor='base',
                                       weights=weights)
        if clustering_algo == "louvain":
            clus = np.array(
                louvain.find_partition(
                    graph=G,
                    partition_type=louvain.RBConfigurationVertexPartition,
                    weights=weights,
                    resolution_parameter=resol,
                    seed=seed).membership)
            all_adata[method_name].obs['louvain'] = pd.Categorical(
                values=clus.astype('U'),
                categories=natsorted(map(str, np.unique(clus))),
            )
        elif clustering_algo == "leiden":
            clus = np.array(
                leidenalg.find_partition(
                    graph=G,
                    partition_type=leidenalg.RBConfigurationVertexPartition,
                    weights=weights,
                    resolution_parameter=resol,
                    seed=seed).membership)
            all_adata[method_name].obs['leiden'] = pd.Categorical(
                values=clus.astype('U'),
                categories=natsorted(map(str, np.unique(clus))),
            )
    # original0 = adata.X
    # original2 = adata.obsm['X_pca'][:, :2]
    print('\t\t\tHubness and PAGA full pipeline:',
          round((time.time() - start) / 60, 2), 'mn')
    ### tSNE embedding ###
    start = time.time()
    tsne = sklearn.manifold.TSNE(n_components=2,
                                 metric='precomputed',
                                 random_state=seed,
                                 perplexity=50.0)
    q_tsne = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        all_adata[method_name].obsm['X_tsne'] = tsne.fit_transform(
            all_adata[method_name].obsp['distances'].toarray())
        # q_tsne[0, idx] = QDM(original0, all_adata[method_name].obsm['X_tsne'], metric)
        q_tsne[0, idx] = QDM(original1, all_adata[method_name].obsm['X_tsne'],
                             metric)
        # q_tsne[2, idx] = QDM(original2, all_adata[method_name].obsm['X_tsne'], metric)
        # q_tsne[3, idx] = QNP(original0, all_adata[method_name].obsm['X_tsne'], metric, n_neighbors)
        q_tsne[1, idx] = QNP(original1, all_adata[method_name].obsm['X_tsne'],
                             metric, n_neighbors)
        # q_tsne[5, idx] = QNP(original2, all_adata[method_name].obsm['X_tsne'], metric, n_neighbors)
    print('\t\t\ttSNE embedding pipeline:', round((time.time() - start) / 60,
                                                  2), 'mn')
    ### UMAP embedding ###
    start = time.time()
    umap = UMAP(n_components=2, metric='precomputed', random_state=seed)
    q_umap = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        all_adata[method_name].obsm['X_umap_'] = umap.fit_transform(
            all_adata[method_name].obsp['distances'].toarray())
        # q_umap[0, idx] = QDM(original0, all_adata[method_name].obsm['X_umap_'], metric)
        q_umap[0, idx] = QDM(original1, all_adata[method_name].obsm['X_umap_'],
                             metric)
        # q_umap[2, idx] = QDM(original2, all_adata[method_name].obsm['X_umap_'], metric)
        # q_umap[3, idx] = QNP(original0, all_adata[method_name].obsm['X_umap_'], metric, n_neighbors)
        q_umap[1, idx] = QNP(original1, all_adata[method_name].obsm['X_umap_'],
                             metric, n_neighbors)
        # q_umap[5, idx] = QNP(original2, all_adata[method_name].obsm['X_umap_'], metric, n_neighbors)
    print('\t\t\tUMAP embedding pipeline:', round((time.time() - start) / 60,
                                                  2), 'mn')
    ### PAGA embedding ###
    start = time.time()
    q_paga_umap = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        sc.tl.paga(all_adata[method_name], groups=clustering_algo)
        sc.pl.paga(all_adata[method_name],
                   show=False,
                   random_state=seed,
                   plot=False)
        sc.tl.umap(all_adata[method_name], init_pos="paga", random_state=seed)
        # q_paga_umap[0, idx] = QDM(original0, all_adata[method_name].obsm['X_umap'], metric)
        q_paga_umap[0,
                    idx] = QDM(original1,
                               all_adata[method_name].obsm['X_umap'], metric)
        # q_paga_umap[2, idx] = QDM(original2, all_adata[method_name].obsm['X_umap'], metric)
        # q_paga_umap[3, idx] = QNP(original0, all_adata[method_name].obsm['X_umap'], metric, n_neighbors)
        q_paga_umap[1, idx] = QNP(original1,
                                  all_adata[method_name].obsm['X_umap'],
                                  metric, n_neighbors)
        # q_paga_umap[5, idx] = QNP(original2, all_adata[method_name].obsm['X_umap'], metric, n_neighbors)
    print('\t\t\tPAGA+UMAP embedding pipeline:',
          round((time.time() - start) / 60, 2), 'mn')
    ### Save ###
    np.savetxt(get_res_path(fname) + "_tsne_q.csv", q_tsne, delimiter=',')
    np.savetxt(get_res_path(fname) + "_umap_q.csv", q_umap, delimiter=',')
    np.savetxt(get_res_path(fname) + "_paga_q.csv", q_paga_umap, delimiter=',')
    print('\t\t\tFull pipeline:', round((time.time() - start0) / 60, 2), 'mn')
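
# A minimal invocation sketch (illustrative; the parameter values are
# assumptions, and get_res_path/fname must be defined at module level):
# viz_analysis(adata, do_norm='seurat', norm_scale=True, do_log=True,
#              do_pca=True, n_clusters=10, metric='cosine', weighted=True,
#              seed=0, n_comps=50, clustering_algo='leiden')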

# Example #5

def generator_from_index(X,
                         Y,
                         index_path,
                         k,
                         batch_size,
                         search_k=-1,
                         precompute=True,
                         verbose=1,
                         type='tri',
                         knn='MP'):
    if k >= X.shape[0] - 1:
        raise Exception('''k value greater than or equal to (num_rows - 1)
                        (k={}, rows={}). Lower k to a smaller
                        value.'''.format(k, X.shape[0]))
    if batch_size > X.shape[0]:
        raise Exception('''batch_size value larger than num_rows in dataset
                        (batch_size={}, rows={}). Lower batch_size to a
                        smaller value.'''.format(batch_size, X.shape[0]))

    if Y is None:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')

            if knn == 'MP':
                if verbose > 0:
                    print('Making MP-based KNN')

                # D = euclidean_distance(X)
                # D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(
                #     D=D, metric='distance')
                neighbour_graph = kneighbors_graph(
                    X,
                    n_neighbors=k,
                    hubness='mutual_proximity',
                    hubness_params={'method': 'normal'})
                # neighbour_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)
                neighbour_matrix = neighbour_graph.indices.reshape(
                    (X.shape[0], k))

            else:
                neighbour_matrix = extract_knn(X,
                                               index_path,
                                               k=k,
                                               search_k=search_k,
                                               verbose=verbose)
            # neighbour_matrix = np.asarray(neighbour_matrix, dtype=np.int32)
            print('neighbour_matrix: ', neighbour_matrix.shape)

            if knn == 'Mutual':
                if verbose > 0:
                    print('Making KNN mutual')

                neighbour_matrix = make_mutual(neighbour_matrix)

            # print('Mutual Knn: ', neighbour_matrix[0])

            if type == 'quad':
                return KnnQuadrupletGenerator(X,
                                              neighbour_matrix,
                                              batch_size=batch_size)
            if type == 'tri':
                return KnnTripletGenerator(X,
                                           neighbour_matrix,
                                           batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1])
            index.load(index_path)
            return AnnoyTripletGenerator(X,
                                         index,
                                         k=k,
                                         batch_size=batch_size,
                                         search_k=search_k)
    else:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')

            neighbour_matrix = extract_knn(X,
                                           index_path,
                                           k=k,
                                           search_k=search_k,
                                           verbose=verbose)
            return LabeledKnnTripletGenerator(X,
                                              Y,
                                              neighbour_matrix,
                                              batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1])
            index.load(index_path)
            return LabeledAnnoyTripletGenerator(X,
                                                Y,
                                                index,
                                                k=k,
                                                batch_size=batch_size,
                                                search_k=search_k)
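
# A minimal usage sketch (illustrative; index_path should point to a built
# Annoy index on disk, and the generator classes are assumed to ship with
# this module):
# gen = generator_from_index(X, Y=None, index_path='annoy.index', k=10,
#                            batch_size=32, type='tri', knn='MP')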

# Example #6

# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# # vectors = vectors[:10000, :]
# # d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# # vectors = mnist.data
# # vectors = vectors[:10000, :]
# # d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# D = euclidean_distance(vectors)
#
# S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance')
# D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(
#         D=D, metric='distance')
# S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance')
#
# print(S_k, S_k_mp)

from skhubness.data import load_dexter

X, y = load_dexter()

from skhubness import Hubness
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

from skhubness.neighbors import kneighbors_graph
k = 5
# neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness='mutual_proximity')
neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)
neighbor_matrix = neighbor_graph.indices.reshape((X.shape[0], k))
print(neighbor_matrix)
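
# Re-checking hubness after reduction (a sketch mirroring the test at the top
# of this file; uses the same skhubness API calls as above):
reduced_graph = kneighbors_graph(X, n_neighbors=10,
                                 hubness='mutual_proximity',
                                 hubness_params={'method': 'normal'})
hub_mp = Hubness(k=10, metric='precomputed')
hub_mp.fit(reduced_graph)
k_skew_mp = hub_mp.score(has_self_distances=True)
print(f'Skewness after mutual proximity reduction = {k_skew_mp:.3f}')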

# Example #7

def ti_analysis(adata, true_labels, do_norm, norm_scale, do_log, do_pca,
                n_clusters, metric, weighted,  # use a weighted adjacency matrix for louvain/leiden clustering?
                seed, n_comps, clustering_algo, n_iter, bootstrap_size):
    hubness_methods = {'nothing': (None, None),
                       'mp_normal': ('mp', {'method': 'normal'}),
                       'ls': ('ls', None),
                       'ls_nicdm': ('ls', {'method': 'nicdm'}),
                       'dsl': ('dsl', None)}
    start = time.time()
    ### preprocess, prepare clustering input ###
    if isinstance(do_norm, str):
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == 'seurat':
            recipe_seurat(adata, do_log, norm_scale)
            # print(f'\t\tseurat norm retained {adata.X.shape[1]} genes')
        elif do_norm == 'duo':
            recipe_duo(adata, do_log, renorm=norm_scale)
            # print(f'\t\tduo norm retained {adata.X.shape[1]} genes')
        else:
            raise ValueError("do_norm not in 'duo', seurat'")
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
    if do_log and not isinstance(do_norm, str):
        # print('\t\tlog_transformed data')
        sc.pp.log1p(adata)
    if do_pca:
        use_rep = 'X_pca'
        sc.tl.pca(adata, n_comps=min(adata.X.shape[1]-1, min(len(adata.X)-1, n_comps)))
        X = adata.obsm['X_pca']
    else:
        # print('pca not done!')
        use_rep = 'X'
        X = adata.X
    n_neighbors = int(np.sqrt(X.shape[0]))
    print('\t\t\tPreprocessing done:', round((time.time()-start)/60, 2), 'mn')
    start = time.time()
    ### clustering and PAGA step ###
    all_adata = dict()
    for kernel in ['umap', 'gauss']:
        all_adata[kernel] = adata.copy()
        try:
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors+1, metric=metric, use_rep=use_rep, method=kernel)
        except Exception:
            # fall back to a dense (knn=False) neighbour computation
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors+1, metric=metric, use_rep=use_rep, method=kernel, knn=False)
        G, weights = generate_clustering_inputs(X=X,
                                                metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted,
                                                seed=seed,
                                                hubness=None,
                                                hubness_params=None)
        resol, weighted = getNclusters(all_adata[kernel], G, n_clusters=n_clusters, seed=seed,
                                       clustering_algo=clustering_algo, flavor='scanpy', weights=weights)
        if clustering_algo == "leiden":
            sc.tl.leiden(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
            sc.tl.paga(all_adata[kernel], groups="leiden")
        elif clustering_algo == "louvain":
            sc.tl.louvain(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
            sc.tl.paga(all_adata[kernel], groups="louvain")
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_adata[method_name] = adata.copy()
        all_adata[method_name].obsp['connectivities'] = kneighbors_graph(X,
                                                                         n_neighbors=n_neighbors,
                                                                         hubness=hubness,
                                                                         hubness_params=hubness_params,
                                                                         metric=metric,
                                                                         mode="connectivity")
        all_adata[method_name].obsp['distances'] = kneighbors_graph(X,
                                                                    n_neighbors=n_neighbors,
                                                                    hubness=hubness,
                                                                    hubness_params=hubness_params,
                                                                    metric=metric,
                                                                    mode="distance")
        all_adata[method_name].uns['neighbors'] = {'connectivities_key': 'connectivities',
                                                   'distances_key': 'distances',
                                                   'params': {'n_neighbors': n_neighbors,
                                                              'method': method_name,
                                                              'metric': metric}}
        G, weights = generate_clustering_inputs(X=X,
                                                metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted,
                                                seed=seed,
                                                hubness=hubness,
                                                hubness_params=hubness_params)
        resol, weighted = getNclusters(all_adata[method_name], G, n_clusters=n_clusters, seed=seed,
                                       clustering_algo=clustering_algo, flavor='base', weights=weights)
        if clustering_algo == "louvain":
            clus = np.array(louvain.find_partition(graph=G,
                                                   partition_type=louvain.RBConfigurationVertexPartition,
                                                   weights=weights,
                                                   resolution_parameter=resol, seed=seed).membership)
            all_adata[method_name].obs['louvain'] = pd.Categorical(values=clus.astype('U'),
                                                                   categories=natsorted(map(str, np.unique(clus))),)
            sc.tl.paga(all_adata[method_name], groups="louvain", neighbors_key='neighbors')
        elif clustering_algo == "leiden":
            clus = np.array(leidenalg.find_partition(graph=G,
                                                     partition_type=leidenalg.RBConfigurationVertexPartition,
                                                     weights=weights,
                                                     resolution_parameter=resol,
                                                     seed=seed).membership)
            all_adata[method_name].obs['leiden'] = pd.Categorical(values=clus.astype('U'),
                                                                  categories=natsorted(map(str, np.unique(clus))),)
            sc.tl.paga(all_adata[method_name], groups="leiden")
    print('\t\t\tHubness and PAGA full pipeline:', round((time.time()-start)/60, 2), 'mn')
    start = time.time()
    ### PAGA stab ###
    all_iter = dict()
    cell_iter = dict()
    feat_iter = dict()
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_iter[method_name] = dict()
        cell_iter[method_name] = np.zeros((n_iter, adata.n_obs))
        feat_iter[method_name] = np.zeros((n_iter, adata.n_vars))
        for it in tqdm(range(n_iter)):
            # keep each feature/cell with probability bootstrap_size
            feat_bootstrap = np.random.uniform(0, 1, size=adata.n_vars) <= bootstrap_size
            cell_bootstrap = np.random.uniform(0, 1, size=adata.n_obs) <= bootstrap_size
            cell_iter[method_name][it, :] = cell_bootstrap
            feat_iter[method_name][it, :] = feat_bootstrap
            uns = {'Order': true_labels[cell_bootstrap]}
            adata_sampled = anndata.AnnData(adata.X[cell_bootstrap][:, feat_bootstrap],
                                            uns=uns)
            n_clusters2 = len(np.unique(adata_sampled.uns['Order']))
            if do_pca:
                sc.tl.pca(adata_sampled, n_comps=min(adata_sampled.X.shape[1]-1, min(len(adata_sampled.X)-1, n_comps)))
                X2 = adata_sampled.obsm['X_pca']
            else:
                X2 = adata_sampled.X
            adata_sampled.obsp["connectivities"] = kneighbors_graph(X2,
                                                                    n_neighbors=n_neighbors,
                                                                    hubness=hubness,
                                                                    hubness_params=hubness_params,
                                                                    metric=metric,
                                                                    mode="connectivity")
            adata_sampled.obsp["distances"] = kneighbors_graph(X2,
                                                               n_neighbors=n_neighbors,
                                                               hubness=hubness,
                                                               hubness_params=hubness_params,
                                                               metric=metric,
                                                               mode="distance")
            adata_sampled.uns['neighbors'] = {'connectivities_key': 'connectivities',
                                              'distances_key': 'distances',
                                              'params': {'n_neighbors': n_neighbors,
                                                         'method': method_name,
                                                         'metric': metric}}
            G2, weights2 = generate_clustering_inputs(X=X2,
                                                      metric=metric,
                                                      n_neighbors=n_neighbors,
                                                      weighted=weighted,
                                                      seed=seed,
                                                      hubness=hubness,
                                                      hubness_params=hubness_params)
            resol2, weighted2 = getNclusters(adata_sampled, G2, n_clusters=n_clusters2, seed=seed,
                                             clustering_algo=clustering_algo, flavor='base', weights=weights2)
            if clustering_algo == "leiden":
                clus = np.array(leidenalg.find_partition(graph=G2,
                                                         partition_type=leidenalg.RBConfigurationVertexPartition,
                                                         weights=weights2,
                                                         resolution_parameter=resol2,
                                                         seed=seed).membership)
                adata_sampled.obs['leiden'] = pd.Categorical(values=clus.astype('U'),
                                                             categories=natsorted(map(str, np.unique(clus))),)
                sc.tl.paga(adata_sampled, groups="leiden")
            elif clustering_algo == "louvain":
                clus = np.array(louvain.find_partition(graph=G2,
                                                       partition_type=louvain.RBConfigurationVertexPartition,
                                                       weights=weights2,
                                                       resolution_parameter=resol2, seed=seed).membership)
                adata_sampled.obs['louvain'] = pd.Categorical(values=clus.astype('U'),
                                                              categories=natsorted(map(str, np.unique(clus))), )
                sc.tl.paga(adata_sampled, groups="louvain")
            all_iter[method_name]['iter' + str(it)] = adata_sampled.uns["paga"]["connectivities_tree"]
    print('\t\t\tPAGA stability pipeline:', round((time.time()-start)/60, 2), 'mn')
    for method_name in all_adata.keys():
        if method_name == "nothing":
            all_adata[method_name] = anndata.AnnData(X=all_adata[method_name].X,
                                                     uns={'Order': all_adata[method_name].uns['Order'],
                                                          'paga': all_adata[method_name].uns['paga']},
                                                     obs=all_adata[method_name].obs)
        else:
            all_adata[method_name] = anndata.AnnData(X=all_adata[method_name].X[:, :2],
                                                     uns={'Order': all_adata[method_name].uns['Order'],
                                                          'paga': all_adata[method_name].uns['paga']},
                                                     obs=all_adata[method_name].obs)
        all_adata[method_name].write_h5ad(filename=get_res_path(fname)+'_'+method_name+".h5ad")
        if method_name not in ["umap", "gauss"]:
            with open(get_res_path(fname) + '_' + method_name + "_stab.csv", "w") as f:
                w = csv.writer(f)
                for key, val in all_iter[method_name].items():
                    w.writerow([key, val])
            np.savetxt(get_res_path(fname)+'_'+method_name+"_stab_cell.csv", cell_iter[method_name], delimiter=',', fmt='%d')
            np.savetxt(get_res_path(fname)+'_'+method_name+"_stab_feat.csv", feat_iter[method_name], delimiter=',', fmt='%d')
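
# A minimal invocation sketch (illustrative; the parameter values are
# assumptions, and get_res_path/fname must be defined at module level):
# ti_analysis(adata, true_labels=adata.uns['Order'], do_norm='duo',
#             norm_scale=True, do_log=True, do_pca=True, n_clusters=10,
#             metric='cosine', weighted=True, seed=0, n_comps=50,
#             clustering_algo='leiden', n_iter=10, bootstrap_size=0.95)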