コード例 #1
0
def performance(encoder, models, K):
    """Average clustering quality of each model on encoder-transformed features.

    Runs K repetitions: each repetition re-encodes the (module-level)
    `features` with `encoder`, fits every model in `models`, and accumulates
    the mean AMI, Calinski-Harabasz and silhouette scores.  Results are
    appended to '../results/<name_prefix>_results.txt'.

    Parameters
    ----------
    encoder : transformer exposing fit_transform(features, target)
    models : dict mapping model name -> clustering estimator with fit_predict
    K : int, number of repetitions to average over

    NOTE(review): relies on module-level `features`, `target`, `name_prefix`,
    `time`, `ami`, `chs`, `sil` — confirm they are defined at import time.
    """
    # One running-mean accumulator per model.
    mean_ami = {key: 0.0 for key in models}
    mean_chs = {key: 0.0 for key in models}
    mean_sil = {key: 0.0 for key in models}

    tic = time.perf_counter()
    for _ in range(K):
        features_enc = encoder.fit_transform(features, target)

        for key, model in models.items():
            y_predict = model.fit_predict(features_enc, target)

            # Add 1/K of each score so the total is the mean over K runs.
            mean_ami[key] += ami(target, y_predict)/K
            mean_chs[key] += chs(features_enc, y_predict)/K
            mean_sil[key] += sil(features_enc, y_predict, metric='euclidean')/K

    toc = time.perf_counter()

    # Write results to file.  `with` guarantees the handle is closed even if a
    # write raises (the original leaked the handle on error).
    with open('../results/'+name_prefix+'_results.txt', 'a') as res:
        res.write(type(encoder).__name__[0:-7]+' Encoder\n')
        for key in mean_ami:
            res.write(' '+key+': '+str(mean_ami[key])+', '+str(mean_chs[key])+', '+str(mean_sil[key])+'\n')
        res.write('Total time: '+str(round(toc-tic,3))+'\n')

    print('Evaluation of', type(encoder).__name__[0:-7], 'Encoder completed in', round(toc-tic,3),'s')
コード例 #2
0
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Score panorama assembly with exact (non-approximate) nearest neighbours.

    Appends the per-cell silhouette scores to `distr` and the tick label to
    `xlabels`, then redraws and saves the parameter-sensitivity boxplot.
    """
    panorama = assemble(datasets_dimred[:], approx=False, sigma=150)
    stacked = np.concatenate(panorama)
    distr.append(sil(stacked[idx, :], labels[idx]))
    print(ttest_ind(stacked[idx, :], labels[idx]))
    xlabels.append('Exact NN')
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx'))
コード例 #3
0
def silhouette_score(estimator, X, y=None):
    """Scorer wrapper for metrics.silhouette_score.

    Returns NaN when the score is undefined: for precomputed-affinity
    estimators (X is not a feature matrix) and when the number of predicted
    labels falls outside [2, n_samples - 1].
    """
    # With a precomputed affinity, X is an affinity/distance matrix, so a
    # feature-space silhouette is meaningless here.
    if hasattr(estimator, 'affinity') and estimator.affinity == 'precomputed':
        return np.nan

    _y = estimator.predict(X)
    n_labels = len(np.unique(_y))
    if 1 < n_labels < X.shape[0]:
        return sil(X, _y)

    # `logging.warn` is a deprecated alias; use `logging.warning`.
    logging.warning("adenine.utils.extension.silhouette_score() returned NaN "
                    "because the number of labels is {}. Valid values are 2 "
                    "to n_samples - 1 (inclusive) = {}".format(
                        n_labels, X.shape[0] - 1))
    return np.nan
コード例 #4
0
    def predict_from_tuned(self):
        """Return labels from hierarchical clustering with the n_clusters that
        maximizes the Silhouette measure.

        Tries every cluster count k in [self.L_, self.K_) with average-linkage
        agglomerative clustering on the precomputed distance matrix
        ``1 - self.Mk`` and returns the labelling whose silhouette on
        ``self.X`` is highest.

        Raises
        ------
        AssertionError : if fit has not been run (``self.Mk`` is None).
        """
        assert self.Mk is not None, "First run fit"

        # Hoist the loop-invariant distance matrix: the original recomputed
        # `1 - self.Mk` (an O(n^2) allocation) on every iteration.
        dist = 1 - self.Mk

        scores = []
        labels = []
        for k in range(self.L_, self.K_):
            # NOTE(review): `affinity=` is deprecated in recent scikit-learn
            # in favour of `metric=` — confirm the pinned sklearn version.
            cls = AgglomerativeClustering(n_clusters=k, linkage='average',
                                          affinity='precomputed').fit(dist)

            ls = cls.labels_
            labels.append(ls)
            scores.append(sil(self.X, ls))

        labels = np.array(labels)
        return labels[np.argmax(scores)]
コード例 #5
0
File: extensions.py — Project: slipguru/adenine
def silhouette_score(estimator, X, y=None):
    """Scorer wrapper for metrics.silhouette_score.

    Returns NaN when the score is undefined: for precomputed-affinity
    estimators (X is not a feature matrix) and when the number of predicted
    labels falls outside [2, n_samples - 1].
    """
    # With a precomputed affinity, X is an affinity/distance matrix, so a
    # feature-space silhouette is meaningless here.
    if hasattr(estimator, 'affinity') and estimator.affinity == 'precomputed':
        return np.nan

    _y = estimator.predict(X)
    n_labels = len(np.unique(_y))
    if 1 < n_labels < X.shape[0]:
        return sil(X, _y)

    # `logging.warn` is a deprecated alias; use `logging.warning`.
    logging.warning("adenine.utils.extension.silhouette_score() returned NaN "
                    "because the number of labels is {}. Valid values are 2 "
                    "to n_samples - 1 (inclusive) = {}"
                    .format(n_labels, X.shape[0]-1))
    return np.nan
コード例 #6
0
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the alignment weight `alpha` and extend the sensitivity boxplot.

    For each alpha, the datasets are re-assembled, the per-cell silhouette
    scores appended to `distr`, and the tick label to `xlabels`; the
    accumulated boxplot is then redrawn and saved.
    """
    for alpha in (0, 0.05, 0.20, 0.50):
        panorama = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        stacked = np.concatenate(panorama)
        distr.append(sil(stacked[idx, :], labels[idx]))
        print(ttest_ind(stacked[idx, :], labels[idx]))
        xlabels.append(str(alpha))
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha'))
コード例 #7
0
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the alignment bandwidth `sigma` and extend the sensitivity boxplot.

    For each sigma, the datasets are re-assembled, the per-cell silhouette
    scores appended to `distr`, and the tick label to `xlabels`; the
    accumulated boxplot is then redrawn and saved.
    """
    for sigma in (10, 50, 100, 200):
        panorama = assemble(datasets_dimred[:], sigma=sigma)
        stacked = np.concatenate(panorama)
        distr.append(sil(stacked[idx, :], labels[idx]))
        print(ttest_ind(stacked[idx, :], labels[idx]))
        xlabels.append(str(sigma))
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma'))
コード例 #8
0
def test_learn_rate(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the t-SNE learning rate and extend the sensitivity boxplot.

    Each embedding's per-cell silhouette scores are appended to `distr` and
    the tick label to `xlabels`; the accumulated boxplot is then saved.
    """
    stacked = np.concatenate(datasets_dimred)

    for learn_rate in (50., 100., 500., 1000.):
        embedding = fit_tsne(stacked, learn_rate=learn_rate)
        distr.append(sil(embedding[idx, :], labels[idx]))
        # NOTE(review): the t-test uses the un-embedded matrix, mirroring the
        # other sweeps — presumably intentional; confirm.
        print(ttest_ind(stacked[idx, :], labels[idx]))
        xlabels.append(str(learn_rate))
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('learn_rate'))
コード例 #9
0
def test_perplexity(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the t-SNE perplexity and extend the sensitivity boxplot.

    Each embedding's per-cell silhouette scores are appended to `distr` and
    the tick label to `xlabels`; the accumulated boxplot is then saved.
    """
    stacked = np.concatenate(datasets_dimred)

    for perplexity in (10, 100, 500, 2000):
        embedding = fit_tsne(stacked, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        # NOTE(review): the t-test uses the un-embedded matrix, mirroring the
        # other sweeps — presumably intentional; confirm.
        print(ttest_ind(stacked[idx, :], labels[idx]))
        xlabels.append(str(perplexity))
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity'))
コード例 #10
0
File: example.py — Project: ldrbmrtv/Auto2ML
def cluster_and_evaluate(models, X, y, score):
    """Cluster X for every k in [2, len(X)), score each partition, and append
    the resulting model summary to `models`.

    Three metrics are collected per k (silhouette, Davies-Bouldin, `ae`) and
    handed to `evaluate`, which fills the `model` dict.
    """
    k_list = range(2, len(X))

    sil_list, db_list, ae_list = [], [], []
    for k in k_list:
        partition = cluster(X, k)
        sil_list.append(sil(X, partition))
        db_list.append(db(X, partition))
        ae_list.append(ae(partition))

    model = {}
    evaluate(model, k_list, sil_list, score, 'sil')
    # Davies-Bouldin is better when smaller, hence the `min` selector.
    evaluate(model, k_list, db_list, score, 'db', min)
    evaluate(model, k_list, ae_list, score, 'ae')
    models.append(model)
コード例 #11
0
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the number of nearest neighbours and extend the sensitivity boxplot.

    For each knn, the datasets are re-assembled and the per-cell silhouette
    scores compared (t-test) against every distribution that was already in
    `distr` before this sweep started.
    """
    n_baseline = len(distr)
    for knn in (5, 10, 50, 100):
        panorama = assemble(datasets_dimred[:], knn=knn, sigma=150)
        stacked = np.concatenate(panorama)
        distr.append(sil(stacked[idx, :], labels[idx]))
        # Compare against the pre-existing (baseline) distributions only.
        for baseline in distr[:n_baseline]:
            print(ttest_ind(np.ravel(stacked[idx, :]), np.ravel(baseline)))
        xlabels.append(str(knn))
    print('')

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn'))
コード例 #12
0
def test_dimred(datasets, genes, labels, idx, distr, xlabels):
    """Sweep the SVD dimensionality and extend the sensitivity boxplot.

    For each dimred value the raw datasets are reprocessed and assembled,
    per-cell silhouette scores appended to `distr`, and the tick label to
    `xlabels`; the accumulated boxplot is then redrawn and saved.
    """
    for dimred in (10, 20, 50, 200, 6000):
        reduced, genes = process_data(datasets, genes, dimred=dimred)
        reduced = assemble(reduced, sigma=150)
        stacked = np.concatenate(reduced)
        distr.append(sil(stacked[idx, :], labels[idx]))
        print(ttest_ind(stacked[idx, :], labels[idx]))
        xlabels.append(str(dimred))
    print('')
    # The largest setting effectively disables the SVD reduction.
    xlabels[-1] = 'No SVD'

    # Redraw the accumulated boxplot and save it to disk.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('dimred'))
コード例 #13
0
def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''
    Plot 2-D projections (PCA, or preprocessing + LDA) of every fingerprint
    table in `holder` on a 3x3 grid of axes.

    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class

    Parameters
    ----------
    preprocess_lda : {'PLS', 'PCA', 'kernelPCA', 'NONE'}
        Dimensionality reduction applied before LDA (ignored if only_pca).
    class_name : str
        ATC class to highlight; with binarize_class it is contrasted vs rest.
    only_pca : bool
        If True, project with plain 2-component PCA instead of LDA.
    binarize_class : bool
        Collapse every class other than `class_name` into 'not <class_name>'.
    standardize : bool
        Scale the fingerprints before projecting.
    cluster : bool
        Fit one KMeans centroid per group and annotate each panel with the
        centroid distance, silhouette and Calinski-Harabasz scores
        (binarized mode only).
    return_distances : bool
        If True, also return the per-fingerprint score dictionaries.

    Returns
    -------
    fig, or (fig, distances, sil_scores, chs_scores) if return_distances.
    '''
    # Fingerprint tables to project, in fixed display order (was duplicated
    # verbatim in both branches below).
    fp_keys = [
        'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
        'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
        'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
        'fps_gae_64bit_new'
    ]

    if only_pca:
        from sklearn.decomposition import PCA

        df = dict()
        for ind, i in enumerate(fp_keys):
            # Keep only labelled rows and drop duplicated IDs.
            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'

    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366

        df = dict()
        for ind, i in enumerate(fp_keys):
            # Keep only labelled rows and drop duplicated IDs.
            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from sklearn.preprocessing import MinMaxScaler

                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                # BUGFIX: columns must come from df_cluster — `df` is a dict
                # here, so `df.columns` raised AttributeError.
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                # BUGFIX: use the class_name variable — the original
                # concatenated the literal string 'class_name'.
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name

            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(
                df_cluster['classes'])
            classes = df_cluster.pop('classes')

            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values
            else:
                # Previously an unknown option fell through with `temp`
                # undefined and raised a confusing NameError.
                raise ValueError(
                    'Unknown preprocess_lda: {}'.format(preprocess_lda))

            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                # np.float was removed in NumPy 1.24; builtin float is the
                # documented equivalent.
                temp = temp.astype(float)  # in case of complex numbers///
            df[ind] = pd.DataFrame(index=df_cluster.index,
                                   columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes

        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6),
          (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(13, 14))
    # NOTE(review): plt.cm.get_cmap is removed in matplotlib >= 3.9; switch to
    # matplotlib.colormaps['Spectral'] once the pinned version allows.
    # `ind` is the leftover loop index, i.e. the colormap is sized from the
    # last fingerprint's class set.
    cm = plt.cm.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))),
                 alpha=0.6)

    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
        [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9], df.keys(), df.values(),
        [
            'E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256', 'VAE_16',
            'Trans_1024', 'Trans_64', 'GAE_64'
        ]):
        if not binarize_class:
            # One scatter per class, highlighting class_name in black.
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    # BUGFIX: the original had a trailing comma here, which
                    # made `color` the tuple ('black',).
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0],
                    x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i +
                    f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            # Binary view: everything in default colour, class_name on top.
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name +
                f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
            if cluster:
                from sklearn.cluster import KMeans
                from scipy.spatial.distance import pdist
                from sklearn.metrics import silhouette_score as sil
                from sklearn.metrics import calinski_harabasz_score as chs

                # One centroid per group: rest vs class_name.
                km = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km.fit(x.loc[x.classes != class_name, [0, 1]])

                km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km1.fit(x.loc[x.classes == class_name, [0, 1]])

                ax_n.scatter(km.cluster_centers_[:, 0],
                             km.cluster_centers_[:, 1],
                             marker='X',
                             color='darkblue',
                             s=100,
                             linewidth=3)
                ax_n.scatter(km1.cluster_centers_[:, 0],
                             km1.cluster_centers_[:, 1],
                             marker='X',
                             color='red',
                             s=100,
                             linewidth=3)

                # Annotate the panel with centroid distance and cluster scores.
                d = round(
                    pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                          metric='euclidean')[0], 3)
                d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
                d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
                if return_distances:
                    cl_name = class_name + ' ' + name
                    distances[cl_name] = d
                    sil_scores[cl_name] = d_sc
                    chs_scores[cl_name] = d_chs
                name = name + '\n|d:' + str(d) + '|sil:' + str(
                    d_sc) + '|chs:' + str(d_chs)
                ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])

    labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    else:
        return fig, distances, sil_scores, chs_scores
コード例 #14
0
    plt.savefig('param_sensitivity_{}.svg'.format('learn_rate'))


if __name__ == '__main__':
    # Names of the dataset panels to integrate.
    with open('conf/panorama.txt') as f:
        data_names = f.read().split()

    # One cell-type label per cell, whitespace-separated.
    labels = np.array(open('data/cell_labels/all.txt').read().rstrip().split())
    idx = range(labels.shape[0])  # evaluate silhouette on every cell

    datasets, genes_list, n_cells = load_names(data_names)
    datasets, genes = merge_datasets(datasets, genes_list)
    datasets_dimred, genes = process_data(datasets, genes)

    # Baseline: silhouette without any batch correction.
    X = np.concatenate(datasets_dimred)
    sil_non = sil(X[idx, :], labels[idx])
    print(np.median(sil_non))

    # Precomputed scran MNN-corrected embedding.
    X = np.loadtxt('data/corrected_mnn.txt')
    sil_mnn = sil(X[idx, :], labels[idx])
    print(np.median(sil_mnn))

    # Precomputed Seurat CCA-corrected embedding.
    X = np.loadtxt('data/corrected_seurat.txt')
    sil_cca = sil(X[idx, :], labels[idx])
    print(np.median(sil_cca))

    # Baseline distributions for the parameter-sensitivity boxplots.
    distr = [sil_non, sil_mnn, sil_cca]
    xlabels = ['No correction', 'scran MNN', 'Seurat CCA']

    # Test alignment parameters.
    # Copies ([:]) are passed so each sweep starts from the same baselines.
    test_approx(datasets_dimred[:], genes, labels, idx, distr[:], xlabels[:])
コード例 #15
0
File: silhouette.py — Project: marnifora/scanorama
    labels = np.array(
        open(path + 'cell_labels/all.txt').read().rstrip().split()
    )
    idx = range(labels.shape[0])
    #idx = np.random.choice(len(labels), size=int(len(labels)/2), replace=False)
    
    datasets, genes_list, cells_list, n_cells = load_names(data_names)
    datasets, genes = merge_datasets(datasets, genes_list)
    datasets_dimred, datasets_norm, genes = process_data(datasets, genes)

    #mmwrite(output + 'panorama_nocorrection_silh.mtx', vstack(datasets_dimred))

    # Baseline without correction.
    X = np.concatenate(datasets_dimred)
    sil_non = sil(X[idx, :], labels[idx])
    print(np.median(sil_non))

    '''
    # scran MNN.
    X = np.loadtxt(path + 'corrected_mnn.txt')
    sil_mnn = sil(X[idx, :], labels[idx])
    print(np.median(sil_mnn))

    # Seurat CCA.
    X = np.loadtxt(path + 'corrected_seurat.txt')
    sil_cca = sil(X[idx, :], labels[idx])
    print(np.median(sil_cca))
    '''

    # Scanorama.