def kmedoidsWithScores(filenameData, filenameSilhMean, nameDBS, nameCHS,
                       kClusters, measure):
    path = pathlib.Path(str(root) + '\\' + filenameData)
    if path.is_file():
        data = read_sample(path)

        clusters, predicted = kmedoidsRun(data, kClusters, measure)

        meanSilhouetteScore = meanSilh(data, clusters)
        witTXT(meanSilhouetteScore,
               filenameSilhMean,
               filepath=root,
               note=filenameData + " k: " + str(kClusters))

        dbsScore = dbs(data, predicted)
        witTXT(dbsScore,
               nameDBS,
               filepath=root,
               note=filenameData + " k: " + str(kClusters))

        chsScore = chs(data, predicted)
        witTXT(chsScore,
               nameCHS,
               filepath=root,
               note=filenameData + " k: " + str(kClusters))
Example #2
def kmedoidsWithScores(filenameData, filenameSilhMean, filenameDBS,
                       filenameCHS, kClusters):
    data = read_sample(str(root) + '\\' + filenameData)

    #kClusters = canoc(data, kmin, kmax)

    initial_medoids = randomCenters(len(data), kClusters)
    kmedoids_instance = kmedoids(data, initial_medoids, metric=metricResearch)

    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    witTXT(meanSilhouetteScore,
           filenameSilhMean,
           filepath=root,
           note='k: ' + str(kClusters))

    dbsScore = dbs(data, predicted)
    witTXT(dbsScore, filenameDBS, filepath=root, note='k: ' + str(kClusters))

    chsScore = chs(data, predicted)
    witTXT(chsScore, filenameCHS, filepath=root, note='k: ' + str(kClusters))
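
Both snippets above follow the same pattern: run k-medoids, then score the partition with the mean silhouette, Davies-Bouldin, and Calinski-Harabasz indices. Below is a minimal self-contained sketch of that pipeline, assuming pyclustering and scikit-learn are installed; synthetic blobs and prints stand in for read_sample and witTXT.

import numpy as np
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster.silhouette import silhouette
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score as dbs
from sklearn.metrics import calinski_harabasz_score as chs

data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
data = data.tolist()  # pyclustering expects a list of points
kClusters = 3

# random initial medoid indices, as randomCenters presumably does
rng = np.random.default_rng(0)
initial_medoids = rng.choice(len(data), size=kClusters, replace=False).tolist()

instance = kmedoids(data, initial_medoids)
instance.process()
clusters = instance.get_clusters()  # list of per-cluster point indices
predicted = instance.predict(data)  # flat cluster label per point

print('mean silhouette:', np.mean(silhouette(data, clusters).process().get_score()))
print('Davies-Bouldin:', dbs(data, predicted))
print('Calinski-Harabasz:', chs(data, predicted))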
Example #3
    def calculate_cluster(self, dist_mat, max_iter=21):
        """Calculates the best possible communities/clusters.

        self.store_score calculated CH score for every clustering number. (Read the paper for more details)

        Args:
            dist_mat (numpy.array) : 1 - cross-correlation matrix.
            max_iter (int)         : Maximum number of iterations for cluster counts.
        
        Returns:
            True if successful; None otherwise.
        """
        best_score = 0
        dist_mat = numpy.sqrt(2*(dist_mat))
        Data = numpy.triu(dist_mat)
        Z = ward(Data)
        self.store_score = dict()
        for i in range(2, max_iter):
            label = fcluster(Z, i, criterion='maxclust')
            self.store_communities[i] = label
            score = chs(dist_mat, label)
            self.store_score[i] = score
            if score > best_score:
                self.best_community = label
                best_score = score
        
        self.store_score = dict(sorted(self.store_score.items(), key=lambda item: item[1], reverse=True))
        return True
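
The core idea of calculate_cluster in isolation: cut a Ward hierarchy at each candidate cluster count and keep the labeling with the best Calinski-Harabasz score. A minimal sketch on synthetic data; the names here are illustrative, not from the original class.

import numpy
from scipy.cluster.hierarchy import ward, fcluster
from sklearn.metrics import calinski_harabasz_score as chs

rng = numpy.random.default_rng(0)
X = rng.normal(size=(60, 4))  # stand-in for the distance features
Z = ward(X)                   # Ward linkage over the observations

scores = {}
for k in range(2, 21):
    labels = fcluster(Z, k, criterion='maxclust')
    scores[k] = chs(X, labels)

best_k = max(scores, key=scores.get)
print('best cluster count by CH score:', best_k)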
Example #4
def performance(encoder, models, K):
    mean_ami = {key: 0.0 for key in models}
    mean_chs = {key: 0.0 for key in models}
    mean_sil = {key: 0.0 for key in models}

    tic = time.perf_counter()
    for i in range(K):
        features_enc = encoder.fit_transform(features, target)

        for key in models:
            model = models[key]
            
            y_predict = model.fit_predict(features_enc, target)

            mean_ami[key] += ami(target, y_predict)/K
            mean_chs[key] += chs(features_enc, y_predict)/K
            mean_sil[key] += sil(features_enc, y_predict, metric='euclidean')/K

    toc = time.perf_counter()

    # Write results to file
    with open('../results/' + name_prefix + '_results.txt', 'a') as res:
        res.write(type(encoder).__name__[0:-7] + ' Encoder\n')
        for key in mean_ami:
            res.write(' ' + key + ': ' + str(mean_ami[key]) + ', ' + str(mean_chs[key]) + ', ' + str(mean_sil[key]) + '\n')
        res.write('Total time: ' + str(round(toc - tic, 3)) + '\n')

    print('Evaluation of', type(encoder).__name__[0:-7], 'Encoder completed in', round(toc-tic,3),'s')
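
A hypothetical call, assuming features, target, and name_prefix are module-level globals as the function expects, that the encoder follows the category_encoders fit_transform(X, y) interface, and that ../results/ exists:

from category_encoders import TargetEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering

models = {
    'kmeans': KMeans(n_clusters=3, n_init=10),
    'agglomerative': AgglomerativeClustering(n_clusters=3),
}
performance(TargetEncoder(), models, K=5)  # averages AMI, CHS and silhouette over 5 runs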
Example #5
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS,
                      k_clusters, measure, kmin, kmax):
    data = read_sample(str(root) + '\\' + nameData)

    initial_medians = kppi(data, k_clusters).initialize()
    kmedians_instance = kmedians(data, initial_medians)
    kmedians_instance.process()

    clusters = kmedians_instance.get_clusters()
    #    final_medians = kmedians_instance.get_medians()

    predicted = kmedians_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)

    elbow_instance = elbow(data, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    wce = elbow_instance.get_wce()
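
The elbow step at the end can be exercised on its own. A short sketch, assuming pyclustering's elbow API and synthetic data in place of read_sample:

from pyclustering.cluster.elbow import elbow
from sklearn.datasets import make_blobs

data = make_blobs(n_samples=200, centers=4, random_state=1)[0].tolist()
elbow_instance = elbow(data, 1, 10)  # search k in [1, 10]
elbow_instance.process()
print('most probable amount of clusters:', elbow_instance.get_amount())
print('within-cluster errors:', elbow_instance.get_wce())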
Example #6
def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''
    holder should be a dictionary with DataFrames as values and fingerprint
    filenames as keys; labels should be a mapping of DrugCombID -> ATC_class
    '''
    if only_pca:
        from sklearn.decomposition import PCA

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'

    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                from sklearn.preprocessing import MinMaxScaler

                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name

            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(
                df_cluster['classes'])
            classes = df_cluster.pop('classes')

            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values

            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                temp = temp.astype(float)  # in case of complex numbers
            df[ind] = pd.DataFrame(index=df_cluster.index,
                                   columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes

        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6),
          (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(13, 14))
    cm = plt.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))),
                 alpha=0.6)

    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
        [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9], df.keys(), df.values(),
        [
            'E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256', 'VAE_16',
            'Trans_1024', 'Trans_64', 'GAE_64'
        ]):
        if not binarize_class:
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0],
                    x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i +
                    f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name +
                f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
            if cluster:
                from sklearn.cluster import KMeans
                from scipy.spatial.distance import pdist
                from sklearn.metrics import silhouette_score as sil
                from sklearn.metrics import calinski_harabasz_score as chs

                km = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km.fit(x.loc[x.classes != class_name, [0, 1]])

                km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km1.fit(x.loc[x.classes == class_name, [0, 1]])

                ax_n.scatter(km.cluster_centers_[:, 0],
                             km.cluster_centers_[:, 1],
                             marker='X',
                             color='darkblue',
                             s=100,
                             linewidth=3)
                ax_n.scatter(km1.cluster_centers_[:, 0],
                             km1.cluster_centers_[:, 1],
                             marker='X',
                             color='red',
                             s=100,
                             linewidth=3)

                d = round(
                    pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                          metric='euclidean')[0], 3)
                d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
                d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
                if return_distances:
                    cl_name = class_name + ' ' + name
                    distances[cl_name] = d
                    sil_scores[cl_name] = d_sc
                    chs_scores[cl_name] = d_chs
                name = name + '\n|d:' + str(d) + '|sil:' + str(
                    d_sc) + '|chs:' + str(d_chs)
                ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])

    labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    else:
        return fig, distances, sil_scores, chs_scores
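
A hypothetical call, assuming holder maps each fingerprint key listed above to a DataFrame indexed by DrugCombID and labels maps DrugCombID to its ATC class:

fig, distances, sil_scores, chs_scores = plot_projections(
    holder,
    labels,
    preprocess_lda='PCA',
    class_name='Antioxidants',
    cluster=True,
    return_distances=True)
fig.savefig('lda_projections_antioxidants.png', dpi=150)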
Example #7

def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    data = read_sample(str(root) + '\\' + nameData)

    kClusters = canoc(data, kmin, kmax)  # chooses the cluster count; the k_clusters argument goes unused
    
    initial_medoids = rci(data, kClusters).initialize()

    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)

    # elbow_instance = elbow(data, kmin, kmax)
    # elbow_instance.process()
    # amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    # wce = elbow_instance.get_wce()

kmedoidsWithScore(filenameData, filenameSilhouetteMean, filenameDBS, filenameCHS, k, metric, k_min, k_max)