コード例 #1
0
ファイル: code.py プロジェクト: rishub/unsupervised-leaerning
def run_kmeans(X, y, title):
    """Fit k-means over a range of cluster counts and show two score curves.

    For every even k from 2 to 48 a ``KMeans`` model (``n_init=10``,
    ``random_state=100``) is fitted on ``X``. Two figures are then displayed
    with ``plt.show()``: ``sil_score`` of the fitted labels versus k, and
    ``f1_score`` of ``y`` versus the output of ``cluster_predictions``.

    Args:
        X: Feature data passed to ``KMeans.fit`` and ``sil_score``.
        y: Reference labels passed to ``cluster_predictions`` and ``f1_score``.
        title: Text concatenated into both figure titles.

    Returns:
        None. The function only displays figures.
    """
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []
    f1_scores = []

    for k in kclusters:
        # n_jobs is deliberately not passed: assuming KMeans is
        # sklearn.cluster.KMeans, the argument was deprecated in 0.23 and
        # removed in 1.0, where passing it raises TypeError.
        km = KMeans(n_clusters=k, n_init=10, random_state=100).fit(X)
        sil_scores.append(sil_score(X, km.labels_))
        # presumably maps each cluster to its majority label in y; verify
        # against cluster_predictions, which is defined elsewhere.
        y_mode_vote = cluster_predictions(y, km.labels_)
        f1_scores.append(f1_score(y, y_mode_vote))

    # Silhouette versus number of clusters.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, sil_scores)
    plt.grid(True)
    plt.xlabel('Num. Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + 'k-means Silhouette')
    plt.show()

    # F1 versus number of clusters.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores KMeans: ' + title)
    plt.show()
コード例 #2
0
def em_experiment(X, y, title, folder=""):
    """Fit an ``EM`` model for 2..10 components and save one chart per metric.

    For each component count the model is fitted on ``X`` and six values are
    recorded: ``sil_score``, ``em.score``, ``homogeneity_score``,
    ``completeness_score``, ``adjusted_mutual_info_score`` and ``em.bic``.
    Each series is then plotted against the component count and saved as a
    PNG inside ``folder``.

    Args:
        X: Feature data used for fitting, prediction and scoring.
        y: Reference labels used by the homogeneity, completeness and AMI
            scores.
        title: Text appended to every chart title.
        folder: Directory that receives the PNG files. An empty string (the
            default) now means the current working directory; previously the
            path was built as ``folder + '/NAME.png'``, which pointed at the
            filesystem root when ``folder`` was empty.

    Returns:
        None. The function only writes image files.
    """
    # Local import so this function does not depend on the module header.
    import os

    cluster_range = list(np.arange(2, 11, 1))
    sil_scores = []
    homo_scores = []
    completeness_scores = []
    sse_scores = []
    ami_scores = []
    bic_scores = []

    for k in cluster_range:
        em = EM(n_components=k).fit(X)
        em_labels = em.predict(X)
        sil_scores.append(sil_score(X, em_labels))
        # NOTE(review): plotted under the label 'SSE Score'; if EM is
        # sklearn's GaussianMixture this is a log-likelihood, not an SSE.
        # Confirm before relabelling.
        sse_scores.append(em.score(X))
        homo_scores.append(homogeneity_score(y, em_labels))
        completeness_scores.append(completeness_score(y, em_labels))
        ami_scores.append(adjusted_mutual_info_score(y, em_labels))
        bic_scores.append(em.bic(X))

    # (series, y-axis label, title prefix, output file name), in the order
    # the charts are written.
    charts = (
        (sil_scores, 'Avg Silhouette Score',
         'Silhouette Score for EM: ', 'EMSIL.png'),
        (homo_scores, 'Homogeneity Score',
         'Homogeneity Scores EM: ', 'EMHOMOGENEITY.png'),
        (completeness_scores, 'Completeness Score',
         'Completeness Score for EM: ', 'EMCompletness.png'),
        (sse_scores, 'SSE Score',
         'SSE Scores EM: ', 'EMSSE.png'),
        (ami_scores, 'AMI Score',
         'Adjusted Mutual Information Scores EM: ', 'EMAMI.png'),
        # The BIC chart used to carry the y-axis label 'AMI Score'.
        (bic_scores, 'BIC Score',
         'BIC Scores EM: ', 'EMBIC.png'),
    )
    for values, ylabel, heading, filename in charts:
        plt.plot(cluster_range, values)
        plt.xlabel('No. Components')
        plt.ylabel(ylabel)
        plt.title(heading + title)
        plt.savefig(os.path.join(folder, filename))
        plt.close()
コード例 #3
0
def run_kmeans(X, y, title):
    """Fit k-means over a range of cluster counts and show four curves.

    For every even k from 2 to 48 a ``KMeans`` model (``n_init=10``,
    ``random_state=100``) is fitted on ``X`` and timed with
    ``timeit.default_timer``. Four figures are then displayed with
    ``plt.show()``, each plotted against k: ``sil_score``,
    ``homogeneity_score``, ``f1_score`` and the fit time in seconds.

    Args:
        X: Feature data passed to ``KMeans.fit`` and ``sil_score``.
        y: Reference labels used for the homogeneity and F1 scores.
        title: Text appended to every figure title.

    Returns:
        None. The function only displays figures.
    """
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []

    for k in kclusters:
        start_time = timeit.default_timer()
        # n_jobs is deliberately not passed: assuming KMeans is
        # sklearn.cluster.KMeans, the argument was deprecated in 0.23 and
        # removed in 1.0, where passing it raises TypeError.
        km = KMeans(n_clusters=k, n_init=10, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        sil_scores.append(sil_score(X, km.labels_))
        # presumably maps each cluster to its majority label in y; verify
        # against cluster_predictions, which is defined elsewhere.
        y_mode_vote = cluster_predictions(y, km.labels_)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, km.labels_))

    # (series, y-axis label, title prefix), in display order.
    charts = (
        (sil_scores, 'Avg Silhouette Score', 'Elbow Plot for KMeans: '),
        (homo_scores, 'Homogeneity Score', 'Homogeneity Scores KMeans: '),
        (f1_scores, 'F1 Score', 'F1 Scores KMeans: '),
        (train_times, 'Training Time (s)', 'KMeans Training Time: '),
    )
    for values, ylabel, heading in charts:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(kclusters, values)
        plt.grid(True)
        plt.xlabel('No. Clusters')
        plt.ylabel(ylabel)
        plt.title(heading + title)
        plt.show()
コード例 #4
0
ファイル: code.py プロジェクト: rishub/unsupervised-leaerning
def run_EM(X, y, title):
    """Fit an ``EM`` model over a range of component counts and show curves.

    For k = 2, 7, 12, ..., 97 an ``EM`` model with diagonal covariance is
    fitted on ``X``. Four figures are then displayed with ``plt.show()``, each
    plotted against k: ``sil_score``, ``homogeneity_score``, ``f1_score``,
    and AIC together with BIC.

    The second figure used to be an exact repeat of the silhouette figure
    while the collected homogeneity scores were never plotted. It now shows
    the homogeneity curve.

    Args:
        X: Feature data used for fitting, prediction and scoring.
        y: Reference labels used for the F1 and homogeneity scores.
        title: Text prepended to every figure title.

    Returns:
        None. The function only displays figures.
    """
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        # presumably maps each cluster to its majority label in y; verify
        # against cluster_predictions, which is defined elsewhere.
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # Silhouette versus number of components.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    # Homogeneity versus number of components (was a duplicate silhouette
    # figure).
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Homogeneity')
    plt.title(title + ' Exp Max Homogeneity')
    plt.show()

    # F1 versus number of components.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + 'Exp Max F1')
    plt.show()

    # AIC and BIC on shared axes.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + 'Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
コード例 #5
0
def kmeans_experiment(X, y, title, folder=""):
    """Fit ``KMeans`` for 2..39 clusters and save one chart per metric.

    For each cluster count the model is fitted on ``X`` and five values are
    recorded: ``km.inertia_``, ``sil_score``, ``homogeneity_score``,
    ``completeness_score`` and ``adjusted_mutual_info_score``. Each series is
    then plotted against the cluster count and saved as a PNG inside
    ``folder``. ``title`` and every k are printed as progress output.

    Args:
        X: Feature data used for fitting, prediction and scoring.
        y: Reference labels used by the homogeneity, completeness and AMI
            scores.
        title: Text printed at the start and appended to every chart title.
        folder: Directory that receives the PNG files. An empty string (the
            default) now means the current working directory; previously the
            path was built as ``folder + '/NAME.png'``, which pointed at the
            filesystem root when ``folder`` was empty.

    Returns:
        None. The function only prints progress and writes image files.
    """
    # Local import so this function does not depend on the module header.
    import os

    cluster_range = list(np.arange(2, 40, 1))
    sil_scores = []
    homo_scores = []
    sse_scores = []
    ami_scores = []
    completeness_scores = []

    print(title)
    for k in cluster_range:
        print(k)
        km = KMeans(n_clusters=k).fit(X)
        km_labels = km.predict(X)
        sse_scores.append(km.inertia_)
        sil_scores.append(sil_score(X, km_labels))
        homo_scores.append(homogeneity_score(y, km_labels))
        completeness_scores.append(completeness_score(y, km_labels))
        ami_scores.append(adjusted_mutual_info_score(y, km_labels))

    # (series, y-axis label, title prefix, output file name), in the order
    # the charts are written.
    charts = (
        (sil_scores, 'Avg Silhouette Score',
         'Silhouette Score for KMeans: ', 'KMSIL.png'),
        (homo_scores, 'Homogeneity Score',
         'Homogeneity Scores KMeans: ', 'KMHOMOGENEITY.png'),
        (sse_scores, 'SSE Score',
         'SSE Scores KMeans: ', 'KMSSE.png'),
        (ami_scores, 'AMI Score',
         'Adjusted Mutual Information Scores KMeans: ', 'KMAMI.png'),
        (completeness_scores, 'Completeness Score',
         'Completeness Scores KMeans: ', 'KMCompleteness.png'),
    )
    for values, ylabel, heading, filename in charts:
        plt.plot(cluster_range, values)
        plt.xlabel('No. Clusters')
        plt.ylabel(ylabel)
        plt.title(heading + title)
        plt.savefig(os.path.join(folder, filename))
        plt.close()
コード例 #6
0
def run_EM(X, y, title):
    """Sweep the component count of a spherical ``EM`` model and show curves.

    For k = 2, 7, 12, ..., 97 the model is fitted on ``X`` (timed with
    ``timeit.default_timer``), then ``sil_score``, ``em.aic`` and ``em.bic``
    are recorded. Two figures are displayed with ``plt.show()``: silhouette
    versus k, and AIC together with BIC versus k.

    Args:
        X: Feature data used for fitting, prediction and scoring.
        y: Accepted but not read by this function.
        title: Text appended to both figure titles.

    Returns:
        None. The function only displays figures.
    """
    component_counts = list(np.arange(2, 100, 5))
    silhouettes, fit_seconds, aic_values, bic_values = [], [], [], []

    for n_components in component_counts:
        tic = timeit.default_timer()
        model = EM(
            n_components=n_components,
            covariance_type='spherical',
            random_state=100,
        ).fit(X)
        toc = timeit.default_timer()
        # Fit durations are collected but not plotted by this function.
        fit_seconds.append(toc - tic)

        assignments = model.predict(X)
        silhouettes.append(sil_score(X, assignments))
        aic_values.append(model.aic(X))
        bic_values.append(model.bic(X))

    # Figure 1: silhouette score against the number of components.
    silhouette_axes = plt.figure().add_subplot(111)
    silhouette_axes.plot(component_counts, silhouettes)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for EM: ' + title)
    plt.show()

    # Figure 2: AIC and BIC on shared axes.
    criteria_axes = plt.figure().add_subplot(111)
    for series, name in ((aic_values, 'AIC'), (bic_values, 'BIC')):
        criteria_axes.plot(component_counts, series, label=name)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
コード例 #7
0
def run_kmeans(X, y, title):
    """Fit k-means over a range of cluster counts and show the silhouette curve.

    For every even k from 2 to 48 a ``KMeans`` model (``n_init=10``,
    ``random_state=100``) is fitted on ``X`` and ``sil_score`` of the fitted
    labels is recorded. One figure of score versus k is displayed with
    ``plt.show()``.

    Args:
        X: Feature data passed to ``KMeans.fit`` and ``sil_score``.
        y: Accepted but not read by this function.
        title: Text appended to the figure title.

    Returns:
        None. The function only displays a figure.
    """
    kclusters = list(np.arange(2, 50, 2))
    sil_scores = []

    for k in kclusters:
        # n_jobs is deliberately not passed: assuming KMeans is
        # sklearn.cluster.KMeans, the argument was deprecated in 0.23 and
        # removed in 1.0, where passing it raises TypeError.
        km = KMeans(n_clusters=k, n_init=10, random_state=100).fit(X)
        sil_scores.append(sil_score(X, km.labels_))

    # Silhouette score against the number of clusters.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kclusters, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Clusters')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for KMeans: ' + title)
    plt.show()
コード例 #8
0
def run_EM(X,y,title):
    """Sweep the component count of a diagonal ``EM`` model and show curves.

    For k = 2, 4, ..., 18 the model is fitted on ``X`` (timed with
    ``timeit.default_timer``), then ``sil_score``, ``f1_score``,
    ``homogeneity_score``, ``em.aic`` and ``em.bic`` are recorded. Four
    figures are displayed with ``plt.show()``: silhouette, homogeneity, F1,
    and AIC together with BIC, each against k.

    Args:
        X: Feature data used for fitting, prediction and scoring.
        y: Reference labels used for the F1 and homogeneity scores.
        title: Text appended to every figure title.

    Returns:
        None. The function only displays figures.
    """
    component_counts = list(np.arange(2,20,2))
    silhouettes, f1_values, homogeneities = [], [], []
    fit_seconds, aic_values, bic_values = [], [], []

    for n_components in component_counts:
        tic = timeit.default_timer()
        model = EM(
            n_components=n_components,
            covariance_type='diag',
            n_init=1,
            warm_start=True,
            random_state=100,
        ).fit(X)
        toc = timeit.default_timer()
        # Fit durations are collected but not plotted by this function.
        fit_seconds.append(toc - tic)

        assignments = model.predict(X)
        silhouettes.append(sil_score(X, assignments))
        # presumably maps each cluster to its majority label in y; verify
        # against cluster_predictions, which is defined elsewhere.
        voted_labels = cluster_predictions(y, assignments)
        f1_values.append(f1_score(y, voted_labels))
        homogeneities.append(homogeneity_score(y, assignments))
        aic_values.append(model.aic(X))
        bic_values.append(model.bic(X))

    # Single-series figures, in display order:
    # (series, y-axis label, title prefix).
    single_series_figures = [
        (silhouettes, 'Avg Silhouette Score', 'Elbow Plot for EM: '),
        (homogeneities, 'Homogeneity Score', 'Homogeneity Scores EM: '),
        (f1_values, 'F1 Score', 'F1 Scores EM: '),
    ]
    for series, y_label, heading in single_series_figures:
        axes = plt.figure().add_subplot(111)
        axes.plot(component_counts, series)
        plt.grid(True)
        plt.xlabel('No. Distributions')
        plt.ylabel(y_label)
        plt.title(heading + title)
        plt.show()

    # AIC and BIC on shared axes.
    criteria_axes = plt.figure().add_subplot(111)
    for series, name in ((aic_values, 'AIC'), (bic_values, 'BIC')):
        criteria_axes.plot(component_counts, series, label=name)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
コード例 #9
0
    def __do_perform(self, custom_out=None, main_experiment=None):
        """Run the clustering experiment and write every result table to CSV.

        Three stages run in order:

        1. For each k in ``self._clusters`` a ``kmeans`` and a ``GMM`` model
           are fitted on ``self._details.ds.training_x``. Mean and per-sample
           silhouette values, ``km.score``, ``gmm.score``, ``gmm.bic``,
           ``cluster_acc`` and ``ami`` are collected and written to CSV.
        2. Two grid searches are run through ``self.gs_with_best_estimator``
           over ``Pipeline``s that chain a clusterer with an ``MLPClassifier``.
           Their ``cv_results_`` are written to CSV.
        3. A ``TSNE`` embedding of the training features is stored on the
           dataset as ``training_x2D`` and written to CSV with the labels.

        Every output path is built with ``self._out.format(<file name>)``, so
        ``self._out`` is used as a format template.

        Args:
            custom_out: When not None, replaces ``self._out`` for this run; the
                previous value is kept in ``self._old_out``. When None and
                ``self._old_out`` is set, ``self._out`` is restored from it.
            main_experiment: When not None, its ``experiment_name()`` is
                included in the start-up log message. Not used otherwise.

        Returns:
            None.
        """
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out
            self._out = custom_out
        elif self._old_out is not None:
            # No override this time: go back to the output template that was
            # saved by an earlier call with custom_out.
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(self.experiment_name(), main_experiment.experiment_name()))
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        # Per-k accumulators; each is converted to a DataFrame after the loop.
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        # One row per (k, algorithm, sample): 2 algorithms * len(clusters) * n_samples rows.
        # Columns are k, algorithm name, silhouette value, cluster label; dtype '<U21' stores every cell as text.
        sil_s = np.empty(shape=(2*len(self._clusters)*self._details.ds.training_x.shape[0],4), dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        # The same two estimator objects are reused for every k via set_params.
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0  # next free row in sil_s
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(self._details.ds.training_x)
            gmm.fit(self._details.ds.training_x)

            km_labels = km.predict(self._details.ds.training_x)
            gmm_labels = gmm.predict(self._details.ds.training_x)

            # Mean silhouette per algorithm.
            sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            # Per-sample silhouette values, copied row by row into sil_s.
            km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
            gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            # Stored as one-element lists so each k becomes one DataFrame column.
            # km.score is negated below before being written as the 'sse' column.
            sse[k] = [km.score(self._details.ds.training_x)]
            ll[k] = [gmm.score(self._details.ds.training_x)]
            bic[k] = [gmm.bic(self._details.ds.training_x)]

            # cluster_acc and ami are defined elsewhere; both compare the
            # training labels with the cluster assignments.
            acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)

            adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            # Elapsed time is cumulative since the loop started, not per k.
            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        # Transpose so that each k is a row; the sign of km.score is flipped here.
        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        # Write every table; file names are prefixed with the dataset name.
        sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
        ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
        sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
        adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        # Grid over cluster count and MLP hyper-parameters for a kmeans -> MLP pipeline.
        grid = {'km__n_clusters': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        # NOTE(review): if kmeans is sklearn.cluster.KMeans, n_jobs was removed in scikit-learn 1.0 - confirm the pinned version.
        km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
        pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
        # gs_with_best_estimator is defined elsewhere; only its first return value is used here.
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

        # Same search with a CustomGMM step in place of kmeans.
        grid = {'gmm__n_components': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

        # %% For chart 4/5
        # The embedding is stored on the dataset object as training_x2D.
        self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform(
            self._details.ds.training_x
        )

        # Append the labels as a third column next to the two embedding columns.
        ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D, np.atleast_2d(self._details.ds.training_y).T)),
                             columns=['x', 'y', 'target'])
        ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))
        self.log("Done")
コード例 #10
0
def vis(X, y, nameappendix, k):
    """Show a silhouette plot and a 2-D scatter for a k-cluster ``KMeans`` fit.

    ``X`` is rescaled with ``MinMaxScaler(feature_range=[0, 100])`` and
    clustered with ``KMeans(n_clusters=k, random_state=10)``. The cluster
    count, the NMI score against ``y`` and the mean silhouette score are
    printed. A two-panel figure is then displayed: per-cluster silhouette
    bands on the left, and a scatter of columns 3 and 5 of the rescaled data
    with the cluster centres on the right.

    Args:
        X: Feature data; it must have at least six columns because columns 3
            and 5 are plotted.
        y: Reference labels used only for the printed NMI score.
        nameappendix: Accepted but not read by this function.
        k: Number of clusters.

    Returns:
        None. The function only prints and displays a figure.
    """
    rescaler = MinMaxScaler(feature_range=[0,100])
    rescaler.fit(X)
    X = pd.DataFrame(rescaler.transform(X))

    fig, (silhouette_ax, scatter_ax) = plt.subplots(1, 2)
    fig.set_size_inches(15, 6)
    silhouette_ax.set_xlim([-0.1, 1])
    # Leave a 10-unit gap around every cluster band on the y axis.
    silhouette_ax.set_ylim([0, len(X) + (k + 1) * 10])

    print("Num of clusters: ", k)
    model = KMeans(n_clusters = k, random_state = 10).fit(X)
    labels = model.labels_
    print("NMI score: %.5f" % normalized_mutual_info_score(y, labels))

    mean_silhouette = sil_score(X, labels)
    print("Silhouette score: ", mean_silhouette)
    per_sample_silhouette = silhouette_samples(X, labels)

    spectral = plt.get_cmap('Spectral')

    # Left panel: one filled band per cluster, values sorted ascending.
    band_bottom = 10
    for cluster_id in range(k):
        band_values = np.sort(per_sample_silhouette[labels == cluster_id])
        band_height = band_values.shape[0]
        band_top = band_bottom + band_height

        shade = spectral(float(cluster_id) / k)
        silhouette_ax.fill_betweenx(np.arange(band_bottom, band_top), 0, band_values,
                                    facecolor=shade, edgecolor=shade, alpha=0.7)

        # Cluster number at the vertical middle of its band.
        silhouette_ax.text(-0.05, band_bottom + 0.5 * band_height, str(cluster_id))

        # Next band starts 10 units above this one.
        band_bottom = band_top + 10

    silhouette_ax.set_title("Silhouette Coefficients for Clusters.")
    silhouette_ax.set_xlabel("Silhouette Coefficient Values")
    silhouette_ax.set_ylabel("Cluster Labels")

    # Dashed red line at the mean silhouette score.
    silhouette_ax.axvline(x=mean_silhouette, color="red", linestyle="--")

    silhouette_ax.set_yticks([])
    silhouette_ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: samples coloured by cluster, plotted on columns 3 and 5.
    point_colors = spectral(labels.astype(float) / k)
    scaled_values = X.values
    scatter_ax.scatter(scaled_values[:, 3], scaled_values[:, 5], marker='.', s=30, lw=0,
                       alpha=0.7, c=point_colors, edgecolor='k')

    centers = model.cluster_centers_

    # White discs at the cluster centres, then the cluster number on top.
    scatter_ax.scatter(centers[:, 3], centers[:, 5], marker='o', c="white", alpha=1, s=200, edgecolor='k')
    for center_id, center in enumerate(centers):
        scatter_ax.scatter(center[3], center[5], marker='$%d$' % center_id, alpha=1, s=50, edgecolor='k')

    scatter_ax.set_title("Clustering Visualization")
    # NOTE(review): the labels name X4 and X5 while column indices 3 and 5 are
    # plotted - confirm which columns are intended.
    scatter_ax.set_xlabel("1st feature: Pressure X4")
    scatter_ax.set_ylabel("2nd feature: Pressure X5")

    plt.suptitle("Analysis for KMeans for " + str(k) + " Clusters", fontsize=14, fontweight='bold')
    plt.show()
コード例 #11
0
ファイル: clustering.py プロジェクト: yifanguo247/CS7641
    def __do_perform(self,
                     custom_out=None,
                     main_experiment=None
                     ):  # example values: './output/ICA/clustering//{}', ICAExperiment
        """Run the clustering experiment and write every result table to CSV.

        Three stages run in order:

        1. For each k in ``self._clusters`` a ``kmeans`` and a ``GMM`` model
           are fitted on ``self._details.ds.training_x``. Mean and per-sample
           silhouette values, ``km.score``, ``gmm.score``, ``gmm.bic``,
           ``cluster_acc`` and ``ami`` are collected and written to CSV.
        2. Two grid searches are run through ``self.gs_with_best_estimator``
           over ``Pipeline``s that chain a clusterer with an ``MLPClassifier``.
           Their ``cv_results_`` are written to CSV.
        3. A ``TSNE`` embedding of the training features is stored on the
           dataset as ``training_x2D`` and written to CSV with the labels.

        Every output path is built with ``self._out.format(<file name>)``, so
        ``self._out`` is used as a format template.

        Args:
            custom_out: When not None, replaces ``self._out`` for this run; the
                previous value is kept in ``self._old_out``. When None and
                ``self._old_out`` is set, ``self._out`` is restored from it.
            main_experiment: When not None, its ``experiment_name()`` is
                included in the start-up log message. Not used otherwise.

        Returns:
            None.
        """
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out  # e.g. './output/ICA/{}'
            self._out = custom_out  # e.g. './output/ICA/clustering//{}'
        elif self._old_out is not None:
            # No override this time: go back to the output template that was
            # saved by an earlier call with custom_out.
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(),
                main_experiment.experiment_name()))  # e.g. 'clustering', 'ICA'
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        # Per-k accumulators; each is converted to a DataFrame after the loop.
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        # One row per (k, algorithm, sample): 2 algorithms * len(clusters) *
        # n_samples rows. Columns are k, algorithm name, silhouette value and
        # cluster label; dtype '<U21' stores every cell as text.
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        # The same two estimator objects are reused for every k via set_params.
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0  # next free row in sil_s
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(
                self._details.ds.training_x
            )  # fit k-means with k clusters on the training features
            gmm.fit(
                self._details.ds.training_x
            )  # fit the mixture model with k components on the same features

            km_labels = km.predict(
                self._details.ds.training_x
            )  # cluster assignment for every training row
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(
                self._details.ds.training_x, km_labels
            )  # single silhouette value for this k and algorithm
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(
                self._details.ds.training_x, km_labels
            )  # one silhouette value per training row
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [
                    k, 'Kmeans', round(x, 6), km_labels[i]
                ]  # row: k, algorithm, rounded silhouette of sample i, its cluster label km_labels[i]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            # Stored as one-element lists so each k becomes one DataFrame column.
            sse[k] = [
                km.score(self._details.ds.training_x)
            ]  # km.score output; negated below before being written as the 'sse' column
            ll[k] = [gmm.score(self._details.ds.training_x)
                     ]  # gmm.score output; written under the 'log-likelihood' column
            bic[k] = [
                gmm.bic(self._details.ds.training_x)
            ]  # gmm.bic output; written under the 'BIC' column

            acc[k]['Kmeans'] = cluster_acc(
                self._details.ds.training_y, km_labels
            )  # cluster_acc is defined elsewhere; presumably accuracy of predicting each cluster's majority label - verify
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(
                self._details.ds.training_y, km_labels
            )  # ami is defined elsewhere; presumably adjusted mutual information between true labels and clusters - verify
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            # Elapsed time is cumulative since the loop started, not per k.
            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        # Transpose so that each k is a row; the sign of km.score is flipped here.
        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)
                       ]  # e.g. 'Bank sse (left)'

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]  # e.g. 'Bank log-likelihood'

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)
                       ]  # e.g. 'Bank BIC'

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        # Write every table; file names are prefixed with the dataset name.
        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        # Grid over cluster count and MLP hyper-parameters for a
        # kmeans -> MLP pipeline.
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        # NOTE(review): if kmeans is sklearn.cluster.KMeans, n_jobs was removed
        # in scikit-learn 1.0 - confirm the pinned version.
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline(
            [('km', km), ('NN', mlp)], memory=experiments.pipeline_memory
        )  # NOTE(review): what the 'km' step passes to the NN depends on the kmeans class's transform - confirm
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='kmeans')  # helper defined elsewhere; only its first return value is used
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name))
        )  # grid search results, e.g. bank_cluster_kmeans.csv

        # Same search with a CustomGMM step in place of kmeans.
        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='gmm')  # helper defined elsewhere; only its first return value is used
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name))
        )  # grid search results, e.g. bank_cluster_GMM.csv

        # %% For chart 4/5
        # TSNE embedding of the training features, stored on the dataset
        # object as training_x2D.
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(
            np.hstack((self._details.ds.training_x2D,
                       np.atleast_2d(self._details.ds.training_y).T)),
            columns=['x', 'y', 'target']
        )  # the two embedding columns plus the training labels as 'target'
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(
                self._details.ds_name)))  # e.g. bank_2D.csv
        self.log("Done")
コード例 #12
0
            print('Now processing {} data with {} using {} clusters...'.format(ds, r, k))
            data_st = clock()

            # fit the credit data
            km.fit(dataX)
            km_labels = km.predict(dataX)  

            gmm.fit(dataX)
            gmm_labels = gmm.predict(dataX)

            # save the labels
            labels[k]['Kmeans'] = km_labels
            labels[k]['GMM'] = gmm_labels

            sil[k]['Kmeans'] = sil_score(dataX, km_labels)
            sil[k]['GMM'] = sil_score(dataX, gmm_labels)
            km_sil_samples = sil_samples(dataX, km_labels)
            gmm_sil_samples = sil_samples(dataX, gmm_labels)
            for i, x in enumerate(km_sil_samples):
                sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1
            sse[k] = km.score(dataX)
            ll[k] = gmm.score(dataX)
            bic[k] = gmm.bic(dataX)
            acc[k]['Kmeans'] = cluster_acc(dataY,km.predict(dataX))
            acc[k]['GMM'] = cluster_acc(dataY,gmm.predict(dataX))
            adj_mi[k]['Kmeans'] = ami(dataY,km.predict(dataX))
コード例 #13
0
        nmf_array = nmf.fit_transform(x_tfidf)

        # Normalize the nmf array
        nmf_array = normalizer.fit_transform(nmf_array)

        # NMF labels
        labels = [np.argmax(x) for x in nmf_array]

        # Weighted matrix of similarities
        weighted_matrix = nmf_array.dot(nmf_array.T)

        # ------ Silhouette coefficient of dissimilarities ---- #

        dissim = np.ones(weighted_matrix.shape) - weighted_matrix

        sil = sil_score(dissim, labels, metric = 'precomputed')

        # --------------------- Modularity -------------------- #

        weighted_matrix_graph = deepcopy(weighted_matrix)
        for i in range(len(weighted_matrix_graph)):
             weighted_matrix_graph[i][i] = 0.00

        graph = igraph.Graph.Weighted_Adjacency(list(weighted_matrix_graph),\
                                                 mode = igraph.ADJ_MAX)

        weights = [es['weight'] for es in graph.es]

        mod = graph.modularity(labels, weights = weights)

        # ---------------- Weak merit factor ------------ #
コード例 #14
0
def cluster(cluster_range, dataset, dir):
    """Run k-means and Gaussian-mixture (EM) clustering for every k and save metrics.

    For each ``k`` in ``cluster_range`` both models are fitted on
    ``dataset.x`` with ``random_state=0``. Per-k accuracy (via
    ``common_utils.get_cluster_accuracy``), fit time, silhouette score,
    k-means score, GMM log-likelihood, GMM BIC and adjusted mutual
    information are collected. The tabular metrics are written as CSV
    files and then plotted through the ``common_utils`` helpers.

    Args:
        cluster_range: Iterable of cluster / component counts to evaluate.
        dataset: Object exposing ``x`` (features), ``y`` (labels) and
            ``dataset_name`` (used in column labels and file names).
        dir: Path prefix that is string-concatenated with each output
            file name, so it needs its own trailing separator.

    Side effects:
        Rebinds the module-level names ``start``, ``end``,
        ``kmeans_accuracy_for_k``, ``em_prediction_y`` and
        ``em_pca_accuracy_for_k``; writes five CSV files under ``dir``;
        calls ``nn_experiment(dataset)`` and the plotting helpers.
    """
    global start, kmeans_accuracy_for_k, end, em_prediction_y, em_pca_accuracy_for_k
    kmeans_accuracy, em_accuracy, kmeans_timetaken, em_timetaken = {}, {}, {}, {}
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))

    for k in cluster_range:
        km = KMeans(n_clusters=k, random_state=0)
        gmm = GaussianMixture(n_components=k, random_state=0)

        # k-means clustering. The elapsed time is captured immediately with
        # its own timestamp: previously the shared start/end pair was
        # overwritten by the EM run before being read, so the k-means timing
        # actually reported the EM duration.
        kmeans_start = datetime.now()
        kmeans_predicted_y = km.fit_predict(dataset.x)
        kmeans_timetaken[k] = (datetime.now() - kmeans_start).total_seconds()

        # EM clustering. The global start/end keep holding the EM timestamps,
        # which is the state the original code left them in.
        start = datetime.now()
        em_prediction_y = gmm.fit_predict(dataset.x)
        end = datetime.now()
        em_timetaken[k] = (end - start).total_seconds()

        # Accuracy of the cluster assignments against the true labels.
        kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, kmeans_predicted_y)
        kmeans_accuracy[k] = kmeans_accuracy_for_k

        em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, em_prediction_y)
        em_accuracy[k] = em_pca_accuracy_for_k

        # Metrics that are exported to CSV and plotted below.
        sil[k]['Kmeans'] = sil_score(dataset.x, kmeans_predicted_y)
        sil[k]['GMM'] = sil_score(dataset.x, em_prediction_y)

        # Wrapped in one-element lists so pd.DataFrame gets one column per k.
        sse[k] = [km.score(dataset.x)]
        ll[k] = [gmm.score(dataset.x)]
        bic[k] = [gmm.bic(dataset.x)]

        adj_mi[k]['Kmeans'] = ami(dataset.y, kmeans_predicted_y)
        adj_mi[k]['GMM'] = ami(dataset.y, em_prediction_y)

    name = dataset.dataset_name

    # The sign of km.score is flipped before export (presumably because
    # scikit-learn's KMeans.score returns the negated objective -- confirm
    # that KMeans here is the scikit-learn estimator).
    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(name)]

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(name)]

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(name)]

    sil = pd.DataFrame(sil).T
    adj_mi = pd.DataFrame(adj_mi).T

    sil.index.name = 'k'
    adj_mi.index.name = 'k'

    # File names are kept exactly as before (including the "logliklihood"
    # spelling) because the same paths are read back by the plot helpers.
    sse_path = dir + '{}_sse.csv'.format(name)
    ll_path = dir + '{}_logliklihood.csv'.format(name)
    bic_path = dir + '{}_bic.csv'.format(name)
    sil_path = dir + '{}_sil_score.csv'.format(name)
    adj_mi_path = dir + '{}_adj_mi.csv'.format(name)

    sse.to_csv(sse_path)
    ll.to_csv(ll_path)
    bic.to_csv(bic_path)
    sil.to_csv(sil_path)
    adj_mi.to_csv(adj_mi_path)

    # The result is not used in this block; the call is kept for whatever
    # work nn_experiment performs.
    neural_net_score = nn_experiment(dataset)

    common_utils.plot_clustering_accuracy(kmeans_accuracy,
                                          "k-means - clusters vs Accuracy",
                                          dir)
    common_utils.plot_clustering_time(kmeans_timetaken,
                                      "k-means - clusters vs Time", dir)
    common_utils.plot_clustering_accuracy(em_accuracy,
                                          "EM clusters - vs Accuracy", dir)
    common_utils.plot_clustering_time(em_timetaken, "EM clusters - vs Time",
                                      dir)

    common_utils.read_and_plot_sse('Clustering', sse_path, dir)
    common_utils.read_and_plot_loglikelihood('Clustering', ll_path, dir)
    common_utils.read_and_plot_bic('Clustering', bic_path, dir)
    common_utils.read_and_plot_sil_score('Clustering', sil_path, dir)
    common_utils.read_and_plot_adj_mi('Clustering', adj_mi_path, dir)