Example n. 1
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_mixtures(X,mag,mbins,binwidth=0.2,seed=None,
                 keepscore=False,keepbic=False,**kwargs):
    kwargs.setdefault('n_components',25)
    kwargs.setdefault('covariance_type','full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed is not None:
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where( np.abs(mag-bincenter) < binwidth )[0]
        if False:
            print('{:.2f}: {} qsos'.format(bincenter,len(ii)))
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
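# --- Hedged usage sketch (not part of the original snippet): synthetic inputs
# with the shapes fit_mixtures expects. X holds the feature vectors, mag the
# per-object magnitude used for binning, and mbins the bin centers.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(2000, 4))            # (n_samples, n_features)
mag_demo = rng.uniform(17.0, 21.0, size=2000)  # one magnitude per object
mbins_demo = np.arange(17.2, 21.0, 0.4)        # bin centers; binwidth=0.2 keeps bins contiguous
fits_demo, scores_demo = fit_mixtures(X_demo, mag_demo, mbins_demo,
                                      keepscore=True, n_components=5)
print(len(fits_demo), 'per-bin GMMs; mean score:', np.mean(scores_demo))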
Example n. 2
    X_train = datatrans[train_index]
    y_train = np.array(target[train_index])
    X_test = datatrans[test_index]
    y_test = target[test_index]
    # find the best number of mixture components using BIC across covariance types
    optimal = []
    for s in range(2, 6):  # a larger range increases runtime because the dataset is fairly large
        for cov_type in ['spherical', 'diag', 'tied', 'full']:
            models = GaussianMixture(n_components=s,
                                     covariance_type=cov_type,
                                     max_iter=150,
                                     n_init=20,
                                     random_state=500).fit(X_train)
            bic = models.bic(datatrans)  # BIC evaluated on the full transformed dataset
            optimal.append([bic, cov_type, s])
    # you could plot these results as a line chart here (at the cost of extra runtime)

    final_components = min(optimal)  # the lowest BIC indicates the best fit

    n_classes = final_components[2]
    reg_ = [0.0001, 0.008, 0.05, 0.1, 0.2, 0.3,
            0.5]  # different reg_covar values to regularize the covariance
    param_ = ['random', 'kmeans']
    # Try GMMs using different types of covariances.
    for u in (reg_):
        for param in (param_):
            estimators = {
                cov_type:
                GaussianMixture(n_components=n_classes,
Example n. 3
Cancer_EM_bic = []
Cancer_EM_aic = []
Cancer_EM_score = []
Cancer_EM_homogeneity_score = []
Cancer_EM_complete_score = []
Cancer_EM_log = []
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []



for i in n_components:
    print(i)
    EM.set_params(random_state=7641,n_components=i)
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(silhouette_score(Cancer_X_train,EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(homogeneity_score(Cancer_y_train,EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(completeness_score(Cancer_y_train,EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM, Cancer_X_train, Cancer_y_train, cv=5, scoring=make_scorer(my_custom_acc, greater_is_better=True), n_jobs=-1, return_train_score=True)
    Cancer_EM_train_acc.append(np.mean(Cancer_scores['train_score']))
    Cancer_EM_cv_acc.append(np.mean(Cancer_scores['test_score']))

    
PlotEm(6,n_components,Cancer_EM_aic,'AIC','Cancer')
PlotEm(7,n_components,Cancer_EM_bic,'BIC','Cancer')
PlotEm(8,n_components,Cancer_EM_score,'Log-Likelihood','Cancer')
PlotEm(9,n_components,Cancer_EM_log,'Silhouette','Cancer')
PlotEm(10,n_components,Cancer_EM_homogeneity_score,'homogeneity_score','Cancer')
PlotEm(11,n_components,Cancer_EM_complete_score,'complete_score','Cancer')
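# Hedged follow-up (not in the original assignment code): report the component
# count that minimizes BIC over the sweep above.
best_k = list(n_components)[int(np.argmin(Cancer_EM_bic))]
print('Lowest BIC at n_components =', best_k)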
Example n. 4
# K-fold crossvalidation
CV = model_selection.KFold(n_splits=10, shuffle=True)

for t, K in enumerate(KRange):
    print('Fitting model for K={0}'.format(K))

    # Fit Gaussian mixture model
    gmm = GaussianMixture(n_components=K,
                          covariance_type=covar_type,
                          n_init=reps,
                          init_params=init_procedure,
                          tol=1e-6,
                          reg_covar=1e-6).fit(X)

    # Get BIC and AIC
    BIC[t, ] = gmm.bic(X)
    AIC[t, ] = gmm.aic(X)

    # For each crossvalidation fold
    for train_index, test_index in CV.split(X):

        # extract training and test set for current CV fold
        X_train = X[train_index]
        X_test = X[test_index]

        # Fit Gaussian mixture model to X_train
        gmm = GaussianMixture(n_components=K,
                              covariance_type=covar_type,
                              n_init=reps).fit(X_train)

        # compute negative log likelihood of X_test
Example n. 5
fn = open(args.outroot + '.nzgmm.json', 'w')
for sel in sels:
    print(sel)
    #
    bic, lowest_bic = [], np.infty
    n_components_range = range(1, args.ngauss)
    #
    z = data['zphot'][data['is' + sel]]
    print(sel, len(z), 'obj.')
    # gmm
    X = z.reshape((len(z), 1))
    for n_components in range(1, args.ngauss):
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type='diag')
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
    bic = np.array(bic)
    clf = best_gmm
    ## rounding
    ps = np.round(clf.weights_, 3).tolist()
    mus = np.round(clf.means_.flatten(), 3).tolist()
    sds = np.round(np.sqrt(clf.covariances_.flatten()), 3).tolist()
    mydict[sel] = {}
    mydict[sel]['p'], mydict[sel]['mu'], mydict[sel]['sd'] = ps, mus, sds
    ## plotting
    fig, ax = plt.subplots()
    nzraw = np.array([((z >= zgrid[i]) & (z < zgrid[i + 1])).sum()
                      for i in range(nbins)])
Example n. 6
#Put numbers back to original shape so we can reconstruct segmented image
original_shape = img.shape
segmented = gmm_labels.reshape(original_shape[0], original_shape[1])
plt.imshow(segmented)
#cv2.imwrite("images/segmented.jpg", segmented)
##############################################################
#How to know the best number of components?
#Using Bayesian information criterion (BIC) to find the best number of components
import numpy as np
import cv2
import matplotlib.pyplot as plt

img = cv2.imread("images/BSE.tif")
img2 = img.reshape((-1, 3))

from sklearn.mixture import GaussianMixture as GMM

n = 4
gmm_model = GMM(n, covariance_type='tied').fit(img2)
#The above line fits a GMM model with n=4 components.
#Now let us call the bic method (or aic if you want).

bic_value = gmm_model.bic(
    img2)  #Remember to call bic on the same model fitted above
print(bic_value)  #You should see the BIC for the GMM model fitted with n=4.
#Do this exercise for different n values and plot them to find the minimum.

#Now, to explain m.bic, here are the lines I used in the video.
n_components = np.arange(1, 10)
gmm_models = [GMM(n, covariance_type='tied').fit(img2) for n in n_components]
plt.plot(n_components, [m.bic(img2) for m in gmm_models], label='BIC')
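# Hedged follow-up sketch (not in the original tutorial): instead of reading the
# minimum off the plot, pick the component count with the lowest BIC directly.
bic_values = [m.bic(img2) for m in gmm_models]
best_n = n_components[int(np.argmin(bic_values))]
print('Lowest BIC at n =', best_n)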
Example n. 7
def run_EM(X,y,title):

    #kdist =  [2,3,4,53
    #kdist = list(range(2,51))
    kdist = list(np.arange(2,150,5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []
    
    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k,covariance_type='diag',n_init=1,warm_start=True,random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        
        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y,labels)
#         f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))
        
    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for EM: '+ title)
    plt.show()
   
    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: '+ title)
    plt.show()

    # plot f1 scores
#     fig = plt.figure()
#     ax = fig.add_subplot(111)
#     ax.plot(kdist, f1_scores)
#     plt.grid(True)
#     plt.xlabel('No. Distributions')
#     plt.ylabel('F1 Score')
#     plt.title('F1 Scores EM: '+ title)
#     plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores,label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: '+ title)
    plt.legend(loc="best")
    plt.show()
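# Hedged usage sketch (assumes the aliases/imports used above, e.g.
# EM = sklearn.mixture.GaussianMixture; the dataset choice is illustrative only):
# from sklearn.datasets import load_breast_cancer
# cancer = load_breast_cancer()
# run_EM(cancer.data, cancer.target, 'Breast Cancer')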
Example n. 8
 def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='GMM'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     em_bic = []
     em_aic = []
     em_completeness_score = []
     em_homogeneity_score = []
     em_measure_score = []
     em_adjusted_rand_score = []
     em_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         
         ##
         ## Expectation Maximization
         ##
         em = GaussianMixture(n_components=k, covariance_type='full')
         em.fit(X_train_scl)
         em_pred = em.predict(X_train_scl)
         
         em_bic.append(em.bic(X_train_scl))
         em_aic.append(em.aic(X_train_scl))        
     
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
         em_completeness_score.append(completeness_score(y_train_score, em_pred))
         em_measure_score.append(v_measure_score(y_train_score, em_pred))
         em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
         em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))
         
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## BIC/AIC Plot
     ##
     title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_series(cluster_range,
                 [em_bic, em_aic],
                 [None, None],
                 ['bic', 'aic'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', '*'],
                 title,
                 'Number of Clusters',
                 'Information Criterion',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                 [em_homogeneity_score, em_completeness_score, em_measure_score, em_adjusted_rand_score, em_adjusted_mutual_info_score],
                 [None, None, None, None, None],
                 ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                 cm.viridis(np.linspace(0, 1, 5)),
                 ['o', '^', 'v', '>', '<'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
Example n. 9
    def __do_perform(self, custom_out=None, main_experiment=None):
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out
            self._out = custom_out
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(), main_experiment.experiment_name()))
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(self._details.ds.training_x)
            gmm.fit(self._details.ds.training_x)

            km_labels = km.predict(self._details.ds.training_x)
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(self._details.ds.training_x,
                                         km_labels)
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(self._details.ds.training_x,
                                         km_labels)
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [km.score(self._details.ds.training_x)]
            ll[k] = [gmm.score(self._details.ds.training_x)]
            bic[k] = [gmm.bic(self._details.ds.training_x)]

            acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y,
                                           km_labels)
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline([('km', km), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name)))

        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name)))

        # %% For chart 4/5
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(np.hstack(
            (self._details.ds.training_x2D,
             np.atleast_2d(self._details.ds.training_y).T)),
                             columns=['x', 'y', 'target'])
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(self._details.ds_name)))
        self.log("Done")
Example n. 10
    def _fit_cluster(self, X, y, params):
        label_init = self.label_init
        if label_init is not None:
            onehot = _labels_to_onehot(label_init)
            weights_init, means_init, precisions_init = _onehot_to_initial_params(
                X, onehot, params[1]["covariance_type"])
            gm_params = params[1]
            gm_params["weights_init"] = weights_init
            gm_params["means_init"] = means_init
            gm_params["precisions_init"] = precisions_init
        elif params[0]["affinity"] != "none":
            agg = AgglomerativeClustering(**params[0])
            n = X.shape[0]

            if self.max_agglom_size is None or n <= self.max_agglom_size:
                X_subset = X
            else:  # if dataset is huge, agglomerate a subset
                subset_idxs = np.random.choice(np.arange(0, n),
                                               self.max_agglom_size)
                X_subset = X[subset_idxs, :]
            agg_clustering = agg.fit_predict(X_subset)
            onehot = _labels_to_onehot(agg_clustering)
            weights_init, means_init, precisions_init = _onehot_to_initial_params(
                X_subset, onehot, params[1]["covariance_type"])
            gm_params = params[1]
            gm_params["weights_init"] = weights_init
            gm_params["means_init"] = means_init
            gm_params["precisions_init"] = precisions_init
        else:
            gm_params = params[1]
            gm_params["init_params"] = "kmeans"
        gm_params["reg_covar"] = 0
        gm_params["max_iter"] = self.max_iter

        criter = np.inf  # if none of the iterations converge, bic/aic is set to inf
        # below is the regularization scheme
        while gm_params["reg_covar"] <= 1 and criter == np.inf:
            model = GaussianMixture(**gm_params)
            try:
                model.fit(X)
                predictions = model.predict(X)
                counts = [
                    sum(predictions == i)
                    for i in range(gm_params["n_components"])
                ]
                # singleton clusters not allowed
                assert not any([count <= 1 for count in counts])

            except ValueError:
                gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"])
                continue
            except AssertionError:
                gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"])
                continue
            # if the code gets here, then the model has been fit with no errors or
            # singleton clusters
            if self.selection_criteria == "bic":
                criter = model.bic(X)
            else:
                criter = model.aic(X)
            break

        if y is not None:
            self.predictions = model.predict(X)
            ari = adjusted_rand_score(y, self.predictions)
        else:
            ari = float("nan")
        results = {
            "model": model,
            "bic/aic": criter,
            "ari": ari,
            "n_components": gm_params["n_components"],
            "affinity": params[0]["affinity"],
            "linkage": params[0]["linkage"],
            "covariance_type": gm_params["covariance_type"],
            "reg_covar": gm_params["reg_covar"],
        }
        return results
Example n. 11
def recommend_coldstart(song_input, songs_np, songs_df, num_recommend_gmm,
                        num_recommend_nn, gmm_clusters):
    '''Generates song recommendations based on Nearest Neighbours and GMM sampling.

    Inputs

    song_input: Index of the song that the user likes.
    songs_np: Numpy array of numeric attributes of dataset.
    songs_df: Full dataframe.

    num_recommend_nn: Number of songs to recommend using NN.
    num_recommend_gmm: Number of songs to recommend using GMM sampling.
    gmm_clusters: Number of clusters for GMM model. Will find optimal if specified as 0.

    Outputs

    nn_recc_songs: Recommendations using NN.
    gmm_recc_songs: Recommendations using GMM.
    '''

    query_song = songs_np[song_input]
    playlist_idx = songs_df[songs_df['track_uri'] == songs_df.iloc[song_input]
                            ['track_uri']]['pid'].values
    query_songs_df = songs_df[songs_df['pid'].isin(playlist_idx)]
    idx = query_songs_df.drop_duplicates(subset=['track_uri']).index.values
    query_songs_np = songs_np[idx]

    if gmm_clusters == 0:
        #Do tuning
        print("Tuning hyperparameters for GMM.")

        n_clusters = np.arange(2, 10)
        sils = []
        bics = []
        iterations = 20
        for n in tqdm(n_clusters):
            tmp_sil = []
            tmp_bic = []
            for _ in range(iterations):
                gmm = GaussianMixture(n, n_init=2).fit(query_songs_np)
                labels = gmm.predict(query_songs_np)
                sil = silhouette_score(query_songs_np,
                                       labels,
                                       metric='euclidean')
                tmp_sil.append(sil)
                tmp_bic.append(gmm.bic(query_songs_np))
            val = np.mean(SelBest(np.array(tmp_sil), int(iterations / 5)))
            sils.append(val)
            val = np.mean(SelBest(np.array(tmp_bic), int(iterations / 5)))
            bics.append(val)
        gmm_clusters = int(
            (n_clusters[np.argmin(bics)] + n_clusters[np.argmax(sils)]) / 2)

        print("Optimal number of clusters: {}.".format(gmm_clusters))

    print("Fitting models.")

    gmm = GaussianMixture(n_components=gmm_clusters).fit(query_songs_np)
    nn = NearestNeighbors().fit(query_songs_np)

    print("Generating recommendations.")

    #GMM sampling
    label_gmm = gmm.predict(query_song.reshape(1, -1))[0]
    #to ensure we get 10 recommendations
    num_being_recommended = 0
    while num_being_recommended < num_recommend_gmm:
        samples = np.random.multivariate_normal(gmm.means_[label_gmm],
                                                gmm.covariances_[label_gmm],
                                                2 * num_recommend_gmm)
        dist, indices = nn.kneighbors(samples, n_neighbors=1)
        #drop possible duplicates
        gmm_recc = list(set(indices.flatten()))[:num_recommend_gmm]
        num_being_recommended = len(gmm_recc)

    #NN
    dist, indices = nn.kneighbors(query_song.reshape(1, -1),
                                  n_neighbors=num_recommend_nn + 1)
    nn_recc = indices.flatten()[1:]

    nn_recc_songs = songs_df.iloc[idx].iloc[nn_recc]
    gmm_recc_songs = songs_df.iloc[idx].iloc[gmm_recc]

    return nn_recc_songs, gmm_recc_songs
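# Hedged usage sketch (argument values are illustrative; songs_np/songs_df must
# follow the layout described in the docstring, with 'track_uri' and 'pid' columns):
# nn_songs, gmm_songs = recommend_coldstart(song_input=42,
#                                           songs_np=songs_np,
#                                           songs_df=songs_df,
#                                           num_recommend_gmm=10,
#                                           num_recommend_nn=10,
#                                           gmm_clusters=0)  # 0 -> tune clusters via BIC/silhouette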
Example n. 12
            labels[k]['Kmeans'] = km_labels
            labels[k]['GMM'] = gmm_labels

            sil[k]['Kmeans'] = sil_score(dataX, km_labels)
            sil[k]['GMM'] = sil_score(dataX, gmm_labels)
            km_sil_samples = sil_samples(dataX, km_labels)
            gmm_sil_samples = sil_samples(dataX, gmm_labels)
            for i, x in enumerate(km_sil_samples):
                sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1
            sse[k] = km.score(dataX)
            ll[k] = gmm.score(dataX)
            bic[k] = gmm.bic(dataX)
            acc[k]['Kmeans'] = cluster_acc(dataY,km.predict(dataX))
            acc[k]['GMM'] = cluster_acc(dataY,gmm.predict(dataX))
            adj_mi[k]['Kmeans'] = ami(dataY,km.predict(dataX))
            adj_mi[k]['GMM'] = ami(dataY,gmm.predict(dataX))



        gmm_clusters = pd.DataFrame()
        kmeans_clusters = pd.DataFrame()

        for i in clusters:
            gmm_clusters[i] = labels[i]['GMM']
            kmeans_clusters[i] = labels[i]['Kmeans']

        bic = pd.DataFrame(bic, index=[0]).T
Example n. 13
X = X.reshape(-1, 2)
colors = (np.ones((N,1)) * np.arange(3)).reshape(-1)

pl.figure()
pl.scatter(X[:, 0], X[:, 1], c=colors, s=16, lw=0)
pl.title('input data')

n_components = np.arange(1, 16)
BIC = np.zeros(n_components.shape)

for i, n in enumerate(n_components):
    clf = GaussianMixture(n_components=n,
              covariance_type='diag')
    clf.fit(X)

    BIC[i] = clf.bic(X)

pl.figure()
pl.bar(n_components, BIC, label='BIC')
pl.legend(loc=0)
pl.xlabel('n_components')
pl.ylabel('BIC')

i_n = np.argmin(BIC)

clf = GaussianMixture(n_components[i_n])
clf.fit(X)
label = clf.predict(X)

pl.figure()
pl.scatter(X[:, 0], X[:, 1], c=label, s=16, lw=0)
Example n. 14
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(1, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0]*N1 + [1]*N2)

    types = ('spherical', 'diag', 'tied', 'full')
    err = np.empty(len(types))
    bic = np.empty(len(types))
    for i, cov_type in enumerate(types):
        gmm = GaussianMixture(n_components=2, covariance_type=cov_type, random_state=0)
        gmm.fit(x)
        err[i] = 1 - accuracy_rate(gmm.predict(x), y)
        bic[i] = gmm.bic(x)
    print('Error rate:', err.ravel())
    print('BIC:', bic.ravel())
    xpos = np.arange(4)
    plt.figure(facecolor='w')
    ax = plt.axes()
    b1 = ax.bar(xpos-0.3, err, width=0.3, color='#77E0A0', edgecolor='k')
    b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080', edgecolor='k')
    plt.grid(True, ls=':', color='#606060')
    bic_min, bic_max = expand(bic.min(), bic.max())
    plt.ylim((bic_min, bic_max))
    plt.xticks(xpos, types)
    plt.legend([b1[0], b2[0]], ('Error rate', 'BIC'))
    plt.title('Error rate and BIC for different covariance types', fontsize=15)
    plt.show()
Example n. 15
def gaussian_mixture(
    X,
    n_clusters=5,
    covariance_type="full",
    best_model=False,
    max_clusters=10,
    random_state=None,
    **kwargs,
):
    """Clustering with Gaussian Mixture Model.

    Parameters
    ----------
    X  : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    covariance_type: str, optional, default: "full"
        The covariance parameter passed to scikit-learn's GaussianMixture
        algorithm
    best_model: bool, optional, default: False
        Option for finding endogenous K according to Bayesian Information
        Criterion
    max_clusters: int, optional, default:10
        The max number of clusters to test if using `best_model` option
    random_state: int, optional, default: None
        The seed used to generate replicable results
    kwargs

    Returns
    -------
    fitted cluster instance: sklearn.mixture.GaussianMixture

    """
    if random_state is None:
        warn(
            "Note: Gaussian Mixture Clustering is probabilistic--"
            "cluster labels may be different for different runs. If you need consistency, "
            "you should set the `random_state` parameter")

    if best_model is True:

        # selection routine from
        # https://plot.ly/scikit-learn/plot-gmm-selection/
        lowest_bic = np.infty
        bic = []
        maxn = max_clusters + 1
        n_components_range = range(1, maxn)
        cv_types = ["spherical", "tied", "diag", "full"]
        for cv_type in cv_types:
            for n_components in n_components_range:
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(
                    n_components=n_components,
                    random_state=random_state,
                    covariance_type=cv_type,
                )
                gmm.fit(X)
                bic.append(gmm.bic(X))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm

        bic = np.array(bic)
        model = best_gmm

    else:
        model = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            covariance_type=covariance_type,
        )
    model.fit(X)
    model.labels_ = model.predict(X)
    return model
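# Hedged usage sketch (synthetic data, not from the original library docs):
# rng = np.random.default_rng(0)
# X = rng.normal(size=(500, 3))
# model = gaussian_mixture(X, best_model=True, max_clusters=8, random_state=0)
# print(model.n_components, model.labels_[:10])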
Example n. 16
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set
    of input distances using sklearn_. The resulting set of parameters can
    be used to initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM
    atoms: list of (int, str, int, str) tuples.
        The atoms that are involved in each distance are specified
        as a list of `n_distances` tuples, each of the form
        (r1, n1, r2, n2), where r1, r2 are the integer residue
        indices starting from one, and n1, n2 are the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit.
    regularization_type: str
        The type of regularization to use, options are "bic"
        and "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag"
        and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize
        a `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Notes
    -----
    There are two ways to regularize in order to prevent overfitting.

    ``regularization_type="bic"`` will use the Bayesian information
    criterion to penalize models that have more parameters. When
    using ``bic``, The final number of components in the model
    will be less than or equal to `max_components`.

    ``regularization_type="dirichlet"`` will use a Dirichlet process
    prior on the weight distributions. The final number of components
    in the model will always be equal to `max_components`, but most
    of the weights will be small.

    There are two forms for the covariance matrix, which differ in
    the number of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance
    matrix. This has few parameters, but does not capture correlations
    between input distances. Typically, choosing ``"diag"`` will
    result in a model with more components.

    ``covariance_type="full"`` will fit using a full representation
    of the covariance matrix. This captures correlations between
    input distances, but has far more parameters and is potentially
    prone to overfitting.
    """

    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    if distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")

    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_components"
        )

    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')

    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')

    if max_components < 1:
        raise ValueError("max_components must be >= 1")
    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal
        # BIC.
        models = []
        for i in range(1, max_components + 1):
            g = GaussianMixture(
                n_components=i,
                n_init=N_INIT,
                max_iter=MAX_ITER,
                covariance_type=covariance_type,
                reg_covar=REG_COVAR,
            )
            g.fit(distances)
            models.append((g.bic(distances), g))

        gmm = sorted(models, key=lambda x: x[0])[0][1]

    else:
        # Dirichlet process fit
        # use RandomSearchCV to optimize hyperparameters
        params = {
            "weight_concentration_prior": LogUniformSampler(1e-6, 10),
            "mean_precision_prior": LogUniformSampler(1, 10),
        }
        model = BayesianGaussianMixture(
            max_components,
            n_init=N_INIT,
            max_iter=MAX_ITER,
            covariance_type=covariance_type,
            reg_covar=REG_COVAR,
        )
        rs = RandomizedSearchCV(
            model,
            param_distributions=params,
            n_iter=RANDOMSEARCH_TRIALS,
            cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True),
        )
        rs.fit(distances)
        gmm = rs.best_estimator_

    # turn the vector representation of the diagonal into a full
    # precision matrix
    if covariance_type == "diag":
        precisions = gmm.precisions_
        assert len(precisions.shape) == 2
        new_precisions = []
        for i in range(precisions.shape[0]):
            new_precisions.append(np.diag(precisions[i, :]))
        precisions = np.array(new_precisions)
    else:
        precisions = gmm.precisions_

    # convert the list of atoms into the correct form
    new_atoms = []
    for r1, n1, r2, n2 in atoms:
        new_atoms.append((r1, n1))
        new_atoms.append((r2, n2))

    # Return the parameters for a GMM
    return GMMParams(
        n_components=gmm.weights_.shape[0],
        n_distances=n_distances,
        atoms=new_atoms,
        weights=gmm.weights_,
        means=gmm.means_,
        precisions=precisions,
    )
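# Hedged usage sketch (synthetic distances; the atom tuples are placeholders,
# not taken from any real MELD system setup):
# import numpy as np
# rng = np.random.default_rng(0)
# dists = np.abs(rng.normal(loc=0.5, scale=0.1, size=(1000, 2)))
# atoms = [(1, "CA", 5, "CA"), (2, "CA", 8, "CA")]
# params = fit_gmm(max_components=4, n_distances=2, atoms=atoms,
#                  distances=dists, regularization_type="bic")
# print(params.n_components, params.weights)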
Example n. 17
    def fit(self, X, y=None):
        """
        Fits a Gaussian mixture model to the data.
        Estimate model parameters with the EM algorithm.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        
        y : array-like, shape (n_samples,), optional (default=None)
            List of labels for X if available. Used to compute
            ARI scores.

        Returns
        -------
        self
        """

        # Deal with number of clusters
        if self.max_components is None:
            lower_ncomponents = 1
            upper_ncomponents = self.min_components
        else:
            lower_ncomponents = self.min_components
            upper_ncomponents = self.max_components

        n_mixture_components = upper_ncomponents - lower_ncomponents + 1

        if upper_ncomponents > X.shape[0]:
            if self.max_components is None:
                msg = "if max_components is None then min_components must be >= "
                msg += "n_samples, but min_components = {}, n_samples = {}".format(
                    upper_ncomponents, X.shape[0])
            else:
                msg = "max_components must be >= n_samples, but max_components = "
                msg += "{}, n_samples = {}".format(upper_ncomponents,
                                                   X.shape[0])
            raise ValueError(msg)
        elif lower_ncomponents > X.shape[0]:
            msg = "min_components must be <= n_samples, but min_components = "
            msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
            raise ValueError(msg)

        # Get parameters
        random_state = self.random_state

        param_grid = dict(
            covariance_type=self.covariance_type,
            n_components=range(lower_ncomponents, upper_ncomponents + 1),
            random_state=[random_state],
        )

        param_grid = list(ParameterGrid(param_grid))

        models = [[] for _ in range(n_mixture_components)]
        bics = [[] for _ in range(n_mixture_components)]
        aris = [[] for _ in range(n_mixture_components)]

        for i, params in enumerate(param_grid):
            model = GaussianMixture(**params)
            model.fit(X)
            models[i % n_mixture_components].append(model)
            bics[i % n_mixture_components].append(model.bic(X))
            if y is not None:
                predictions = model.predict(X)
                aris[i % n_mixture_components].append(
                    adjusted_rand_score(y, predictions))

        self.bic_ = pd.DataFrame(
            np.array(bics),
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )

        if y is not None:
            self.ari_ = pd.DataFrame(
                np.array(aris),
                index=np.arange(lower_ncomponents, upper_ncomponents + 1),
                columns=self.covariance_type,
            )
        else:
            self.ari_ = None

        # Finding the minimum bic for each covariance structure
        bic_mins = [min(bic) for bic in bics]
        bic_argmins = [np.argmin(bic) for bic in bics]

        # Find the index for the minimum bic amongst all covariance structures
        model_type_argmin = np.argmin(bic_mins)

        self.n_components_ = np.argmin(bics[model_type_argmin]) + 1
        self.model_ = models[model_type_argmin][bic_argmins[model_type_argmin]]

        return self
Example n. 18
# load the included diabetes dataset
diab = load_diabetes(as_frame=True)
# view information about the columns
print(diab.DESCR)
diab_df = diab.data
print(diab.target)
# since we are not performing regression, we can add the target
# column
diab_df['s7'] = diab.target
# print a summary of our data
print(diab_df.describe())

em_gaussian = GaussianMixture(n_components=4,
                              init_params='random',
                              covariance_type='full')
cluster_preds = em_gaussian.fit_predict(diab_df)
plt.title('Gaussian Mixture Clusters')
# we can pick two dimensions of the input data in order to visualize clusters
# in R^2. Note that this output will look different depending on which
# dimensions you choose to plot
plt.xlabel('bmi')
plt.ylabel('bp')
plt.scatter(diab_df['bmi'], diab_df['bp'], c=cluster_preds, cmap='rainbow')
plt.savefig('simple_diabetes_clusters.png', dpi=300)

# view the akaike information criterion
print(em_gaussian.aic(diab_df))
# view the bayesian information criterion
print(em_gaussian.bic(diab_df))
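# Hedged follow-up sketch (not in the original walkthrough): sweep a few
# component counts and keep the one with the lowest BIC.
candidate_ns = range(2, 9)
bics = [GaussianMixture(n_components=n, covariance_type='full',
                        random_state=0).fit(diab_df).bic(diab_df)
        for n in candidate_ns]
best_bic, best_n = min(zip(bics, candidate_ns))
print('best n_components by BIC:', best_n)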
Example n. 19
lowest_bic = np.infty

# we'll compare BIC scores for four different CV types and
# 6 different numbers of components (clusters) to choose the "best"
# model
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    scores = []
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(X)
        curr_bic = gmm.bic(X)

        scores.append(curr_bic)
        # update tracking variables if new lowest BIC found
        if curr_bic < lowest_bic:
            lowest_bic = curr_bic
            best_gmm = gmm

    plt.plot(n_components_range, scores, label=cv_type)

# now we can inspect the "best" model, as decided by BIC score
print('CV:', best_gmm.covariance_type, '| #Components:', best_gmm.n_components,
      '| BIC:', lowest_bic)
plt.legend()
plt.savefig('BIC_plot.png', dpi=300)
Example n. 20
    def _fit_cluster(
        self,
        X: np.ndarray,
        X_subset: np.ndarray,
        y: Optional[np.ndarray],
        params: ParamGridType,
        agg_clustering: Union[List[int], np.ndarray],
        seed: int,
    ) -> Dict[str, Any]:
        label_init = self.label_init
        if label_init is not None:
            onehot = _labels_to_onehot(label_init)
            weights_init, means_init, precisions_init = _onehot_to_initial_params(
                X, onehot, params[1]["covariance_type"])
            gm_params = params[1]
            gm_params["weights_init"] = weights_init
            gm_params["means_init"] = means_init
            gm_params["precisions_init"] = precisions_init
        elif params[0]["affinity"] != "none":
            onehot = _labels_to_onehot(agg_clustering)
            weights_init, means_init, precisions_init = _onehot_to_initial_params(
                X_subset, onehot, params[1]["covariance_type"])
            gm_params = params[1]
            gm_params["weights_init"] = weights_init
            gm_params["means_init"] = means_init
            gm_params["precisions_init"] = precisions_init
        else:
            gm_params = params[1]
            gm_params["init_params"] = "kmeans"
        gm_params["reg_covar"] = 0
        gm_params["max_iter"] = self.max_iter
        gm_params["random_state"] = seed

        criter = np.inf  # if none of the iterations converge, bic/aic is set to inf
        # below is the regularization scheme
        while gm_params["reg_covar"] <= 1 and criter == np.inf:
            model = GaussianMixture(**gm_params)
            try:
                # ignoring warning here because if convergence is not reached,
                # the regularization is automatically increased
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", ConvergenceWarning)
                    model.fit(X)
                predictions = model.predict(X)
                counts = [
                    sum(predictions == i)
                    for i in range(gm_params["n_components"])
                ]
                # singleton clusters not allowed
                assert not any([count <= 1 for count in counts])

            except ValueError:
                gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"])
                continue
            except AssertionError:
                gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"])
                continue
            # if the code gets here, then the model has been fit with no errors or
            # singleton clusters
            if self.selection_criteria == "bic":
                criter = model.bic(X)
            else:
                criter = model.aic(X)
            break

        if y is not None:
            self.predictions = model.predict(X)
            ari = adjusted_rand_score(y, self.predictions)
        else:
            ari = float("nan")
        results = {
            "model": model,
            "bic/aic": criter,
            "ari": ari,
            "n_components": gm_params["n_components"],
            "affinity": params[0]["affinity"],
            "linkage": params[0]["linkage"],
            "covariance_type": gm_params["covariance_type"],
            "reg_covar": gm_params["reg_covar"],
        }
        return results
Example n. 21
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        TODO: YP description

        **Positional Arguments:**

        inputs:
            - A matrix

        **Optional Arguments:**

        dim:
            - The number of clusters in which to assign the data
        """

        if self._embedding is None:
            self._embedding = inputs[0]

        nodeIDs = inputs[1]
        nodeIDs = np.array([int(i) for i in nodeIDs])

        max_clusters = self.hyperparams['max_clusters']

        if max_clusters < self._embedding.shape[1]:
            self._embedding = self._embedding[:, :max_clusters].copy()

        cov_types = ['full', 'tied', 'diag', 'spherical']

        clf = GaussianMixture(n_components=1, covariance_type='spherical')
        clf.fit(self._embedding)
        BIC_max = -clf.bic(self._embedding)
        cluster_likelihood_max = 1
        cov_type_likelihood_max = "spherical"

        for i in range(1, max_clusters):
            for k in cov_types:
                clf = GaussianMixture(n_components=i, covariance_type=k)

                clf.fit(self._embedding)

                current_bic = -clf.bic(self._embedding)

                if current_bic > BIC_max:
                    BIC_max = current_bic
                    cluster_likelihood_max = i
                    cov_type_likelihood_max = k

        clf = GaussianMixture(n_components=cluster_likelihood_max,
                              covariance_type=cov_type_likelihood_max)
        clf.fit(self._embedding)

        predictions = clf.predict(self._embedding)

        testing = inputs[2]

        testing_nodeIDs = np.asarray(testing['G1.nodeID'])
        testing_nodeIDs = np.array([int(i) for i in testing_nodeIDs])
        final_labels = np.zeros(len(testing))

        for i in range(len(testing_nodeIDs)):
            #temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
            label = predictions[i]
            #print(label)
            final_labels[i] = int(label) + 1

        testing['classLabel'] = final_labels
        outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']])
        outputs[['d3mIndex',
                 'classLabel']] = outputs[['d3mIndex',
                                           'classLabel']].astype(int)
        #outputs = container.DataFrame(testing['classLabel'])
        return base.CallResult(outputs)
Example n. 22
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    aic = defaultdict(dict)  # AIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    homo = defaultdict(lambda: defaultdict(dict))  # homogeneity scores
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    print('DATASET: %s' % name)
    for k in clusters:
        print('K: %s' % k)
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km.predict(X))
        acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X))
        adjmi[k][name]['km'] = ami(y, km.predict(X))
        adjmi[k][name]['gmm'] = ami(y, gmm.predict(X))

        homo[k][name]['km'] = homogeneity_score(y, km.predict(X))
        homo[k][name]['gmm'] = homogeneity_score(y, gmm.predict(X))

        # calculate silhouette score for K-Means
        km_silhouette = silhouette_score(X, km.predict(X))
        silhouette[k][name] = km_silhouette

        # calculate BIC for EM
        bic[k][name] = gmm.bic(X)
        aic[k][name] = gmm.aic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    aic = pd.DataFrame(aic).T
    aic.rename(columns={name: 'aic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')
    homo = pd.Panel(homo)
    homo = homo.loc[:, :, name].T.rename(lambda x: '{}_homo'.format(x),
                                         axis='columns')

    # concatenate all results
    dfs = (sse, silhouette, logl, bic, aic, acc, adjmi, homo)
    metrics = pd.concat(dfs, axis=1)
    print(metrics)
    resfile = get_abspath('{}_train_metrics.csv'.format(name), rdir)
    metrics.to_csv(resfile, index_label='k')
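# Hedged usage sketch (dataset name, k-range and output directory are illustrative):
# clustering_experiment(X, y, name='digits', clusters=range(2, 21), rdir='results/clustering')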
Example n. 23
    def cluster_segments(self) -> None:
        """Clusters the input segments :attr:`self.raw_segments` based on the parameters passed as argument.
        """
        Logger.debug("Clustering segments")
        if self.params.cluster_type not in ["gmm", "knn"]:
            Logger.fatal("Invalid value for cluster type: {}".format(
                self.params.cluster_type))
            raise ValueError(
                "Invalid value for 'cluster_type': {} "
                "'cluster_type' should be in ['gmm', 'knn']".format(
                    self.params.cluster_type))

        centers = []
        angles = []
        for segment in self.raw_segments:
            pt1 = segment[0:2]
            pt2 = segment[2:4]
            center = (pt1 + pt2) * 0.5
            centers.append(center)

            # Segment angle lies in [0, pi], multiply by 2 such that complex number associated to similar angles are
            # close on the complex plane (e.g. 180° and 0°)
            angle = tg.utils.angle(pt1, pt2) * 2

            # Need to use complex representation as Euclidean distance used in clustering makes sense in complex plane,
            # and does not directly on angles.
            point = np.array([np.cos(angle), np.sin(angle)])
            angles.append(point)

        centers = np.array(centers)
        centers = normalize(centers, axis=0)
        angles = np.array(angles)

        if self.params.use_angles and self.params.use_centers:
            features = np.hstack((angles, centers))
        elif self.params.use_angles:
            features = angles
        elif self.params.use_centers:
            features = centers
        else:
            raise RuntimeError(
                "Can not perform segment clustering without any feature. "
                "Select 'use_angles=True' and/or 'use_centers=True'.")

        cluster_prediction = None

        if self.params.cluster_type == "knn":
            Logger.debug("Clustering segments using KNN")
            cluster_prediction = KMeans(n_clusters=self.params.num_clusters,
                                        n_init=self.params.num_init,
                                        random_state=0).fit_predict(features)
        elif self.params.cluster_type == "gmm":
            Logger.debug("Clustering segments using GMM")
            best_gmm = None
            lowest_bic = np.infty
            bic = []
            n_components_range = range(1, self.params.num_clusters + 1)
            if not self.params.swipe_clusters:
                n_components_range = [self.params.num_clusters]
            for n_components in n_components_range:
                # Fit a Gaussian mixture with EM.
                gmm = GaussianMixture(n_components=n_components,
                                      covariance_type='full')
                gmm.fit(features)
                bic.append(gmm.bic(features))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm

            cluster_prediction = best_gmm.predict(features)

        # Reorder the segments as clusters.
        cluster_segment_list = []
        cluster_feature_list = []
        num_labels = np.max(cluster_prediction) + 1
        for label in range(num_labels):
            cluster_segments = self.raw_segments[cluster_prediction == label]
            if len(cluster_segments) == 0:
                continue
            cluster_features = features[cluster_prediction == label]
            cluster_segment_list.append(cluster_segments)
            cluster_feature_list.append(cluster_features)

        self.cluster_list = cluster_segment_list
        self.cluster_features = cluster_feature_list
Example n. 24
    def fit(self, X, Y, epochs, batch_size):

        EPOCHS = epochs
        BATCH_SIZE = batch_size
        n = len(X)
        XY = np.concatenate((X, Y), axis=1)
        #df = n - 1

        self._X = X.copy()
        hidden_neurons = self.hidden_neurons

        if self.n_mixtures == -1:
            lowest_bic = np.infty
            bic = []
            n_components_range = range(1, 7)
            cv_types = ['spherical', 'tied', 'diag', 'full']
            for cv_type in cv_types:
                for n_components in n_components_range:
                    # Fit a Gaussian mixture with EM
                    gmm = GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type,
                                          max_iter=10000)
                    gmm.fit(XY)
                    bic.append(gmm.bic(XY))
                    if bic[-1] < lowest_bic:
                        lowest_bic = bic[-1]
                        best_gmm = gmm
                        self.n_mixtures = n_components

            clusterer = HDBSCAN()
            clusterer.fit(XY)
            clusterer.labels_

            if len(np.unique(clusterer.labels_)) < self.n_mixtures:
                self.n_mixtures = len(np.unique(clusterer.labels_))
            else:
                pass

            if self.gmm_boost == True:
                if len(np.unique(clusterer.labels_)) < self.n_mixtures:
                    clusterer = HDBSCAN()
                    clusterer.fit(X)
                    clusters = clusterer.labels_
                else:
                    clusterer = best_gmm
                    clusterer.fit(X)
                    clusters = clusterer.predict_proba(X)

                self._clusterer = clusterer

                X = np.concatenate((X, clusters), axis=1)

            else:
                pass

        elif self.gmm_boost == True:

            clusterer1 = BayesianGaussianMixture(n_components=self.n_mixtures,
                                                 covariance_type='full',
                                                 max_iter=10000)
            clusterer1.fit(X)
            clusters = clusterer1.predict_proba(X)
            self._clusterer = clusterer1

            clusterer2 = HDBSCAN()
            clusterer2.fit(X)

            if len(np.unique(clusterer2.labels_)) < self.n_mixtures:
                clusters = clusterer2.labels_
                self._clusterer = clusterer2
            else:
                pass

            X = np.concatenate((X, clusters), axis=1)

        else:
            pass

        self._y = Y.copy()

        dataset = tf.compat.v1.data.Dataset \
         .from_tensor_slices((X, Y)) \
         .repeat(EPOCHS).shuffle(len(X)).batch(BATCH_SIZE)
        iter_ = tf.compat.v1.data.make_one_shot_iterator(dataset)

        x, y = iter_.get_next()

        K = self.n_mixtures

        self.K = K
        self.x = x

        input_activation = self.input_activation
        hidden_activation = self.hidden_activation

        # Map activation-function names to TF ops; 'linear' means no activation
        # and unrecognized names fall back to ReLU.
        activation_map = {
            'crelu': tf.nn.crelu,
            'relu6': tf.nn.relu6,
            'elu': tf.nn.elu,
            'selu': tf.nn.selu,
            'leaky_relu': tf.nn.leaky_relu,
            'relu': tf.nn.relu,
            'swish': tf.nn.swish,
            'tanh': tf.nn.tanh,
            'linear': None,
            'softplus': tf.nn.softplus,
            'sigmoid': tf.nn.sigmoid,
            'softmax': tf.nn.softmax,
        }
        input_actv = activation_map.get(input_activation.lower(), tf.nn.relu)
        h_actv = activation_map.get(hidden_activation.lower(), tf.nn.relu)

        n_layer = len(hidden_neurons)

        if n_layer < 1:
            self.layer_last = tf.layers.dense(x,
                                              units=self.input_neurons,
                                              activation=input_actv)
            self.mu = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=None,
                                      name="mu")
            self.var = tf.exp(
                tf.layers.dense(self.layer_last,
                                units=K,
                                activation=None,
                                name="sigma"))
            self.pi = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=tf.nn.softmax,
                                      name="mixing")

        else:
            self.layer_1 = tf.layers.dense(x,
                                           units=self.input_neurons,
                                           activation=input_actv)
            # Build the hidden layers sequentially; the final one is stored as layer_last.
            prev_layer = self.layer_1
            for i in range(2, n_layer + 2):
                n_neurons = hidden_neurons[i - 2]
                layer = tf.layers.dense(prev_layer,
                                        units=n_neurons,
                                        activation=h_actv)
                if i == n_layer + 1:
                    print('last', i)
                    self.layer_last = layer
                else:
                    print(i)
                    setattr(self, 'layer_' + str(i), layer)
                prev_layer = layer

            self.mu = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=None,
                                      name="mu")
            self.var = tf.exp(
                tf.layers.dense(self.layer_last,
                                units=K,
                                activation=None,
                                name="sigma"))
            self.pi = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=tf.nn.softmax,
                                      name="mixing")

        if self.tf_mixture_family == False:
            #---------------- Not using TF Mixture Family ------------------------
            if self.dist.lower() == 'normal':
                self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                           scale=self.var)
            elif self.dist.lower() in ('laplacian', 'laplace'):
                self.likelihood = tfp.distributions.Laplace(loc=self.mu,
                                                            scale=self.var)
            elif self.dist.lower() == 'lognormal':
                self.likelihood = tfp.distributions.LogNormal(loc=self.mu,
                                                              scale=self.var)
            elif self.dist.lower() == 'gamma':
                alpha = (self.mu**2) / self.var
                beta = self.var / self.mu
                self.likelihood = tfp.distributions.Gamma(concentration=alpha,
                                                          rate=beta)
            else:
                self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                           scale=self.var)

            self.out = self.likelihood.prob(y)
            self.out = tf.multiply(self.out, self.pi)
            self.out = tf.reduce_sum(self.out, 1, keepdims=True)
            self.out = -tf.log(self.out + 1e-10)
            self.mean_loss = tf.reduce_mean(self.out)

        else:
            # -------------------- Using TF Mixture Family ------------------------
            self.mixture_distribution = tfp.distributions.Categorical(
                probs=self.pi)

            if self.dist.lower() == 'normal':
                self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                             scale=self.var)
            elif self.dist.lower() in ('laplacian', 'laplace'):
                self.distribution = tfp.distributions.Laplace(loc=self.mu,
                                                              scale=self.var)
            elif self.dist.lower() == 'lognormal':
                #self.distribution = tfp.edward2.LogNormal(loc=self.mu, scale=self.var)
                self.distribution = tfp.distributions.LogNormal(loc=self.mu,
                                                                scale=self.var)
            elif self.dist.lower() == 'gamma':
                alpha = (self.mu**2) / self.var
                beta = self.var / self.mu
                self.distribution = tfp.distributions.Gamma(
                    concentration=alpha, rate=beta)
            else:
                self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                             scale=self.var)

            self.likelihood = tfp.distributions.MixtureSameFamily(
                mixture_distribution=self.mixture_distribution,
                components_distribution=self.distribution)
            self.log_likelihood = -self.likelihood.log_prob(tf.transpose(y))

            self.mean_loss = tf.reduce_mean(self.log_likelihood)

# ----------------------------------------------------------------------

        self.global_step = tf.Variable(0, trainable=False)

        # Map optimizer names to tf.compat.v1 optimizer classes; unknown names
        # fall back to Adam.
        optimizer_map = {
            'adam': tf.compat.v1.train.AdamOptimizer,
            'adadelta': tf.compat.v1.train.AdadeltaOptimizer,
            'adagradda': tf.compat.v1.train.AdagradDAOptimizer,
            'adagrad': tf.compat.v1.train.AdagradOptimizer,
            'ftrl': tf.compat.v1.train.FtrlOptimizer,
            'gradientdescent': tf.compat.v1.train.GradientDescentOptimizer,
            'momentum': tf.compat.v1.train.MomentumOptimizer,
            'proximaladagrad': tf.compat.v1.train.ProximalAdagradOptimizer,
            'proximalgradientdescent':
            tf.compat.v1.train.ProximalGradientDescentOptimizer,
            'rmsprop': tf.compat.v1.train.RMSPropOptimizer,
        }
        optimizer_cls = optimizer_map.get(self.optimizer.lower(),
                                          tf.compat.v1.train.AdamOptimizer)
        self.train_op = optimizer_cls(
            learning_rate=self.learning_rate).minimize(self.mean_loss)

        self.init = tf.compat.v1.global_variables_initializer()

        # Initialize coefficients
        self.sess = tf.compat.v1.Session()
        self.sess.run(self.init)

        best_loss = 1e+10
        self.stopping_step = 0
        for i in range(EPOCHS * (n // BATCH_SIZE)):
            _, loss, mu, var, pi, x__ = self.sess.run([
                self.train_op, self.mean_loss, self.mu, self.var, self.pi,
                self.x
            ])

            if loss < best_loss:
                self.stopping_step = 0
                self.best_loss = loss

                best_mu = mu
                best_var = var
                best_pi = pi
                best_mean_y = mu[:, 0]
                best_x = x__
                best_loss = loss
                print("Epoch: {} Loss: {:3.3f}".format(i, loss))
            else:
                self.stopping_step += 1

            if self.stopping_step >= self.early_stopping:
                self.should_stop = True
                print("Early stopping is trigger at step: {} loss:{}".format(
                    i, loss))
                return
            else:
                pass

            self._mean_y_train = mu[:, 0]
            self._dist_mu_train = mu
            self._dist_var_train = var
            self._dist_pi_train = pi
            self._x_data_train = x__
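
The fit method above belongs to a mixture density network class whose name and constructor are not part of this excerpt. A hedged usage sketch, with MixtureDensityNetwork as a hypothetical stand-in for that class and constructor arguments mirroring the attributes fit() reads (hidden_neurons, n_mixtures, gmm_boost, dist, optimizer, learning_rate, early_stopping):

import numpy as np

# Hypothetical class name; n_mixtures=-1 triggers the BIC/HDBSCAN selection above.
model = MixtureDensityNetwork(hidden_neurons=[32, 32], n_mixtures=-1,
                              gmm_boost=False, dist='normal',
                              optimizer='adam', learning_rate=1e-3,
                              early_stopping=50)
X = np.random.rand(500, 4)   # toy features
Y = np.random.rand(500, 1)   # toy one-dimensional target
model.fit(X, Y, epochs=100, batch_size=64)
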
def gmm_information_criteria_report(
        X_mat,
        k=np.arange(1, 20),
        covar_type=['full', 'tied', 'diag', 'spherical'],
        random_seed=11238,
        out="Graph"):
    # handle_df (a DataFrame-reshaping helper defined elsewhere) is used below

    tmp_global_aic, tmp_global_bic = [], []
    for i in covar_type:
        tmp_iter_aic, tmp_iter_bic = [], []
        for j in k:
            tmp_model = GaussianMixture(j,
                                        covariance_type=i,
                                        random_state=random_seed).fit(X_mat)
            tmp_iter_aic.append(tmp_model.aic(X_mat))
            tmp_iter_bic.append(tmp_model.bic(X_mat))
        tmp_global_aic.append(tmp_iter_aic)
        tmp_global_bic.append(tmp_iter_bic)

    tmp_get_aic = handle_df(tmp_global_aic, covar_type)
    tmp_get_bic = handle_df(tmp_global_bic, covar_type)
    tmp_get_aic_max = pd.melt(tmp_get_aic,
                              id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_get_bic_max = pd.melt(tmp_get_bic,
                              id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_top_aic = tmp_get_aic_max.head(3)
    tmp_top_bic = tmp_get_bic_max.head(3)

    if out is "Graph":
        plt.subplot(2, 1, 1)
        for colname, index in tmp_get_aic.drop(
                columns='n_components').items():
            plt.plot(index, label=colname)
        plt.scatter(tmp_top_aic['n_components'],
                    tmp_top_aic['value'],
                    edgecolors='slategrey',
                    facecolor='none',
                    lw=2,
                    label="Best hyperparams")
        plt.title('Akaike Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('AIC')
        plt.legend()

        plt.subplot(2, 1, 2)
        for colname, index in tmp_get_bic.drop(
                columns='n_components').items():
            plt.plot(index, label=colname)
        plt.scatter(tmp_top_bic['n_components'],
                    tmp_top_bic['value'],
                    edgecolors='slategrey',
                    facecolor='none',
                    lw=2,
                    label="Best hyperparams")
        plt.title('Bayesian Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('BIC')
        plt.legend()

    elif out is not "Graph":
        return tmp_get_aic_max, tmp_get_bic_max
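
gmm_information_criteria_report calls a handle_df helper that is defined outside this excerpt. Given how its output is used (one column per covariance type plus an n_components column, later melted and plotted), a plausible sketch follows; the name handle_df appears in the original, but this body is an assumption, not the original implementation:

import numpy as np
import pandas as pd

def handle_df(score_lists, covar_type, k=np.arange(1, 20)):
    # Rows = candidate component counts, columns = covariance types.
    df = pd.DataFrame(np.array(score_lists).T, columns=covar_type)
    df['n_components'] = k
    return df
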
Example #26
0
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:9])
X[:, 1:9] = imputer.transform(X[:, 1:9])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=1)

lowest_bic = np.infty
bic = []

for n_components in range(1, 4):
    classifier = GaussianMixture(n_components=n_components,
                                 covariance_type='full')
    classifier.fit(X_train, y_train)
    bic.append(classifier.bic(X))
    if bic[-1] < lowest_bic:
        lowest_bic = bic[-1]
        best_gmm = classifier

y_pred = best_gmm.predict(X_test)
score = accuracy_score(y_pred, y_test)

print(score)

# # Applying k-Fold Cross Validation
# from sklearn.model_selection import cross_val_score
# accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
# print(accuracies.mean())
# print(accuracies.std())
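
One caveat with this snippet: GaussianMixture is unsupervised, so the component indices returned by best_gmm.predict are arbitrary and only line up with y by coincidence, which makes the accuracy above hard to interpret. A common remedy, not part of the original code, is to map each component to the majority training label before scoring; a minimal sketch assuming integer class labels:

import numpy as np
from sklearn.metrics import accuracy_score

def mapped_gmm_accuracy(gmm, X_train, y_train, X_test, y_test):
    train_components = gmm.predict(X_train)
    # Assign each mixture component the most frequent training label it contains.
    mapping = {c: np.bincount(y_train[train_components == c].astype(int)).argmax()
               for c in np.unique(train_components)}
    mapped = np.array([mapping.get(c, -1) for c in gmm.predict(X_test)])
    return accuracy_score(y_test, mapped)
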
Example #27
0
    print(gmm.covariances_)

    #now repeat but use AIC and BIC to identify optimum number of components
    comp = np.arange(1, 9, 1)
    ncomp = np.shape(comp)[0]

    aic = []
    bic = []
    axes = []
    yres = []
    for i in range(ncomp):
        cnow = comp[i]
        gmm = GaussianMixture(n_components=cnow)
        gmm.fit(xin)
        aic.append(gmm.aic(xin))
        bic.append(gmm.bic(xin))
        yres.append(np.exp(gmm.score_samples(xr)))
        print('ncomponents...', cnow, '    aic', aic[i], '     bic', bic[i])

    #make plots of all the tested number of components
    fig = plt.figure()
    i_best = np.argmin(aic)
    for i in range(ncomp):
        cnow = comp[i]
        ax1 = fig.add_subplot(int(np.ceil(ncomp / 2.)), 2, i + 1)
        axes.append(ax1)
        ilo = 0
        for i2 in range(ndis):
            if (i2 > 0):
                ilo = ihi
            ihi = ilo + nsamp[i2]
Example #28
0
    counter += iterations

xcoords[-1] = 2 * ds

y = meanMatrix[0].reshape(-1, 1)
#clustering = KMeans(n_clusters=k).fit(meanMatrix[0].reshape(-1,1))

bics = []
aics = []
for kk in range(1, 7):
    gmm = GaussianMixture(n_components=kk,
                          covariance_type='spherical',
                          random_state=1991)
    gmm.fit(y)

    bics.append(gmm.bic(y))
    aics.append(gmm.aic(y))

kkBest = 1 + np.argmin(bics)

gmm = GaussianMixture(n_components=kkBest,
                      covariance_type='spherical',
                      random_state=1991)
gmm.fit(y)
labels = gmm.predict(y)

fig = plt.figure()
#0 coverage, 1 global, 2 local

colors = np.array(['C' + str(i) for i in range(kkBest)])
alphas = np.array([0.2, 1.])
Example #29
0
AIC = np.zeros((T,))
CVE = np.zeros((T,))

# K-fold crossvalidation
CV = model_selection.KFold(n_splits=10, shuffle=True)

for t, K in enumerate(KRange):
    print('Fitting model for K={0}'.format(K))

    # Fit Gaussian mixture model
    gmm = GaussianMixture(n_components=K, covariance_type=covar_type,
                          n_init=reps, init_params=init_procedure,
                          tol=1e-6, reg_covar=1e-6).fit(X_norm)

    # Get BIC and AIC
    BIC[t,] = gmm.bic(X_norm)
    AIC[t,] = gmm.aic(X_norm)

    # For each crossvalidation fold
    for train_index, test_index in CV.split(X_norm):
        # extract training and test set for current CV fold
        X_train = X_norm[train_index]
        X_test = X_norm[test_index]

        # Fit Gaussian mixture model to X_train
        gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps).fit(X_train)

        # compute negative log likelihood of X_test
        CVE[t] += -gmm.score_samples(X_test).sum()

opt_clust = KRange[CVE.argmin()]
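
The loop above ends with BIC, AIC, and the cross-validated negative log-likelihood all filled in, but the comparison plot is not part of the excerpt. A short sketch of one way to put them on a common scale (BIC and AIC are both penalized versions of -2 log-likelihood, so 2 * CVE is the comparable quantity), assuming the KRange, BIC, AIC, and CVE arrays from above and matplotlib.pyplot imported as plt:

plt.figure()
plt.plot(KRange, BIC, marker='o', label='BIC')
plt.plot(KRange, AIC, marker='o', label='AIC')
plt.plot(KRange, 2 * CVE, marker='o', label='2 * CV negative log-likelihood')
plt.xlabel('Number of mixture components K')
plt.ylabel('Criterion value')
plt.legend()
plt.show()
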
Example #30
0
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)  # rotate/shear x1 so that both variances change
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0] * N1 + [1] * N2)

    types = ('spherical', 'diag', 'tied', 'full')  # spherical, diagonal, tied (shared covariance), full (unrestricted)
    err = np.empty(len(types))
    bic = np.empty(len(types))
    for i, type in enumerate(types):
        gmm = GaussianMixture(n_components=2,
                              covariance_type=type,
                              random_state=0)
        gmm.fit(x)
        err[i] = 1 - accuracy_rate(gmm.predict(x), y)
        bic[i] = gmm.bic(x)
    print('Error rate:', err.ravel())
    print('BIC:', bic.ravel())
    xpos = np.arange(4)
    ax = plt.axes()
    b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0')
    b2 = ax.twinx().bar(xpos, bic, width=0.3,
                        color='#FF8080')  # twinx() creates a second y-axis mirroring ax
    plt.grid(True)
    bic_min, bic_max = expand(bic.min(), bic.max())
    plt.ylim((bic_min, bic_max))
    plt.xticks(xpos, types)
    plt.legend([b1[0], b2[0]], ('Error rate', 'BIC'))
    plt.title('Error rate and BIC for different covariance types', fontsize=18)
    plt.show()
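
This example uses two helpers that are not shown: accuracy_rate and expand. Plausible sketches based on how they are called above (accuracy_rate must tolerate the two GMM components being labeled in either order; expand pads the BIC axis limits so the bars are not clipped). Both bodies are assumptions, not the original code:

import numpy as np

def accuracy_rate(y_pred, y_true):
    # With two clusters the component indices may be swapped relative to the
    # true labels, so keep the better of the two possible assignments.
    acc = np.mean(y_pred == y_true)
    return max(acc, 1 - acc)

def expand(low, high, margin=0.05):
    # Widen the (low, high) interval by a small fraction for nicer axis limits.
    pad = (high - low) * margin
    return low - pad, high + pad
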
Example #31
0
em_fitness_times = []

for k in Kclusters:
    t1 = time.time()
    em = GaussianMixture(n_components=k,
                         covariance_type='diag',
                         n_init=1,
                         warm_start=True,
                         random_state=100).fit(X1)
    t2 = time.time()

    em_fitness_times.append(t2 - t1)
    em_sil_scores.append(silhouette_score(X1, em.predict(X1)))
    em_homo_scores.append(homogeneity_score(Y1, em.predict(X1)))
    em_aic_scores.append(em.aic(X1))
    em_bic_scores.append(em.bic(X1))
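
The loop above already collects em_aic_scores and em_bic_scores, but only the k-means inertia gets plotted below. A short sketch, in the same plotting style, for visualizing the EM information criteria that were just collected, assuming matplotlib.pyplot is imported as plt as in the rest of the example:

# [EM] Plot AIC/BIC over the number of clusters K
plt.title("AIC/BIC for Expectation Maximization (EM) on ICA " + Dataset)
plt.xlabel("K cluster")
plt.ylabel("Information criterion")
plt.plot(Kclusters, em_aic_scores, label="AIC", color="darkorange", lw=2)
plt.plot(Kclusters, em_bic_scores, label="BIC", color="navy", lw=2)
plt.legend(loc="best")
plt.show()
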

# [KM] Plot the cluster score over the number of clusters K
plt.title("Cluster Score for K-Means (KM) on ICA " + Dataset)
plt.xlabel("K cluster")
plt.ylabel("Inertia")
#plt.ylim(0.0, 1.1)
lw = 2
plt.plot(Kclusters, km_inertia_scores, label="inertia", color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

# [KM] Plot the fitting time over the number of clusters K
plt.title("Fitness Time for K-Means (KM) on ICA " + Dataset)
plt.xlabel("K cluster")
plt.ylabel("Fitness Time (s)")
Example #32
0
def em(X_train,
       X_test,
       y_train,
       y_test,
       no_iter=1000,
       component_list=[3, 4, 5, 6, 7, 8, 9, 10, 11],
       num_class=7,
       toshow=1,
       file_no=1):

    array_aic = []
    array_bic = []
    array_homo = []
    array_comp = []
    array_sil = []
    array_avg_log = []

    for num_classes in component_list:

        clf = GaussianMixture(n_components=num_classes,
                              covariance_type='spherical',
                              max_iter=no_iter,
                              init_params='kmeans')
        #     clf = KMeans(n_clusters= num_classes, init='k-means++')

        clf.fit(X_train)

        y_test_pred = clf.predict(X_test)
        #Per sample average log likelihood
        avg_log = clf.score(X_test)
        array_avg_log.append(avg_log)

        #AIC on the test data
        aic = clf.aic(X_test)
        array_aic.append(aic)

        #BIC on the test data
        bic = clf.bic(X_test)
        array_bic.append(bic)

        # Homogeneity score on the test data
        homo = metrics.homogeneity_score(y_test, y_test_pred)
        array_homo.append(homo)

        #Completeness score
        comp = metrics.completeness_score(y_test, y_test_pred)
        array_comp.append(comp)

        # Silhouette score
        sil = metrics.silhouette_score(X_test, y_test_pred, metric='euclidean')
        array_sil.append(sil)

    #Generating plots

    fig1, ax1 = plt.subplots()
    ax1.plot(component_list, array_aic)
    ax1.plot(component_list, array_bic)
    plt.legend(['AIC', 'BIC'])
    plt.xlabel('Number of clusters')
    plt.title('AIC/BIC curve for Expectation Maximization')
    if (toshow == 1):
        plt.savefig(str(file_no) + "em1")
    fig2, ax2 = plt.subplots()
    ax2.plot(component_list, array_homo)
    ax2.plot(component_list, array_sil)
    plt.legend(['homogeneity', 'silhouette'])
    plt.xlabel('Number of clusters')
    plt.title('Performance evaluation scores for Expectation Maximization')
    if (toshow == 1):
        plt.savefig(str(file_no) + "em2")

    fig3, ax3 = plt.subplots()
    ax3.plot(component_list, array_avg_log)
    plt.xlabel('Number of clusters')
    plt.title('Per sample average log likelihood for Expectation Maximization')

    if (toshow == 1):
        plt.savefig(str(file_no) + "em3")
        plt.show()

    #Training and testing accuracy for K = number of classes

    clf = GaussianMixture(n_components=num_class,
                          covariance_type='spherical',
                          max_iter=no_iter,
                          init_params='kmeans')

    #Assigning the initial means as the mean feature vector for the class

    clf.fit(X_train)

    #Training accuracy
    y_train_pred = clf.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
    print('Training accuracy for Expectation Maximization for K = {}:  {}'.format(
        num_class, train_accuracy))

    #Testing accuracy
    y_test_pred = clf.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
    print('Testing accuracy for Expectation Maximization for K = {}:  {}'.format(
        num_class, test_accuracy))

    return component_list, array_aic, array_bic, array_homo, array_comp, array_sil, array_avg_log
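
A hedged usage sketch of the em helper above, assuming the train/test split already exists, labels are integer-coded, and file_no is a string prefix for the saved figure names (all values here are placeholders, not from the original):

components, aics, bics, homos, comps, sils, avg_logs = em(
    X_train, X_test, y_train, y_test,
    no_iter=500,
    component_list=[3, 5, 7, 9],
    num_class=7,
    toshow=0,
    file_no="run1_")
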
Example #33
0
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(
    outpath="charts/income.k-means.randomization.SilhouetteVisualizer.png")

lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(results)
        bic.append(gmm.bic(results))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores
plt.figure(figsize=(8, 6))
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
Example #34
0
    def __do_perform(self,
                     custom_out=None,
                     main_experiment=None
                     ):  # ./output/ICA/clustering//{}', ICAExperiment
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out  # './output/ICA/{}'
            self._out = custom_out  # ./output/ICA/clustering//{}'
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(),
                main_experiment.experiment_name()))  # 'clustering', 'ICA'
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using kMeans with varying K
            gmm.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using GMM with varying k

            km_labels = km.predict(
                self._details.ds.training_x
            )  # give each ICA-transformed input feature a label
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(
                self._details.ds.training_x, km_labels
            )  # compute mean silhouette score for all ICA-transformed input features
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(
                self._details.ds.training_x, km_labels
            )  # compute silhouette score for each ICA-transformed input feature
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [
                    k, 'Kmeans', round(x, 6), km_labels[i]
                ]  # record the silhouette score x for each instance i given its label kn_labels[i] by kMeans with value k
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [
                km.score(self._details.ds.training_x)
            ]  # km.score returns the negative k-Means objective (sum of squared distances to the nearest centroid)
            ll[k] = [gmm.score(self._details.ds.training_x)
                     ]  # per-sample average log-likelihood
            bic[k] = [
                gmm.bic(self._details.ds.training_x)
            ]  # Bayesian information criterion of the GMM on the training data

            acc[k]['Kmeans'] = cluster_acc(
                self._details.ds.training_y, km_labels
            )  # compute the accuracy of the clustering algorithm on the ICA-transformed data (against the original y-label) if it predicted the majority y-label for each cluster
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(
                self._details.ds.training_y, km_labels
            )  # compute the adjusted mutual information between the true labels and the cluster predicted labels (how well does clustering match truth)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)
                       ]  # Bank sse (left)

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]  # Bank log-likelihood

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)
                       ]  # Bank BIC

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        # write scores to files
        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        # train a NN on clustered data
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline(
            [('km', km), ('NN', mlp)], memory=experiments.pipeline_memory
        )  # run a NN on the clustered data (only on the cluster labels, or input features + cluster labels???)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='kmeans')  # write the best NN to file
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_kmeans.csv

        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='gmm')  # write the best NN to file
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_GMM.csv

        # %% For chart 4/5
        # perform t-SNE dimensionality reduction on the training data for 2-D visualization
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(
            np.hstack((self._details.ds.training_x2D,
                       np.atleast_2d(self._details.ds.training_y).T)),
            columns=['x', 'y', 'target']
        )  # prepare NN-learnable data using TSNE D.R'd input features + label
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(
                self._details.ds_name)))  # --> bank_2D.csv
        self.log("Done")
Example #35
0
w = np.exp(-np.exp(3 * w.mean(axis=1)))



# gmm model selection with bic:
lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type, n_init=5)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

for name, col in zip(cv_types, np.array(bic).reshape(len(cv_types), -1)):
    plt.plot(n_components_range, col, label=name)
plt.legend()
plt.savefig('gmm_sklearn_bic/bic.pdf')


data_thr['preds'] = pd.Series(preds).astype("category")