Example no. 1
    def calc_gmm(self, dim=10, calc_times=100, force=False):
        from sklearn.mixture import GaussianMixture
        logger.info("calculating gmm centers")
        X = self.mus
        gmm_path = self.path_to_save_results / "gmm.pkl"
        if gmm_path.exists() and not force:  # force=True recomputes instead of loading the cache
            logger.info(f"loading {gmm_path}")
            with gmm_path.open("rb") as f:
                best_gmm = pickle.load(f)
                best_aic = best_gmm.aic(X)
        else:
            best_aic = np.inf
            pbar = tqdm(range(calc_times))
            for i in pbar:
                gmm = GaussianMixture(dim, covariance_type="full").fit(X)
                aic = gmm.aic(X)  # score once instead of twice
                if aic < best_aic:
                    best_aic = aic
                    best_gmm = gmm
                pbar.set_description("[" + "⠸⠴⠦⠇⠋⠙"[i % 6] + "]" +
                                     f"{best_aic:.2f}")

            with gmm_path.open("wb") as f:
                pickle.dump(best_gmm, f)

        logger.info(f"best aic : {best_aic}")
        self.gmm = best_gmm
        self.aic = best_aic
        self.gmm_classes = best_gmm.predict(X)
        self.gmm_centers = best_gmm.means_
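
Note: all of these snippets rely on GaussianMixture.aic, which scikit-learn computes as AIC = -2 * total log-likelihood + 2 * (number of free parameters). A quick sanity check of that identity (a sketch only: _n_parameters is a private sklearn helper and may change between versions):

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.default_rng(0).normal(size=(500, 2))
gmm = GaussianMixture(n_components=3, random_state=0).fit(X)

# gmm.score(X) is the *mean* per-sample log-likelihood, so multiply by n
manual_aic = -2 * gmm.score(X) * X.shape[0] + 2 * gmm._n_parameters()
print(np.isclose(manual_aic, gmm.aic(X)))  # expected: True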
Example no. 2
import numpy as np
from sklearn.mixture import GaussianMixture


def GMM_find_k(data):
    """Model selection part.

    Runs GMM with scikit-learn for several values of K (2-10) and picks a
    suitable K using the AIC. random_state=0 is set on the clustering call
    (required by the grader).

    Return: the number of components K with the lowest AIC for the given data.
    """
    minimum_AIC = np.inf
    for i in range(2, 11):
        GMM = GaussianMixture(n_components=i, random_state=0).fit(data)
        aic = GMM.aic(data)
        if aic < minimum_AIC:
            minimum_AIC = aic
            min_index = i
    return min_index
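
A minimal smoke test for GMM_find_k (hypothetical data: three well-separated blobs should yield K = 3):

from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
print(GMM_find_k(data))  # expected: 3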
Example no. 3
def fitGaussian(ipds, initmean = False, zmw=-1):
    with np.errstate(invalid='ignore'):
        ridx = np.where(ipds > 0)[0]

    lrnn = np.log(ipds[ridx]).reshape(-1,1)
    
    gmm1 = GaussianMixture(1, covariance_type='spherical')
    if initmean:
        gmm2 = GaussianMixture(2, covariance_type='spherical', means_init=np.array([1.1, 3]).reshape((2,1)),
                              weights_init=np.array([.85, .15]), tol=1e-6)
    else:
        gmm2 = GaussianMixture(2, covariance_type='spherical')
    
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        gmm1.fit(lrnn)
        if not gmm1.converged_:
            print('zmw #%d did not converge on gmm1' % (zmw))
        gmm2.fit(lrnn)
        convround = 0
        # escalate n_init / tol / max_iter until gmm2 converges (or give up)
        for round_idx, (n_init, tol, max_iter) in enumerate(
                [(10, 1e-5, 200), (20, 1e-4, 400), (40, 1e-3, 600)], start=1):
            if gmm2.converged_:
                break
            gmm2 = GaussianMixture(2, covariance_type='spherical',
                                   means_init=np.array([1.1, 3]).reshape((2, 1)),
                                   n_init=n_init, init_params='random',
                                   tol=tol, max_iter=max_iter)
            gmm2.fit(lrnn)
            convround = round_idx
        if not gmm2.converged_:
            convround = 9
            print('zmw #%d did not converge on gmm2 even after extensions' % zmw)
    aicdif = gmm1.aic(lrnn) - gmm2.aic(lrnn)
    mixmeans = gmm2.means_.flatten()
    mixweights = gmm2.weights_.flatten()
    elevstate = np.argmax(mixmeans)
    resp = gmm2.predict_proba(lrnn)  # same log-IPD matrix used for fitting
    respfull = np.empty(len(ipds), dtype='float32')
    respfull.fill(np.nan)
    respfull[ridx] = resp[:,elevstate]
    convergInf = str(convround) + '.' + str(gmm2.n_iter_)
    
    return (respfull, np.array([mixmeans[1-elevstate], mixmeans[elevstate]]), 
            np.array([mixweights[1-elevstate], mixweights[elevstate]]), aicdif, convergInf)
Example no. 4
    def GMM(self):
        data_pull = track_data(self.token, self.seed_track)
        data = data_pull[0]
        tracks = data_pull[1]
        means = []
        vars = []

        # this block optimizes the number of components
        n_comps = 36  # starts with 36 components
        gm_1 = GaussianMixture(n_components=n_comps, random_state=0).fit(data)
        gm_2 = GaussianMixture(n_components=n_comps + 1,
                               random_state=0).fit(data)

        if gm_2.aic(data) > gm_1.aic(data):
            # adding a component hurts the AIC: walk the count down
            # (the original mixed BIC into this AIC test)
            while gm_2.aic(data) > gm_1.aic(data) and n_comps > 1:
                gm_2 = gm_1
                n_comps = n_comps - 1
                gm_1 = GaussianMixture(n_components=n_comps,
                                       random_state=0).fit(data)

        else:
            # adding a component helps the AIC: walk the count up; gm_2 must
            # be fit with n_comps + 1 components (the original refit n_comps)
            while gm_2.aic(data) < gm_1.aic(data):
                gm_1 = gm_2
                n_comps = n_comps + 1
                gm_2 = GaussianMixture(n_components=n_comps + 1,
                                       random_state=0).fit(data)

        # block that constructs the final analysis of data set
        gm_out = GaussianMixture(n_components=n_comps,
                                 random_state=0).fit(data)

        for i in range(0, n_comps):

            # per-feature variances: the diagonal of the i-th covariance matrix
            vars_i = np.array([gm_out.covariances_[i][j][j]
                               for j in range(len(self.stat_names))])

            if i == 0:
                means = gm_out.means_[i]
                vars = vars_i
            else:
                means = np.vstack((means, gm_out.means_[i]))
                vars = np.vstack((vars, vars_i))

        return [means, vars, n_comps, tracks, data]
Example no. 5
def _fit_cluster(data, seed=None):
    """
	Fit a Gaussian Mixture Model to the given data.

	Parameters
	----------
	data : array-like, shape=(n_samples, n_features)
		Data.

	seed : None or int or RandomState, default=None
		Initial seed for the RandomState. If seed is None,
		return the RandomState singleton. If seed is an int,
		return a RandomState with the seed set to the int.
		If seed is a RandomState, return that RandomState.

	Returns
	-------
	model : GaussianMixture
		The best fitted Gaussian Mixture Model, as determined
		by the mean of the BIC and AIC for each candidate model.
	"""
    data = np.array(data)
    models = []
    abic = []
    n_components = min([len(data), 10])
    for i in range(n_components):
        if len(data) < 2 * (i + 1): continue
        m = GMM(n_components=i + 1, n_init=5, random_state=seed)
        m.fit(data)
        models.append(m)
        abic.append(np.mean([m.bic(data), m.aic(data)]))
    return models[np.argmin(abic)]
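
A quick check of _fit_cluster on synthetic blobs (a sketch; GMM is assumed to alias sklearn.mixture.GaussianMixture):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture as GMM

X, _ = make_blobs(n_samples=120, centers=3, cluster_std=0.5, random_state=0)
print(_fit_cluster(X, seed=0).n_components)  # expected: 3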
Example no. 6
    def aic(self):
        AIC = np.zeros(self.n_components - 1, dtype=float)
        for n in range(1, self.n_components):
            clf = GaussianMixture(n_components=n,
                                  covariance_type='diag',
                                  random_state=0)
            clf.fit(self.data)
            AIC[n - 1] = clf.aic(self.data)
        # print(AIC)
        aic_k = np.argmin(AIC) + 1
        print("The number of cluster centers AIC chooses is: " + str(aic_k))

        aic_gmm = GaussianMixture(n_components=aic_k,
                                  covariance_type='diag',
                                  random_state=0)
        aic_gmm.fit(self.data)
        labels = aic_gmm.predict(self.data)
        # color each sample by its cluster label (the original skipped sample 0
        # and looped over all labels redundantly)
        for i in range(len(labels)):
            plt.scatter(self.data[i][0],
                        self.data[i][1],
                        s=15,
                        c=self.color[labels[i]])
        plt.title('GMM-AIC-' + str(len(labels)))
        plt.xlabel('x')
        plt.ylabel('y')
        # plt.savefig("./Fig/sample_size_aic_" + str(len(labels)) + ".png")
        # plt.savefig("./Fig/cluster_num_aic_" + str(len(labels)) + ".png")
        # plt.savefig("./Fig/dimension_num_aic_" + str(len(self.data[0])) + ".png")
        plt.show()
Example no. 7
def model_fitting(data, n):

    total_obs = len(data)

    aic = []
    bic = []
    n_components_range = range(1, n + 1)
    print('fitting Gaussian Mixture models to data....')
    for n_components in n_components_range:
        gmm = GMM(n_components=n_components, covariance_type='full')
        gmm.fit(data)

        aic.append(gmm.aic(data))
        bic.append(gmm.bic(data))

    print('evaluating goodness of fit....')

    N = optimal_n_components(aic, total_obs)
    gmm = GMM(n_components=N, covariance_type='full')
    clf = gmm.fit(data)
    mus = clf.means_
    sigmas = clf.covariances_
    weights = clf.weights_
    print('Writing model to disk....')  # message only: the model is returned, not written here
    model = [mus, sigmas, weights, aic, bic]

    return model
Example no. 8
def lnp_Xw(X_w, x=None, method='gmm', n_comp_max=10, info_crit='bic', njobs=1):
    ''' Estimate the multi-dimensional pdf at x for a given X_w using
    nonparametric density estimation (KDE, cross-validated KDE, or GMM).
    '''
    if x is None: raise ValueError("x must be provided")
    if method not in ['kde', 'gkde', 'gmm']:
        raise ValueError("method must be 'kde', 'gkde', or 'gmm'")

    if method == 'gmm':
        # find best fit component using information criteria (BIC/AIC)
        gmms, ics = [], []
        for i_comp in range(1, n_comp_max + 1):
            gmm = GMix(n_components=i_comp)
            gmm.fit(X_w)
            gmms.append(gmm)
            if info_crit == 'bic':  # Bayesian Information Criterion
                ics.append(gmm.bic(X_w))
            elif info_crit == 'aic':  # Akaike information criterion
                ics.append(gmm.aic(X_w))
        ibest = np.array(ics).argmin()  # lower the better!
        kern = gmms[ibest]
    elif method == 'kde':
        kern = UT.KayDE(X_w)
    elif method == 'gkde':
        # find the best fit bandwidth using cross-validation grid search
        grid = GridSearchCV(skKDE(), {'bandwidth': np.linspace(0.1, 1.0, 30)},
                            cv=10,  # 10-fold cross-validation
                            n_jobs=njobs)  # GridSearchCV's kwarg is n_jobs
        grid.fit(X_w)
        kern = grid.best_estimator_

    if len(x.shape) == 1:
        return kern.score_samples(x[:, None])
    else:
        return kern.score_samples(x)
Example no. 9
def gmm_eval(data, range_n_clusters=range(2, 16)):
    aics = []
    bics = []
    for n_clusters in range_n_clusters:
        clusterer = GaussianMixture(n_clusters, random_state=42)
        clusterer = clusterer.fit(data)
        cluster_labels = clusterer.predict(data)
        aics.append(clusterer.aic(data))
        bics.append(clusterer.bic(data))

    # knee heuristic: distance from each (k, score) point to the chord joining
    # the curve's endpoints; the original used i + 1 as the x-coordinate, which
    # is off by one relative to the actual cluster counts, and dropped the last point
    distances_1 = []
    p1 = Point(initx=np.min(range_n_clusters), inity=aics[0])
    p2 = Point(initx=np.max(range_n_clusters), inity=aics[-1])
    for i, n_clusters in enumerate(range_n_clusters):
        p = Point(initx=n_clusters, inity=aics[i])
        distances_1.append(p.distance_to_line(p1, p2))

    distances_2 = []
    p1 = Point(initx=np.min(range_n_clusters), inity=bics[0])
    p2 = Point(initx=np.max(range_n_clusters), inity=bics[-1])
    for i, n_clusters in enumerate(range_n_clusters):
        p = Point(initx=n_clusters, inity=bics[i])
        distances_2.append(p.distance_to_line(p1, p2))

    return aics, bics, distances_1, distances_2
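
The Point helper is not included in this snippet; a minimal stand-in consistent with the calls above (keyword arguments initx/inity and a perpendicular point-to-line distance) could be:

import numpy as np

class Point:
    def __init__(self, initx=0.0, inity=0.0):
        self.x, self.y = float(initx), float(inity)

    def distance_to_line(self, p1, p2):
        # perpendicular distance from this point to the line through p1 and p2
        num = abs((p2.y - p1.y) * self.x - (p2.x - p1.x) * self.y
                  + p2.x * p1.y - p2.y * p1.x)
        return num / np.hypot(p2.y - p1.y, p2.x - p1.x)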
Example no. 10
def GMM_RF(X_test, X_train):
    # note: X_all and Y_train are assumed to be defined in the enclosing scope
    lowest_bic = np.inf
    bic = []

    # find the original GMM parameter
    n_components_range = range(1, 7)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            gmm = GaussianMixture(n_components=n_components,
                                  covariance_type=cv_type)
            gmm.fit(X_all)
            # the append and comparison must run per fit (the original only
            # scored the last fit of each covariance type)
            bic.append(gmm.aic(X_all))  # NB: stores AIC despite the list's name
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
    #print(best_gmm)

    best_gmm.fit(X_all)  # refit reruns EM from a fresh initialization; the result may differ from the selected fit
    X_train = best_gmm.predict_proba(X_train)
    X_test = best_gmm.predict_proba(X_test)

    rf = RandomForestClassifier()
    # find the best parameter of random forest
    grid_search_rf = GridSearchCV(rf,
                                  param_grid=dict(),
                                  verbose=3,
                                  scoring='accuracy',
                                  cv=10).fit(X_train, Y_train)
    rf_best = grid_search_rf.best_estimator_

    rf_best.fit(X_train, Y_train)

    pred = rf_best.predict(X_test)
    return pred
Example no. 11
def get_best_gmm(X_matrix,
                 n_components,
                 score_df,
                 n_sampling=20,
                 means_init=None):
    n_points = len(X_matrix)
    for i in range(n_sampling):
        gmm = GaussianMixture(
            n_components=n_components,
            # reg_covar=0.0000001
            # covariance_type='full',
            means_init=means_init,
            # #weights_init = [0.1, 0.33, 0.26, 0.1]
            # init_params='random'
        )
        sample = random.sample(range(n_points), int(0.8 * n_points))
        X_rand = X_matrix[sample]
        # X_rand = X_matrix

        # np.random.shuffle(X_matrix)
        gmm.fit(X=X_rand)
        this_AIC = gmm.aic(X=X_rand)
        this_BIC = gmm.bic(X=X_rand)
        score_df.loc[i, "AIC"] = this_AIC
        score_df.loc[i, "BIC"] = this_BIC

        if i == 0 or this_AIC < best_AIC:
            best_AIC = this_AIC
            best_gmm = gmm

    return best_gmm, score_df
Example no. 12
def CV_gauss(input_data, index_to_check):
    X = input_data

    # ros = RandomOverSampler(random_state=0)

    # X = ros.fit_sample(X)

    N, M = X.shape

    # Range of K's to try
    KRange = range(1, 8)
    T = len(KRange)

    covar_type = 'full'  # you can try out 'diag' as well
    reps = 5  # number of fits with different initializations; the best result is kept

    # Allocate variables
    BIC = np.zeros((T, ))
    AIC = np.zeros((T, ))
    CVE = np.zeros((T, ))

    # K-fold crossvalidation
    CV = model_selection.KFold(n_splits=10, shuffle=True)

    for t, K in enumerate(KRange):
        print('Fitting model for K={0}'.format(K))

        # Fit Gaussian mixture model
        gmm = GaussianMixture(n_components=K,
                              covariance_type=covar_type,
                              n_init=reps).fit(X)

        BIC[t] = gmm.bic(X)
        AIC[t] = gmm.aic(X)

        # For each crossvalidation fold
        for train_index, test_index in CV.split(X):
            # extract training and test set for current CV fold
            X_train = X[train_index]
            X_test = X[test_index]

            # Fit Gaussian mixture model to X_train
            gmm = GaussianMixture(n_components=K,
                                  covariance_type=covar_type,
                                  n_init=reps).fit(X_train)

            # compute negative log likelihood of X_test
            CVE[t] += -gmm.score_samples(X_test).sum()

    # Plot results

    print(CVE)
    figure(1)
    plot(KRange, BIC, '-*b')
    plot(KRange, AIC, '-xr')
    plot(KRange, 2 * CVE, '-ok')
    legend(['BIC', 'AIC', 'Crossvalidation'])
    xlabel('K')

    show()
Example no. 13
    def gmm_opt(self, minCluster=5, maxCluster=100, interval=5,
                testSize=0.25, covarType='full', plot=False):
        """
        Perform Gaussian mixture modeling using test set and increasing
        cluster number to optimize number of clusters based on AIC/BIC.

        Parameters
        ----------
        minCluster: int
            minimum number of clusters to test
        maxCluster: int
            maximum number of clusters to test
        interval: int
            interval used to jump cluster numbers
        testSize: float
            fraction of samples to use in test set
        covarType: string
            keyword indicating type of covariance matrix to use

        Returns
        -------
        cluster count with the lowest AIC, cluster count with the lowest BIC,
        listAIC, listBIC

        """

        # Initialize lists used to calculate scores per cluster size
        listAIC = list()
        listBIC = list()

        # Split features into training and test sets
        train, test = train_test_split(self.featMatrix, test_size=testSize)

        # Perform loop over range of cluster numbers provided
        for i in range(minCluster, (maxCluster+1), interval):
            if i % 20 == 0:
                print("Current cluster number: {}".format(i))
            EMmodel = GaussianMixture(n_components=i, covariance_type=covarType,
                                      init_params='kmeans') # create GMM object
            EMmodel.fit(train) # train model on training set
            
            # Calculate AIC/BIC for test set
            listAIC.append(EMmodel.aic(test))
            listBIC.append(EMmodel.bic(test))
        
        # Range of cluster sizes used to locate the AIC/BIC minima
        x = np.arange(minCluster, (maxCluster+1), interval)
        
        # Plot results for both AIC and BIC
        if plot==True:
            fig = plt.figure(figsize=(14, 4))
            ax1 = fig.add_subplot(121)
            ax1.set_xlabel("Number of Clusters")
            ax1.set_ylabel("AIC")
            ax2 = fig.add_subplot(122)
            ax2.set_xlabel("Number of Clusters")
            ax2.set_ylabel("BIC")
            ax1.plot(x, listAIC)
            ax2.plot(x, listBIC)

        return x[np.argmin(listAIC)], x[np.argmin(listBIC)], listAIC, listBIC
Example no. 14
 def elbow(self, isKM=True):
     Error = []
     rng = self.cluster_range
     for i in rng:
         if isKM:
             km = KMeans(n_clusters=i, n_jobs=-1).fit(self.dataX)
             Error.append(km.inertia_)
         else:
             em = GaussianMixture(n_components=i,
                                  init_params='random',
                                  random_state=7).fit(self.dataX)
             Error.append((em.bic(self.dataX), em.aic(self.dataX)))
     import matplotlib.pyplot as plt
     if isKM:
         plt.plot(rng, Error)
     else:
         err = pd.DataFrame(Error)
         plt.plot(rng, err.iloc[:, 0], label='BIC')
         plt.plot(rng, err.iloc[:, 1], label='AIC')
         plt.legend(loc="best")
     clustererType = 'K-Means' if isKM else 'E.M.'
     ylabel = 'Error' if isKM else 'B.I.C. / A.I.C.'
     plt.title('Elbow Method Analysis for %s on %s' %
               (clustererType, self.name))
     plt.xlabel('No of clusters')
     plt.ylabel(ylabel)
     plt.grid(True)
     plt.savefig(
         os.path.join(self.output,
                      self.name + '-' + clustererType + '-elbow.png'))
Example no. 15
def gmm(X, Y):
    print("Running GMM")

    # Range of k to test
    krange = np.arange(2, 50)

    # Opening log file
    log_path = '../logs/pet_gmm.csv'
    with open(log_path, 'w') as f:
        f.write('k,time,ari,homogeneity,completeness,silhouette,aic,bic\n')

    for k in krange:
        # Computing GMM (time.clock was removed in Python 3.8;
        # `from time import perf_counter` replaces the original clock import)
        start_time = perf_counter()
        gmm_model = GaussianMixture(k, n_init=10).fit(X)
        clusters = gmm_model.predict(X)
        time_taken = perf_counter() - start_time

        # Computing metrics
        ari = adjusted_rand_score(Y, clusters)
        hom = homogeneity_score(Y, clusters)
        com = completeness_score(Y, clusters)
        sil = silhouette_score(X, clusters)  # Euclidean distance
        aic = gmm_model.aic(X)
        bic = gmm_model.bic(X)

        # Logging metrics
        with open(log_path, 'a') as f:
            out = '{},{},{},{},{},{},{},{}\n'.format(k, time_taken, ari, hom,
                                                     com, sil, aic, bic)
            f.write(out)
Example no. 16
def optimalNbClustersGMM(pc, c_min, c_max, top=2, plot=False):
    aic = []
    bic = []
    sil = []
    numberOfClusters = range(c_min, c_max)
    for n in numberOfClusters:
        model = GaussianMixture(n, covariance_type='full',
                                random_state=0).fit(pc)
        clusters = model.predict(pc)
        bic.append(model.bic(pc))
        aic.append(model.aic(pc))
        sil.append(
            metrics.silhouette_score(pc,
                                     clusters,
                                     metric='euclidean',
                                     sample_size=None,
                                     random_state=None))

    if plot:
        plt.plot(numberOfClusters, bic, label='BIC')
        plt.plot(numberOfClusters, aic, label='AIC')
        plt.legend()
        plt.title('BIC/AIC')
        plt.xlabel('n_components')
        plt.figure()
        plt.plot(numberOfClusters, sil, label='sil')

    bestBic = np.argsort(bic)[:top] + c_min
    bestAic = np.argsort(aic)[:top] + c_min
    bestSil = np.argsort(sil)[::-1][:top] + c_min
    return bestBic, bestAic, bestSil
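
Possible usage of optimalNbClustersGMM (a sketch; pc would typically be a PCA-reduced feature matrix, hence the name):

from sklearn.datasets import make_blobs

pc, _ = make_blobs(n_samples=200, centers=4, random_state=0)
bestBic, bestAic, bestSil = optimalNbClustersGMM(pc, c_min=2, c_max=8, top=2)
print(bestBic, bestAic, bestSil)  # each holds the top-2 candidate cluster counts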
Example no. 17
def dimReducedClusters(x_data, y_data, x, n):

    kmeans = KMeans(n_clusters=2)
    kmeans.fit(x_data)
    kLabels = kmeans.labels_  # labels_ already holds the fitted assignments

    if x == 0:
        nData = pd.DataFrame(x_data)
        nData['cluster'] = kLabels
        nNetwork(nData, y_data, n + ': After K-Means')

    em = GaussianMixture(n_components=2)
    em.fit(x_data)
    eLabels = em.predict(x_data)
    if x == 0:
        nData = pd.DataFrame(x_data)
        nData['cluster'] = eLabels
        nNetwork(nData, y_data, n + ': After EM')

    a = silhouette_score(x_data, kLabels)
    b = adjusted_rand_score(y_data, kLabels)
    c = adjusted_mutual_info_score(y_data, kLabels)
    d = homogeneity_score(y_data, kLabels)
    e = completeness_score(y_data, kLabels)
    f = fowlkes_mallows_score(y_data, kLabels)
    g = em.bic(x_data)
    h = em.aic(x_data)

    return a, b, c, d, e, f, g, h
Example no. 18
def plot_gmm_scores(k_range, X_train_transformed, title):
    bic_scores = []
    aic_scores = []
    for k in k_range:
        gmm = GaussianMixture(k, max_iter=500, n_init=10)
        gmm.fit(X_train_transformed)
        bic_scores.append(gmm.bic(X_train_transformed))
        aic_scores.append(gmm.aic(X_train_transformed))

    title_dic = {'fontsize': 7, 'fontweight': 'bold'}

    fig, (ax1) = plt.subplots(1, 1, figsize=(5, 2))
    ax1.set_xlabel("K", title_dic)
    ax1.set_title(title, title_dic)
    ax1.set_ylabel("Score", title_dic)
    ax1.tick_params(axis="x", labelsize=7)
    ax1.tick_params(axis="y", labelsize=7)
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
    ax1.plot(k_range, bic_scores, label="BIC", linewidth=2)
    ax1.grid()

    ax1.plot(k_range, aic_scores, label="AIC", linewidth=2)
    ax1.legend(loc='best', fontsize=6)

    plt.tight_layout()
    plt.grid()

    filename = os.path.join(OUTPUT, title + ".png")
    plt.savefig(filename)
    plt.close()
Example no. 19
def bic_model_selection(x, kval, title, filnam, ylabel):
    plt.clf()
    cv_types = ['spherical', 'tied', 'diag', 'full']
    bic = []
    krange = np.arange(1, kval, 1)
    for cv in cv_types:
        for k in krange:
            gm = GaussianMixture(n_components=k, covariance_type=cv)
            gm.fit(x)
            if ylabel == "BIC Score":
                bic.append(gm.bic(x))
            elif ylabel == "AIC Score":
                bic.append(gm.aic(x))
            else:
                bic.append(gm.score(x))
    color_iter = itertools.cycle(
        ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    bars = []
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(krange) + .2 * (i - 2)
        bars.append(
            plt.bar(xpos,
                    bic[i * len(krange):(i + 1) * len(krange)],
                    width=.2,
                    color=color))
    plt.title(title)
    plt.xlabel('Number of components')
    plt.ylabel(ylabel)
    plt.legend([b[0] for b in bars], cv_types)
    plt.savefig(filnam)
    return
Example no. 20
def best_gmm(X,
             max_range=np.arange(2, 11),
             covariance_types=None,
             max_iter=1000,
             n_init=5,
             seed=SEED):
    """
    Return the best Gaussian Mixture Model given the data, a range of K values, and two K selection criteria.

    :param X: usage matrix (made of usage vectors)
    :param max_range: range within the number of clusters should lie
    :param covariance_types: a list containing any subset of ['full', 'spherical', 'tied', 'diag']
    :param max_iter: maximum number of EM iterations
    :param n_init: number of EM runs
    :param seed: random seed
    :return: best GMM according to Akaike Information Criterion, Bayesian Information Criterion,
             and the respective AIC and BIC scores
    """
    if covariance_types is None:
        covariance_types = ['full', 'spherical', 'tied', 'diag']
    if not isinstance(covariance_types, (list, )):
        covariance_types = [covariance_types]

    aics = defaultdict(list)
    bics = defaultdict(list)
    # track the best scores explicitly: calling aic()/bic() on an unfitted
    # GMM() (as the original did) raises NotFittedError
    best_gmm_aic = best_gmm_bic = None
    best_aic = best_bic = np.inf

    for i, cov in enumerate(covariance_types):
        for k in max_range:
            m = GaussianMixture(n_components=k,
                                covariance_type=cov,
                                max_iter=max_iter,
                                n_init=n_init,
                                random_state=seed).fit(X)

            aic, bic = m.aic(X), m.bic(X)
            if aic < best_aic:
                best_aic, best_gmm_aic = aic, m
            if bic < best_bic:
                best_bic, best_gmm_bic = bic, m

            bics[cov].append(bic)
            aics[cov].append(aic)

    return best_gmm_aic, best_gmm_bic, bics, aics
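
A call sketch for best_gmm on toy data (the SEED default must exist at module level, so the seed is passed explicitly here):

import numpy as np

X_toy = np.random.default_rng(1).normal(size=(200, 2))
gmm_aic, gmm_bic, bics, aics = best_gmm(X_toy, max_range=np.arange(2, 5),
                                        covariance_types='full',
                                        max_iter=200, n_init=2, seed=0)
print(gmm_aic.n_components, gmm_bic.n_components)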
Example no. 21
def run_EM(X,y,title):

    kdist = list(np.arange(2,100,5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []
    
    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k,covariance_type='diag',n_init=1,warm_start=True,random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)
        
        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y,labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))
        
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + ' Exp Max F1')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores,label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
Example no. 22
def run_EM(X_norm,y,title):
    range_n_clusters = [2,3,4,5,6]
    silhouette_avg2 = []
    homo2 = []
    comp2 = []
    NMI2 = []
    AIC = []
    BIC = []
    start = time.perf_counter() 
    for index, n_clusters in enumerate(range_n_clusters):
    
        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        
        clusterer2 = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_norm) 
        cluster2_labels = clusterer2.predict(X_norm)
              
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        
        silhouette_avg2.append(silhouette_score(X_norm, cluster2_labels))
        homo2.append(metrics.homogeneity_score(y, cluster2_labels))
        comp2.append(metrics.completeness_score(y, cluster2_labels))
        NMI2.append(normalized_mutual_info_score(y, cluster2_labels))
        AIC.append(clusterer2.aic(X_norm)) 
        BIC.append(clusterer2.bic(X_norm))  
    end = time.perf_counter()
    print("EM run time: %.1f [s]" % (end - start))
    plt.plot(range_n_clusters, silhouette_avg2, label="silhouette")
    plt.plot(range_n_clusters, homo2, label="homogeneity")
    plt.plot(range_n_clusters, comp2, label="completeness")
    plt.plot(range_n_clusters, NMI2, label="NMI")
    plt.ylabel('value')
    plt.xlabel('number of cluster')
    plt.legend(loc="best")
    plt.title(title)
    plt.show()
    
    plt.plot(range_n_clusters, AIC, label="AIC")
    plt.plot(range_n_clusters, BIC, label="BIC")
    plt.ylabel('value')
    plt.xlabel('number of cluster')
    plt.legend(loc="best")
    plt.title(title)
    plt.show()

    # visualization of clusters
    k1 = 4  # NB: labels come from the final 6-component fit, so labels 4-5 fall outside these bins
    plt.figure()
    plt.hist(cluster2_labels, bins=np.arange(0, k1 + 1) - 0.5, rwidth=0.5, zorder=2)
    plt.xticks(np.arange(0, k1))
    plt.xlabel('Cluster label')
    plt.ylabel('Number of samples')
    plt.title(title)
    plt.show()
Example no. 23
def em_sweep_clusters(clusters, dataset, data, data_labels, dim_red=None):
    if dim_red is None:
        file = './results/em_clusters_' + dataset + '.csv'
    else:
        file = './results/' + dim_red + '_em_clusters_' + dataset + '.csv'
    if dim_red is not None:
        comp_count = best_comp_count(dataset, dim_red)
        # compare strings with ==; 'is' tests identity and is unreliable here
        if dim_red == "PCA":
            dim_red = PCA(n_components=comp_count, random_state=0)
            transformed_data = dim_red.fit_transform(data)
        if dim_red == "ICA":
            dim_red = FastICA(n_components=comp_count, random_state=0)
            transformed_data = dim_red.fit_transform(data)
        if dim_red == "RP":
            dim_red = SparseRandomProjection(n_components=comp_count,
                                             random_state=0)
            transformed_data = dim_red.fit_transform(data)
        if dim_red == "RF":
            dim_red = RandomForestClassifier(n_estimators=comp_count,
                                             random_state=0,
                                             n_jobs=-1).fit(data, data_labels)
            transformed_data = selectKImportance(dim_red, data, comp_count)
    else:
        transformed_data = data
    print("Transformed data.  Orig shape: ", data.shape, " new shape: ",
          transformed_data.shape)
    with open(file, 'w') as f:
        f.write('{},{},{},{},{},{},{}\n'.format("n_components", "score", "bic",
                                                "aic", "norm_mutual_info",
                                                "purity", "fit_time"))
    for cluster in clusters:
        if dim_red is not None and comp_count < cluster:
            continue
        start = time.time()
        em = GaussianMixture(n_components=cluster,
                             random_state=0).fit(transformed_data)
        end = time.time()
        elapsed = end - start
        print("Fit clusters of: ", cluster, " on ", dataset, " in ", elapsed)
        aic = em.aic(transformed_data)
        print("For clusters of: ", cluster, " on ", dataset,
              " data set, got aic of: ", aic)
        bic = em.bic(transformed_data)
        print("For clusters of: ", cluster, " on ", dataset,
              " data set, got bic of: ", bic)
        score = em.score(transformed_data)
        print("For clusters of ", cluster, " on ", dataset,
              " data set, got score of :", score)
        data_cluster_labels = em.predict(transformed_data)
        nmi = normalized_mutual_info_score(data_labels, data_cluster_labels)
        purity = purity_score(data_labels, data_cluster_labels)
        print("\tValidation: got nmi of:", nmi, " and purity of: ", purity)
        with open(file, 'a') as f:
            f.write('{},{},{},{},{},{},{}\n'.format(cluster, score, bic, aic,
                                                    nmi, purity, elapsed))
    return
Example no. 24
 def AIC_selection(self):
     for n_comp in range(1, self.n_components + 1):
         GMM = GaussianMixture(n_components=n_comp, max_iter=10000)
         GMM.fit(self.data)
         self.AIC_score.append(GMM.aic(self.data))
     self.bestAIC_k = np.argmin(self.AIC_score) + 1
     print("Best k by AIC: %d" % (self.bestAIC_k))
     self.model = GaussianMixture(n_components=self.bestAIC_k,
                                  max_iter=10000)
     self.model.fit(self.data)
Example no. 25
    def aicMethod(self, data):
        aics = []
        for n_clusters in tqdm(self.cluster_range):
            gmm = GaussianMixture(n_components=n_clusters, covariance_type='full')
            gmm.fit(data)
            aics.append(gmm.aic(data))

        print(aics)
        return self.cluster_range[aics.index(min(aics))]
Example no. 26
def gaussian_parameter_search(df, n_components, cov_type="full"):
    # the four original branches differed only in covariance_type,
    # so they collapse into one loop with identical behavior
    AIC = {}
    BIC = {}
    if cov_type in ("full", "tied", "diag", "spherical"):
        for n in n_components:
            gmm = GaussianMixture(n,
                                  covariance_type=cov_type,
                                  max_iter=1000,
                                  n_init=25,
                                  random_state=42).fit(df)
            AIC[n] = gmm.aic(df)
            BIC[n] = gmm.bic(df)

    return AIC, BIC
Example no. 27
 def fit_model(X, n_init=50):
     aic = []
     lowest_aic = np.inf  # np.infty was removed in NumPy 2.0
     for n in range(1, 10):
         mog = GaussianMixture(n_components=n, n_init=n_init)
         mog.fit(X)
         aic.append(mog.aic(X))
         if aic[-1] < lowest_aic:
             lowest_aic = aic[-1]
             best_mog = mog
     return best_mog
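
A usage sketch, assuming fit_model is reachable at module scope (n_init lowered to keep the run short):

import numpy as np

rng = np.random.default_rng(0)
X_two = np.vstack([rng.normal(0, 1, (150, 1)), rng.normal(6, 1, (150, 1))])
print(fit_model(X_two, n_init=5).n_components)  # expected: 2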
Example no. 28
def compute_em_elbow_curves():
    plt.figure()
    processor.latext_start_figure()
    for dataset in datasets:
        dataset_name = dataset.__class__.__name__
        print('%s' % dataset_name)
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        distortions = []
        clusters = []
        times = []
        iterations = []
        silhouette_coefficients = []
        aics = []
        for x in range(2, 11):
            i = int(x)
            print('# of clusters: %i' % i)
            km = GaussianMixture(n_components=i,
                                 n_init=10,
                                 max_iter=600,
                                 random_state=0,
                                 tol=0.0001)
            try:
                t0 = time()
                km.fit(X_train)
                times.append(round(time() - t0, 6))
                print('Converged:',
                      km.converged_)  # Check if the model has converged
                means = km.means_
                covariances = km.covariances_
                aics.append(km.aic(X_train))
                distortions_score = km.score(X=X_train)
                distortions.append(1.0 / distortions_score)
                labels = km.predict(X=X_train)
                score = silhouette_score(X_train, labels)
                clusters.append(i)
                iterations.append(km.n_iter_)
                silhouette_coefficients.append(score)
            except Exception as e:
                pass  # silently skip component counts that fail to fit

        draw_plot(clusters, distortions, 'Distortion', dataset_name, "em")
        draw_plot(clusters, aics, 'AIC', dataset_name, "em")
        draw_plot(clusters, times, 'Training Time', dataset_name, "em")
        draw_plot(clusters, iterations, 'Iterations', dataset_name, "em")
        draw_plot(clusters, silhouette_coefficients, 'Silhouette Coefficient',
                  dataset_name, "em")

        kl = KneeLocator(clusters,
                         distortions,
                         curve="convex",
                         direction="decreasing")
        print(kl.elbow)
    processor.latex_end_figure(caption="Cluster Validation",
                               fig="cluster_curve")
Example no. 29
    def AIC_extraction(self, X, n_components=10, covariance_type='full'):

        # evaluate the GMM fitted for each component count with Akaike's Information Criterion
        AIC = []
        for n in range(n_components):
            gm = GaussianMixture(n_components=n + 1,
                                 covariance_type=covariance_type,
                                 random_state=self._random_state)
            gm.fit(X)
            AIC.append(gm.aic(X))

        return AIC
Example no. 30
 def aic_select(self):
     self.aic_b = True
     minaic = float("inf")  # real AIC values can easily exceed a fixed 9999
     for n in range(1, self.n_components + 1):
         gmm = GaussianMixture(n_components=n)
         gmm.fit(self.data)
         self.aic.append(gmm.aic(self.data))
         if self.aic[-1] < minaic:
             minaic = self.aic[-1]
             self.model = deepcopy(gmm)
     print("aic\n", self.aic)
     self.res_n = self.aic.index(minaic) + 1
     print("selected components:", self.res_n, '\n')
Example no. 31
 def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='GMM'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     em_bic = []
     em_aic = []
     em_completeness_score = []
     em_homogeneity_score = []
     em_measure_score = []
     em_adjusted_rand_score = []
     em_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         
         ##
         ## Expectation Maximization
         ##
         em = GaussianMixture(n_components=k, covariance_type='full')
         em.fit(X_train_scl)
         em_pred = em.predict(X_train_scl)
         
         em_bic.append(em.bic(X_train_scl))
         em_aic.append(em.aic(X_train_scl))        
     
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
         em_completeness_score.append(completeness_score(y_train_score, em_pred))
         em_measure_score.append(v_measure_score(y_train_score, em_pred))
         em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
         em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))
         
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## BIC/AIC Plot
     ##
     title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_series(cluster_range,
                 [em_bic, em_aic],
                 [None, None],
                 ['bic', 'aic'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', '*'],
                 title,
                 'Number of Clusters',
                 'Information Criterion',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                  [em_homogeneity_score, em_completeness_score, em_measure_score, em_adjusted_rand_score, em_adjusted_mutual_info_score],
                  [None, None, None, None, None],
                  ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                  cm.viridis(np.linspace(0, 1, 5)),
                  ['o', '^', 'v', '>', '<'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
Example no. 32
w = np.exp(-np.exp(3 * w.mean(axis=1)))

# GMM model selection with AIC:
lowest_aic = np.inf  # np.infty was removed in NumPy 2.0
aic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type, n_init=5)
        gmm.fit(X)
        aic.append(gmm.aic(X))
        if aic[-1] < lowest_aic:
            lowest_aic = aic[-1]
            best_gmm = gmm

preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

# aic was filled cv_type-major, so group rows by covariance type
# (the original reshape/transpose scrambled the covariance types)
for name, row in zip(cv_types, np.array(aic).reshape(len(cv_types), -1)):
    plt.plot(n_components_range, row, label=name)
plt.legend()
plt.savefig('gmm_sklearn_aic/aic.pdf')


data_thr['preds'] = pd.Series(preds).astype("category")