Example #1
    def calculate_GMM(self, df, coeff, tsne_results):

        clf = mixture.BayesianGaussianMixture(
            n_components=self.dict_c['clusters'], covariance_type='full')
        clf.fit(coeff)

        clusters = clf.predict(coeff)
        probas = clf.predict_proba(coeff)

        df['clusters'] = clusters

        # Store each row's component membership probabilities.
        df['probas'] = list(probas)

        df = df.apply(self.apply_P_AUC_V, axis=1)

        df_groups = pd.DataFrame(columns=df.columns)
        for group in df.groupby('clusters'):
            x = np.argmax(np.array(group[1]['Score']))
            row = group[1].iloc[x]
            df_groups = pd.concat([df_groups, row.to_frame().T])

        df_groups, coeff, tsne_results = self.calculate_correlation(df_groups)
        return df
Example #2
 def _estimate_density(self, ds):
     XU = self._get_training_input(ds)
     if self.variational:
         gmm = mixture.BayesianGaussianMixture(**self.opts)
     else:
         gmm = mixture.GaussianMixture(**self.opts)
     gmm.fit(XU.cpu().numpy())
     return gmm
Example #3
def gmm_cluster(dec_vars, files_root_directory):

    print('Running clustering.')
    # Fit a Dirichlet process Gaussian mixture using five components
    dpgmm = mixture.BayesianGaussianMixture(
        n_components=5, covariance_type='full').fit(dec_vars)

    plot_results(dec_vars, dpgmm.predict(dec_vars), dpgmm.means_,
                 dpgmm.covariances_, 1, files_root_directory)
Example #4
def cluster_datatrace(dt, n_components=10, n_init=1, excludes='_'):
    datatrace_filter = dt.filter(regex='^(?!' + excludes + ')')
    gm = mixture.BayesianGaussianMixture(n_components=n_components,
                                         covariance_type='full',
                                         max_iter=1000,
                                         n_init=n_init).fit(datatrace_filter)
    cluster_gm = gm.predict(datatrace_filter)
    dt['_cluster'] = cluster_gm
Example #5
def fit_gmm(path_times_list, num_components=2):
    # costs_dummy = np.array(costs_gaus)
    costs_dummy = np.reshape(path_times_list, (-1, 1))
    # costs_gaus_t = np.ndarray.transpose(costs_dummy)
    bayes_gmm = mixture.BayesianGaussianMixture(n_components=num_components)
    bayes_data = bayes_gmm.fit(costs_dummy)

    return bayes_data
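scikit-learn mixtures expect a 2-D array of shape (n_samples, n_features), which is why fit_gmm reshapes the flat list of path times into a single-feature column. A minimal usage sketch, assuming the snippet's numpy/mixture imports; the timing values are made up:

path_times = [0.8, 1.1, 0.9, 4.2, 4.5, 4.1]  # hypothetical bimodal timings
model = fit_gmm(path_times, num_components=2)
print(model.means_.ravel())  # approximate centers of the two modes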
Example #6
def subCluster(clust_sub):
    print('Loading subcluster...')
    X = featArray(clust_sub)
    dpgmm = mixture.BayesianGaussianMixture(
        n_components=5, covariance_type='full', n_init=1, max_iter=1000,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process').fit(X)
    #dpgmm = mixture.GaussianMixture(n_components=5,covariance_type='full',n_init=1,max_iter=1000,init_params='kmeans').fit(X)                          
    labels = dpgmm.predict(X)
    labels = labels.reshape(-1,1024)
    return labels
Example #7
 def Fit(self, Y):
     Y = np.reshape(Y, (len(Y), 1))
     Y = self.preprocess(Y)
     dpgmm = mixture.BayesianGaussianMixture(n_components=self.init_compo,
                                             weight_concentration_prior=1,
                                             max_iter=100,
                                             tol=1e-8).fit(Y)
     return dpgmm
Example #8
def clusters(transcriptFile, demonstrations=None, transcripts=None):
    #for i in range (demonstrations.shape[0]):
    traj = demonstrations
    temporal_window = 2
    traj = generate_transition_features(traj, temporal_window)
    print(traj.shape)
    n_components_range = range(5, 6)
    cv_types = ['full']

    for cv_type in cv_types:
        for n_components in n_components_range:
            gmm = mixture.BayesianGaussianMixture(n_components=n_components,
                                                  covariance_type='full',
                                                  max_iter=1000,
                                                  tol=1e-5,
                                                  random_state=0)

            #gmm = cluster.AgglomerativeClustering(linkage = 'average', n_clusters = 6)
            start = time.time()
            results = gmm.fit(traj)
            end = time.time()
            #gmm.predict(traj[0].reshape(1,-1))
            print "time taken: {}".format(start - end)
            #gmm = mixture.DPGMM(n_components = 7, covariance_type='diag', n_iter = 10000, tol= 1e-4)
            #gmm = mixture.GaussianMixture(n_components=n_components, max_iter = 10000,covariance_type=cv_type,  tol = 1e-5, random_state = 500)
            results = gmm.predict(traj)
            best_gmm = gmm
    score = 0
    cp_times = []
    prev = 0
    time_stamp = np.arange(results.shape[0]).reshape(-1, 1)
    results = results.reshape(-1, 1)
    results = np.concatenate((results, time_stamp), axis=1)
    new_segments = np.concatenate((results, traj), axis=1)
    # Sort rows by cluster label; np.sort(axis=0) would sort each column
    # independently and scramble the rows.
    new_segments = new_segments[new_segments[:, 0].argsort()]

    print(new_segments[1][1])

    current_label = new_segments[0][0]
    cluster_array = []
    for i in range(new_segments.shape[0]):
        if (new_segments[i][0] != current_label):
            current_label = new_segments[i][0]
            subClusters(cluster_array)
            cluster_array = []

        else:
            cluster_array.append(new_segments[i][1:])

    #results = results.reshape(-1,1)
    #print "checking results {} {}" .format(results.shape, traj.shape)
    #traj = np.concatenate((traj, results), axis = 1)
    #gmm = mixture.GaussianMixture(n_components=5, max_iter = 10000,covariance_type='full',  tol = 1e-5, random_state = 00)
    #gmm.fit(traj)
    #results = gmm.predict(traj)
    # transition_points = []
Example #9
def remove_dc_from_spad_gmm(h,
                            n_components=4,
                            weight_concentration_prior=1e0,
                            depth_values=None,
                            axs=None):
    assert len(h.shape) == 1
    h_denoised = h.copy().astype('float')
    #     bins = np.logspace(np.log(np.min(h[h > 0])), np.log(np.max(h)), 100)
    #     h_hist_hist, cutoffs = np.histogram(h, bins=bins)
    #     print(cutoffs)
    nz_ind = h > 0
    h_nz = h[h > 0].copy()

    #     model = skmix.GaussianMixture(n_components=n_components)
    model = skmix.BayesianGaussianMixture(
        n_components=n_components,
        weight_concentration_prior=weight_concentration_prior)
    if depth_values is None:
        classes = model.fit_predict(np.log(h_nz).reshape(-1, 1))
    else:
        #         print(np.stack([np.log(h_nz), depth_values_nz]).shape)
        #         print(np.log(h_nz).shape)
        depth_values_nz = depth_values[h > 0]
        classes = model.fit_predict(
            np.stack([np.log(h_nz), depth_values_nz], axis=-1))


    #         print(classes.shape)
    #     classes = model.fit_predict(h_nz.reshape(-1, 1))
    #     print(model.weights_)
    #     print(classes)

    print([(np.mean(h_nz[classes == i]), i) for i in np.unique(classes)])
    noise_class = min(
        (np.mean(h_nz[classes == i]), i) for i in np.unique(classes))[1]
    print(noise_class)
    print(len(h_nz[classes == noise_class]))
    cutoff = (np.max(h_nz[classes == noise_class]) +
              np.min(h_nz[classes != noise_class])) / 2
    if axs is not None:
        axs[0].bar(range(len(h)), h, log=True)
        axs[0].axhline(y=cutoff, color='r', linewidth=0.5)
        h_noise, _ = np.histogram(np.log(h_nz[classes == noise_class]),
                                  bins=200,
                                  range=(np.min(np.log(h_nz)),
                                         np.max(np.log(h_nz))))
        h_signal, _ = np.histogram(np.log(h_nz[classes != noise_class]),
                                   bins=200,
                                   range=(np.min(np.log(h_nz)),
                                          np.max(np.log(h_nz))))
        axs[1].bar(range(len(h_noise)), h_noise)
        axs[1].bar(range(len(h_signal)), h_signal)
    h_denoised[h_denoised <= cutoff] = 0.
    dc = np.mean(h_nz[classes == noise_class])
    h_denoised[h_denoised > cutoff] -= dc
    #     plt.figure()
    #     plt.bar(range(len(h)), h, log=True)
    return h_denoised
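A hedged usage sketch for the function above, assuming its module imports sklearn.mixture as skmix; the synthetic SPAD-style histogram is purely illustrative:

import numpy as np
import sklearn.mixture as skmix  # alias assumed by the snippet

rng = np.random.RandomState(0)
# Flat dark-count floor plus one strong signal peak.
h = rng.poisson(5, size=500).astype(float)
h[240:250] += rng.poisson(200, size=10)
h_denoised = remove_dc_from_spad_gmm(h, n_components=3)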
Example #10
def run_ES_SCOREplus(W, k, c=0.1):
    start = time.time()
    r = k + 1
    n = len(W)
    Degree = np.sum(W, axis=1)
    # D = np.diag(Degree)
    delta = c * max(Degree)
    # I = np.identity(len(Degree))
    d = 1. / np.sqrt(np.add(delta, Degree))
    # d = 1. / np.add(delta, Degree)
    # D^(-1/2) L D^(-1/2)
    sqrtMatrix = np.diag(d)
    L = np.dot(np.dot(sqrtMatrix, W), sqrtMatrix)
    eig_val, eig_vect = eigsh(L, r, which='LM')
    tao = 0.2
    ratio = eig_val[k] / eig_val[k - 1]

    F = np.dot(eig_vect[:, :r], np.diag(eig_val[:r]))
    if ratio < 1 + tao:
        F = F[:, :]
        for i in range(r - 1):
            F[:, i] = np.multiply(eig_vect[:, i], 1. / eig_vect[:, r - 1])
        temp = (eig_val[0] - eig_val[1]) / eig_val[1]
        # print(temp)
        if temp < c:
            F = F[:, 1:(r - 1)]
        # sp_kmeans = KMeans(n_clusters=k).fit(F)
        sp_kmeans = mixture.BayesianGaussianMixture(
            n_components=k + 1, covariance_type='full').fit(F)
    else:
        F = F[:, :r - 1]
        for i in range(r - 1):
            F[:, i] = np.multiply(eig_vect[:, i], 1. / eig_vect[:, r - 1])
        temp = (eig_val[0] - eig_val[1]) / eig_val[1]
        # print(temp)
        if temp < c:
            F = F[:, 1:(r - 1)]
        # sp_kmeans = KMeans(n_clusters=k).fit(F)
        sp_kmeans = mixture.BayesianGaussianMixture(
            n_components=k + 1, covariance_type='full').fit(F)
    # print(ratio, 1 + tao)
    end = time.time()
    # print(p, max(l)-min(l)+1)
    # return sp_kmeans.labels_, end - start
    return sp_kmeans.predict(F), end - start
Example #11
def vbgmm_clustering(dataset, parameters):
    cputime_start = time.process_time()
    vbgmm_result = mixture.BayesianGaussianMixture(n_components=parameters["max_n_components"]).fit(dataset)
    result_labels = vbgmm_result.predict(dataset)
    cputime_end = time.process_time()

    n_clusters = determine_n_clusters(result_labels)

    return result_labels, cputime_end - cputime_start, n_clusters
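determine_n_clusters is not shown in this snippet. A plausible hypothetical implementation counts the distinct labels the mixture actually used, since a variational mixture tends to leave superfluous components empty:

import numpy as np

def determine_n_clusters(result_labels):
    # Hypothetical helper: number of components that received at least one point.
    return len(np.unique(result_labels))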
Example #12
def dpmm_calc_scores(model, train_dataset, eval_normal_dataset, eval_abn_dataset=None, args=None,
                     ret_metadata=False, dpmm_components=10, dpmm_downsample_fac=10, pt_dpmm_path=None):
    """
    Wrapper for extracting features for DNS experiment, given a trained DCEC models, a normal training dataset and two
    datasets for evaluation, a "normal" one and an "abnormal" one
    :param model: A trained model
    :param train_dataset: "normal" training dataset, for alpha calculation
    :param eval_normal_dataset: "normal" or "mixed" evaluation dataset
    :param eval_abn_dataset: "abnormal" evaluation dataset (optional)
    :param args - command line arguments
    :param ret_metadata:
    :param dpmm_components:  Truncation parameter for DPMM
    :param dpmm_downsample_fac: Downsampling factor for DPMM fitting
    :param pt_dpmm_path: Path to a pretrained DPMM model
    :return actual experiment done after feature extraction (calc_p)
    """
    # Alpha calculation and fitting
    train_p = calc_p(model, train_dataset, args, ret_metadata=False)
    eval_p_ret = calc_p(model, eval_normal_dataset, args, ret_metadata=ret_metadata)
    if ret_metadata:
        eval_p_normal, metadata = eval_p_ret
    else:
        eval_p_normal = eval_p_ret

    p_vec = eval_p_normal
    eval_p_abn = None
    if eval_abn_dataset:
        eval_p_abn = calc_p(model, eval_abn_dataset, args, ret_metadata=ret_metadata)
        p_vec = np.concatenate([eval_p_normal, eval_p_abn])

    print("Started fitting DPMM")
    if pt_dpmm_path is None:
        dpmm_mix = mixture.BayesianGaussianMixture(n_components=dpmm_components,
                                                   max_iter=500, verbose=1, n_init=1)
        dpmm_mix.fit(train_p[::dpmm_downsample_fac])
    else:
        dpmm_mix = load(pt_dpmm_path)

    dpmm_scores = dpmm_mix.score_samples(p_vec)

    if eval_p_abn is not None:
        gt = np.concatenate([np.ones(eval_p_normal.shape[0], dtype=int),
                             np.zeros(eval_p_abn.shape[0], dtype=int)])
    else:
        gt = np.ones_like(dpmm_scores, dtype=int)

    try:  # Model persistence
        dpmm_fn = args.ae_fn.split('.')[0] + '_dpgmm.pkl'
        dpmm_path = os.path.join(args.ckpt_dir, dpmm_fn)
        dump(dpmm_mix, dpmm_path)
    except ModuleNotFoundError:
        print("Joblib missing, DPMM not saved")

    if ret_metadata:
        return dpmm_scores, gt, metadata
    else:
        return dpmm_scores, gt
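Stripped of the wrapper plumbing, the DPMM portion of the function reduces to the sketch below; the feature arrays are placeholders:

import numpy as np
from sklearn import mixture

rng = np.random.RandomState(0)
train_p = rng.normal(size=(1000, 8))   # placeholder "normal" training features
p_vec = rng.normal(size=(200, 8))      # placeholder evaluation features

# Fit a truncated DPMM on a downsampled training set, then use the
# per-sample log-likelihood as a normality score.
dpmm_mix = mixture.BayesianGaussianMixture(n_components=10, max_iter=500)
dpmm_mix.fit(train_p[::10])
dpmm_scores = dpmm_mix.score_samples(p_vec)  # higher means more "normal"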
Example #13
 def cluster(self, X_train):
     dpgmm = mixture.BayesianGaussianMixture(n_components=self.n_cluster,
                                             covariance_type='full',
                                             max_iter=400).fit(X_train)
     while not dpgmm.converged_:
         max_iter = dpgmm.n_iter_ * 2
         print("increase the number of iteration to {} to converge".format(
             max_iter))
         dpgmm = mixture.BayesianGaussianMixture(
             n_components=self.n_cluster,
             covariance_type='full',
             max_iter=max_iter).fit(X_train)
     X_prediction_vgmm = dpgmm.predict(X_train)
     cluster_dict = {}
     for i in range(self.n_cluster):
         # cluster_dict[str(i)] stores the indices of the data points assigned to cluster i
         cluster_dict[str(i)] = np.where(X_prediction_vgmm == i)[0].tolist()
     return cluster_dict, X_prediction_vgmm
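The loop above refits from scratch with a doubled iteration budget each time. An alternative sketch, on placeholder data, uses scikit-learn's warm_start=True so each fit() call resumes from the previous solution instead of restarting:

import numpy as np
from sklearn import mixture

X_train = np.random.RandomState(0).normal(size=(300, 4))  # placeholder data

dpgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full',
                                        max_iter=400, warm_start=True)
dpgmm.fit(X_train)
for _ in range(10):      # bounded retries instead of a potentially endless loop
    if dpgmm.converged_:
        break
    dpgmm.fit(X_train)   # resumes from the current parameters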
Example #14
def predictBayesian(data2D):
    bgmm = mixture.BayesianGaussianMixture(n_components=7,
                                           covariance_type='diag',
                                           weight_concentration_prior=1e-5,
                                           max_iter=10000,
                                           random_state=1).fit(data2D)

    labels = bgmm.predict(data2D)
    return bgmm, labels
Example #15
    def dirichlet_gmm(self,
                      seed=1,
                      gmm_cmpts=10,
                      prior=1e-3,
                      plot_clusters=False):
        """
        Cluster the data using the Dirichlet Process Gaussian Mixtures method.
        Approximates an infinite mixture model with a finite one, using the
        stick-breaking process. Implemented using the scikit-learn
        BayesianGaussianMixture() function.
        
        Method developed from:
        http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_sin.html#sphx-glr-auto-examples-mixture-plot-gmm-sin-py
        
        Parameters:
            - seed, int, the random number seed (for repeatability)
            - gmm_cmpts, int, the max number of Gaussian distributions to use
            - prior, float, the Dirichlet concentration of each component. 
                Usually referred to as gamma.
            - plot_clusters, boolean, flag to indicate whether to plot the 
                output. USE WITH CAUTION!! Will only work if data is 2D.
            
        Return:
            - dataframe; each subreddit (string) is assigned to a cluster (int)
        """
        # train the DP-GMM
        dp_gmm = mixture.BayesianGaussianMixture(
            n_components=gmm_cmpts,
            covariance_type='full',
            n_init=10,  # run the model 10 times, and take the best run
            weight_concentration_prior=prior,
            weight_concentration_prior_type='dirichlet_process',
            mean_precision_prior=prior,
            init_params="random",
            random_state=seed).fit(self.data)

        # generate cluster labels
        clusts = dp_gmm.predict(self.data)
        print("Estimated number of DP-GMM clusters: " + str(len(set(clusts))))

        # If required, plot the clusters on a 2D scatterplot with ellipses to
        # show Gaussian components.
        #************************************************
        # *** CAUTION!! WILL ONLY WORK WITH 2D DATA!! ***
        #************************************************
        if plot_clusters:
            X = np.array(self.data.reset_index()[['x', 'y']])
            self.plot_dpgmms(X, clusts, dp_gmm.means_, dp_gmm.covariances_, 1,
                             "Bayesian GMM with a Dirichlet process prior")

        # assign labels to subreddits and return
        df = pd.DataFrame({
            'subreddit': self.data.index.values,
            'cluster': clusts
        }).set_index('subreddit')
        return df
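As the docstring notes, the stick-breaking prior lets the model occupy only as many of the gmm_cmpts components as the data supports, and a smaller prior (gamma) concentrates weight on fewer of them. A small sketch of that effect; the three-blob toy data is an assumption:

import numpy as np
from sklearn import mixture

rng = np.random.RandomState(1)
X = np.vstack([rng.normal(m, 0.3, size=(100, 2)) for m in (0, 4, 8)])

for gamma in (1e-3, 1e3):
    dp_gmm = mixture.BayesianGaussianMixture(
        n_components=10,
        weight_concentration_prior=gamma,
        weight_concentration_prior_type='dirichlet_process',
        random_state=1).fit(X)
    print(gamma, len(set(dp_gmm.predict(X))))  # number of occupied components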
Example #16
 def getProposalFromSamples(self, iterNO):
     if not self.getSmcSamples():
         raise RuntimeError("SMC samples not yet loaded...")
     else:
         gmm = mixture.BayesianGaussianMixture(n_components=self.__maxNumComponents,
                                               weight_concentration_prior=self.__priorWeight, covariance_type='full',
                                               tol=1e-5, max_iter=int(1e5), n_init=100)
         gmm.fit(self.getSmcSamples()[iterNO])
         proposal = np.exp(gmm.score_samples(self.getSmcSamples()[iterNO]))
         return proposal / sum(proposal)
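score_samples returns per-sample log densities, which is why the method exponentiates before normalizing into a discrete proposal over the SMC samples. A self-contained restatement of that step on placeholder samples:

import numpy as np
from sklearn import mixture

samples = np.random.RandomState(0).normal(size=(500, 3))  # placeholder SMC samples
gmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full',
                                      tol=1e-5, max_iter=1000).fit(samples)

proposal = np.exp(gmm.score_samples(samples))  # log p(x) -> p(x)
proposal /= proposal.sum()                     # normalize to a discrete proposal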
Example #17
def Cluster(X, ncomps):
    dpgmm = mixture.BayesianGaussianMixture(
        n_components=ncomps,
        covariance_type='full',
        n_init=20,
        max_iter=10000,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process').fit(X)
    labels = dpgmm.predict(X)
    return labels
Example #18
 def unsup_cluster(self, data_idx):
     # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html
     max_mean_dist = 0.001
     max_sum_xy = 0.2
     min_n_p = round(len(data_idx) / 50)
     pc_tab = self.__mat2tab(self.point_cloud)
     data = pc_tab[data_idx, :]
     import sklearn.mixture as skm
     bgm = skm.BayesianGaussianMixture(
         n_components=50,
         covariance_type='full',
         tol=1e-3,
         reg_covar=1e-6,
         max_iter=200,
         n_init=2,
         init_params='kmeans',
         weight_concentration_prior_type='dirichlet_process',
         weight_concentration_prior=None,
         mean_precision_prior=None,
         mean_prior=None,
         degrees_of_freedom_prior=None,
         covariance_prior=None,
         random_state=None,
         warm_start=False,
         verbose=0,
         verbose_interval=10)
     #bgm.fit(floors, self.__mat2tab(self.label))
     bgm.fit(data)
     predicted = bgm.predict(data)
     planes = []
     for k in range(np.min(predicted), np.max(predicted) + 1):
         klabel = [i for i, j in enumerate(predicted) if j == k]
         if len(klabel) > 10:
             # print('n. of points: '+str(klabel.__len__()))
             idx = [data_idx[kk] for kk in klabel]
             le = len(idx)
             plane_coeff = self.__find_plane(pc_tab[idx, :])
             distances = self.point_plane_dist(idx, plane_coeff)
             sq_distances = [d ** 2 for d in distances]
             mean_dist = np.mean(sq_distances)
             if (mean_dist < max_mean_dist and le > min_n_p
                     and abs(plane_coeff[0]) + abs(plane_coeff[1]) <
                     max_sum_xy):
                 planes.append({
                     'dist': mean_dist,
                     'eq': plane_coeff,
                     'idx': idx
                 })
                 print(
                     '[a,b,c,d]=[{0[0]:.2f},{0[1]:.2f},{0[2]:.2f},{0[3]:.2f}]; '
                     .format(plane_coeff) + str(le) + ' points; ' +
                     'dist: {:2.2E}'.format(mean_dist))
     return planes
Example #19
def dpgmm_(df, n_components):
    X = df.to_numpy()
    dpgmm = mixture.BayesianGaussianMixture(n_components=n_components,
                                            covariance_type='full').fit(X)
    labels = dpgmm.predict(X)
    plot_results(X, labels, dpgmm.means_, dpgmm.covariances_,
                 'Bayesian Gaussian Mixture with a Dirichlet process prior')

    plt.show()
    final_index = user_select(df, labels)
    return final_index
Example #20
 def train_gmm(self, nComponents=5, nPoints=1e4):
     X = np.random.choice(self.X.ravel(), size=int(nPoints), replace=False)
     gmm = mixture.BayesianGaussianMixture(n_components=nComponents, covariance_type='full').fit(col(X))
     phi = gmm.weights_.squeeze()
     mu = gmm.means_.squeeze()
     var = gmm.covariances_.squeeze()
     # sort
     ix = mu.argsort()
     self.phi = phi[ix]
     self.mu = mu[ix]
     self.var = var[ix]
Example #21
 def get_gmm_sample_data(self, incoming_df, column_list, sample_size):
     """
     Unsupervised Learning in the form of BayesianGaussianMixture to create sample data.
     """
     gmm = mixture.BayesianGaussianMixture(n_components=2,
                                           covariance_type="full",
                                           n_init=100,
                                           random_state=42).fit(incoming_df)
     clustered_data = gmm.sample(sample_size)
     clustered_df = pd.DataFrame(clustered_data[0], columns=column_list)
     return clustered_df
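sample(n) on a fitted mixture returns a tuple (X, y) of generated points and their component labels, which is why the method indexes clustered_data[0]. A minimal sketch with a made-up two-column frame:

import numpy as np
import pandas as pd
from sklearn import mixture

incoming_df = pd.DataFrame(np.random.RandomState(42).normal(size=(100, 2)),
                           columns=['a', 'b'])
gmm = mixture.BayesianGaussianMixture(n_components=2, covariance_type='full',
                                      random_state=42).fit(incoming_df)
X_new, component_ids = gmm.sample(10)  # 10 synthetic rows plus their component ids
sample_df = pd.DataFrame(X_new, columns=['a', 'b'])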
Example #22
def variational_gmm_beta(x, alpha, beta):
    n = x.shape[0]
    vi = mixture.BayesianGaussianMixture(
        n_components=2,
        covariance_type='full',
        weight_concentration_prior_type='dirichlet_distribution').fit(x)
    pi_hat = vi.predict_proba(x)[:, 0]
    S = np.sum(pi_hat)
    gamma1 = alpha + S
    gamma2 = beta + n - S
    return stats.beta.mean(gamma1, gamma2), stats.beta.var(gamma1, gamma2)
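The function treats the first column of predict_proba as soft counts S for component 0 and plugs them into the conjugate update Beta(alpha + S, beta + n - S) for its mixing weight. A usage sketch on toy bimodal data, assuming the snippet's module imports scipy.stats as stats:

import numpy as np

rng = np.random.RandomState(0)
x = np.concatenate([rng.normal(-3, 1, 70),
                    rng.normal(3, 1, 30)]).reshape(-1, 1)
post_mean, post_var = variational_gmm_beta(x, alpha=1.0, beta=1.0)
print(post_mean)  # posterior mean of component 0's mixing weight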
Example #23
 def get_bgmm():
     return mixture.BayesianGaussianMixture(n_components=k,
                                            weight_concentration_prior=weight_concentration_prior,
                                            reg_covar=reg_covar,
                                            covariance_type='full',
                                            n_init=n_init,
                                            max_iter=max_iter,
                                            random_state=random_state,
                                            init_params='random',
                                            verbose=3,
                                            verbose_interval=10,
                                            )
Example #24
def cluster_validation(n_digit, X):
    print("methods  silhouette_score  calinski_harabasz_score  davies_bouldin_score")
    # Fit a k-means clustering model
    kmeans = KMeans(init='k-means++', n_clusters=n_digit, max_iter=3000, tol=1e-4, n_init=10, random_state=0)
    get_score("kmeans", kmeans, X)
    # Fit a Gaussian mixture with EM using n_digit components
    gmm = mixture.GaussianMixture(n_components=n_digit, max_iter=3000, tol=1e-4, covariance_type='spherical', random_state=0)
    get_score("GMM", gmm, X)
    # Fit a Dirichlet process Gaussian mixture using n_digit components
    dpgmm = mixture.BayesianGaussianMixture(n_components=n_digit, max_iter=3000, tol=1e-4,
                                            covariance_type='spherical', random_state=0)
    get_score("DPGMM", dpgmm, X)
Example #25
 def fit(self, data):
     cov_prior = [self.dp_gmm[4] for _ in range(data.shape[1])]
     # mean_prior = [self.dp_gmm[5] for _ in range(data.shape[1])]
     mean_prior = [0 for _ in range(data.shape[1])]
     self.model = mix.BayesianGaussianMixture(
         n_components=self.dp_gmm[0],
         max_iter=self.dp_gmm[1],
         weight_concentration_prior=self.dp_gmm[2],
         covariance_type=self.dp_gmm[3],
         covariance_prior=cov_prior,
         mean_prior=mean_prior)  # uses a dirichlet process GMM to cluster
     return self.model.fit(data)
Example #26
def create_bayesian_gaussian_mixture(
        data,
        component_count=1,
        covariance_type='full',
        max_iteration_count=DEFAULT_MAX_ITERATION_COUNT):
    """Creates a Dirichlet process Gaussian mixture with the specified number of components and fits
	the specified data with the expectation-maximization (EM) algorithm. Note that the Dirichlet
	process model adapts the number of components automatically."""
    model = mixture.BayesianGaussianMixture(n_components=component_count,
                                            covariance_type=covariance_type,
                                            max_iter=max_iteration_count)
    return model.fit(data)
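As the docstring says, the Dirichlet process prior drives the weights of unneeded components toward zero rather than spreading mass over all component_count of them. A small demonstration on toy data, assuming the module's DEFAULT_MAX_ITERATION_COUNT is defined:

import numpy as np

rng = np.random.RandomState(0)
data = np.vstack([rng.normal(0, 1, (150, 2)), rng.normal(6, 1, (150, 2))])

model = create_bayesian_gaussian_mixture(data, component_count=8)
print(model.weights_.round(3))  # most of the 8 weights collapse near zero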
Example #27
def trainGMM(X):
    """
    returns : gmmPredLabels, gmmPredScores
    """
    dpgmm = mixture.BayesianGaussianMixture(n_components=6, covariance_type='full',
                                            max_iter=1000).fit(X)
    gmmPredLabels = dpgmm.predict(X)
    gmmPredScores = dpgmm.predict_proba(X)

    with open('gmmModel.p', 'wb') as fp:
        pickle.dump(dpgmm, fp)

    return gmmPredLabels, gmmPredScores
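Since the fitted mixture is pickled to gmmModel.p, it can be reloaded later without refitting; a short sketch:

import pickle

# Reload the persisted model, e.g. in a separate prediction script.
with open('gmmModel.p', 'rb') as fp:
    dpgmm = pickle.load(fp)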
Example #28
def main():
    researchResults = parse_csv_files(result_file_path)
    df = create_mean_results_dataframe(researchResults)

    df = df.drop(columns='AC')
    numeric_df = df.select_dtypes(include='number')
    pca_2 = PCA(2)
    plot_columns = pca_2.fit_transform(numeric_df)

    is_gmm = True

    optimal_cluster_number = find_best_cluster_number(
        plot_columns, 4, ceil(sqrt(len(df.index))))
    if is_gmm:
        cluster_number = optimal_cluster_number
        gmm = create_em_mixture(plot_columns, optimal_cluster_number)
    else:
        cluster_number = 4
        gmm = mixture.BayesianGaussianMixture(
            n_components=4,
            covariance_type='full',
            weight_concentration_prior_type='dirichlet_process').fit(
                plot_columns)

    ward = AgglomerativeClustering(n_clusters=cluster_number,
                                   linkage='ward').fit(plot_columns)
    predicted = gmm.predict(plot_columns)

    df['Cluster'] = predicted
    clustering_plot(plot_columns, gmm.means_, gmm.covariances_, df['Operator'],
                    predicted)

    clusters = [ClusterData(i + 1) for i in range(len(predicted))]
    for index, row in df.iterrows():
        clusters[row['Cluster']].elements.append(row)

    with open(output_file_path, "w") as out_file:
        out_file.write('Type ')
        for i, val in enumerate(clusters):
            if not val.elements:
                continue
            out_file.write(str(val.number))
            out_file.write('\n')
            for j, row in enumerate(val.elements):
                for k, cell in row.items():
                    out_file.write(str(cell))
                    out_file.write(' ')
                out_file.write('\n')
            out_file.write('\n')

    plt.show()
Example #29
def epsilloids(X, n, covariance_type='full'):
    # Fit a Gaussian mixture with EM using n components
    gmm = mixture.GaussianMixture(n_components=n, covariance_type=covariance_type).fit(X)
    plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
                 'Gaussian Mixture')

    # Fit a Dirichlet process Gaussian mixture using n components
    dpgmm = mixture.BayesianGaussianMixture(n_components=n,
                                            covariance_type=covariance_type).fit(X)
    plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
                 'Bayesian Gaussian Mixture with a Dirichlet process prior')

    plt.show()
Example #30
def vbgmm_clustering(dataset, parameters):
    if parameters["max_n_components"] > dataset.shape[0] - 1:
        n_components = dataset.shape[0] - 1
    else:
        n_components = parameters["max_n_components"]

    vbgmm_result = mixture.BayesianGaussianMixture(
        n_components=n_components).fit(dataset)
    result_labels = vbgmm_result.predict(dataset)

    n_clusters = determine_n_clusters(result_labels)

    return result_labels, n_clusters