Example #1
def fit_mixtures(X,mag,mbins,binwidth=0.2,seed=None,
                 keepscore=False,keepbic=False,**kwargs):
    kwargs.setdefault('n_components',25)
    kwargs.setdefault('covariance_type','full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed is not None:
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where( np.abs(mag-bincenter) < binwidth )[0]
        if False:
            print('{:.2f}: {} qsos'.format(bincenter,len(ii)))
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
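A minimal usage sketch for fit_mixtures above, with synthetic features and magnitudes standing in for real data (the names X, mag and mbins follow the function signature; numpy and sklearn.mixture.GaussianMixture are assumed to be imported as in the example):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = rng.normal(size=(5000, 3))             # synthetic feature vectors
mag = rng.uniform(17.0, 22.0, size=5000)   # synthetic magnitudes
mbins = np.arange(17.5, 22.0, 0.5)         # bin centers

# keepbic=True -> the return value is (fits, bics)
fits, bics = fit_mixtures(X, mag, mbins, binwidth=0.25,
                          seed=12345, keepbic=True, n_components=5)
print(len(fits), "fits, BIC of first bin:", bics[0])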
Example #2
    def loggausfit(self):
        self.fitDf['IRM_norm'] = self.fitDf['remanance']/self.fitDf['remanance'].max()
        xstd,distance,means,covras,weights,yfits = [],[],[],[],[],[]
        for i in range(10):
            data = self.rand_data()
            for j in range(20):
                gmm = GMM(self.fitNumber, covariance_type='full')
                model = gmm.fit(data)
                xstd.append(np.std(model.means_))
                means.append(model.means_)
                covras.append(model.covariances_)
                weights.append(model.weights_)

                sample = self.fitDf['field'].values.reshape((-1, 1))

                logprob = model.score_samples(sample)  # M_best.eval(x)
                responsibilities = model.predict_proba(sample)
                pdf = np.exp(logprob)
                pdf_individual = responsibilities * pdf[:, np.newaxis]
                pdf_norm = np.sum(pdf_individual,axis=1)/np.max(np.sum(pdf_individual,
                                                                   axis=1))
                #distance.append(np.max([abs(i-j) for i,j in zip(np.sum(pdf_individual,axis=1),p)]))
                distance.append(1 - spatial.distance.cosine(pdf_norm,self.fitDf['IRM_norm']))
                yfits.append(pdf_individual)
            del data
        df = pd.DataFrame({'xstd':xstd, 'distance':distance, 'means':means,
                           'covras':covras, 'yfits':yfits, 'weights':weights})
        df['cov_max'] = [np.min(i) for i in df['covras']]
        df = df.sort_values(by=['distance','cov_max','xstd'], ascending=[False,True,False])
        pdf_best = df['yfits'].iloc[0]
        self.means = df['means'].iloc[0]
        self.covra = df['covras'].iloc[0]#sigma**2
        self.weights = df['weights'].iloc[0]
        self.pdf_best = pdf_best/np.max(np.sum(pdf_best,axis=1))
Example #3
    def fit(self, data, ngauss, n_iter=5000, min_covar=1.0e-6,
            doplot=False, **keys):
        """
        data is shape
            [npoints, ndim]
        """
        from sklearn.mixture import GaussianMixture

        if len(data.shape) == 1:
            data = data[:,numpy.newaxis]

        print("ngauss:   ",ngauss)
        print("n_iter:   ",n_iter)
        print("min_covar:",min_covar)

        gmm=GaussianMixture(
            n_components=ngauss,
            max_iter=n_iter,
            reg_covar=min_covar,
            covariance_type='full',
        )

        gmm.fit(data)

        if not gmm.converged_:
            print("DID NOT CONVERGE")

        self._gmm=gmm
        self.set_mixture(gmm.weights_, gmm.means_, gmm.covariances_)

        if doplot:
            plt=self.plot_components(data=data,**keys)
            return plt
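For reference, a standalone sketch of the same parameter mapping the fit method uses (ngauss -> n_components, n_iter -> max_iter, min_covar -> reg_covar), run here on synthetic 1-D data:

import numpy as np
from sklearn.mixture import GaussianMixture

data = np.random.randn(1000)      # 1-D samples
data = data[:, np.newaxis]        # reshape to [npoints, ndim]

gmm = GaussianMixture(n_components=2, max_iter=5000,
                      reg_covar=1.0e-6, covariance_type='full')
gmm.fit(data)
if not gmm.converged_:
    print("DID NOT CONVERGE")
print(gmm.weights_, gmm.means_.ravel())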
Example #4
	def learn_subset(self, search_space):
	
		#Mask undesired features
		current_array = self.vectors[:,search_space]
	
		GM = GaussianMixture(n_components = 2, 
							covariance_type = "full", 
							tol = 0.001, 
							reg_covar = 1e-06, 
							max_iter = 1000, 
							n_init = 25, 
							init_params = "kmeans", 
							weights_init = None, 
							means_init = None, 
							precisions_init = None, 
							random_state = None, 
							warm_start = False, 
							verbose = 0, 
							verbose_interval = 10
							)
							
		GM.fit(current_array)

		labels = GM.predict(current_array)
		unique, counts = np.unique(labels, return_counts = True)
		count_dict = dict(zip(unique, counts))
		
		return count_dict, labels
Example #5
 def gmm(nclusters, coords, n_init=50, n_iter=500):
     if USE_GAUSSIAN_MIXTURE:
         est = GaussianMixture(n_components=nclusters, n_init=n_init, max_iter=n_iter)
     else:
         est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
     est.fit(coords)
     return Partition(est.predict(coords))
Example #6
def GaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models. The function uses :func:`sklearn.micture.GaussianMixture`. See sklearn documents 
    for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters. 
    :type n_clusters: int
    """

    try:
        from sklearn.mixture import GaussianMixture
    except ImportError:
        raise ImportError('Use of this function (GaussianMixture) requires the '
                          'installation of sklearn.')
    
    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
        if n_components is None:
            n_components = 1

    n_init = kwargs.pop('n_init', 1)

    mixture = GaussianMixture(n_init=n_init, n_components=n_components, **kwargs)

    return mixture.fit_predict(V)
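A short usage sketch for this wrapper, with a random matrix standing in for the row-normalized eigenvectors V it expects:

import numpy as np

V = np.random.rand(100, 3)                 # stand-in for row-normalized eigenvectors
labels = GaussianMixture(V, n_clusters=4)  # calls the wrapper defined above
print(np.bincount(labels))                 # cluster sizes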
Example #7
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    prng = check_random_state(prng)
    g = GaussianMixture(n_mix, covariance_type=covariance_type)
    g.means_ = prng.randint(-20, 20, (n_mix, n_features))
    g.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)
    g.weights_ = normalized(prng.rand(n_mix))
    return g
def gmm(k, X, run_times=5):
    gm = GMM(k, n_init=run_times, init_params='kmeans')
    #gm = GMM(k)
    gm.fit(X)
    zh = gm.predict(X)
    mu = gm.means_
    cov = gm.covariances_
    return zh, mu, cov
def fit_gmm(samples, ncomponents=2):
    """Given a numpy array of floating point samples, fit a gaussian mixture model."""
    # assume samples is of shape (NSAMPLES,); unsqueeze to (NSAMPLES,1) and train a GMM:
    gmm = GaussianMixture(n_components=ncomponents)
    gmm.fit(samples.reshape(-1,1))
    # return params of GMM in [(weight, mean, variance)] format (covariances_ holds the variance):
    params = [(gmm.weights_[c], gmm.means_[c][0], gmm.covariances_[c][0][0]) for c in range(ncomponents)]
    return params
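A quick usage sketch for fit_gmm with synthetic bimodal samples; note that the third element of each returned tuple is the component variance:

import numpy as np
from sklearn.mixture import GaussianMixture

samples = np.concatenate([np.random.normal(0.0, 1.0, 500),
                          np.random.normal(5.0, 0.5, 500)])
for weight, mean, var in fit_gmm(samples, ncomponents=2):
    print("weight=%.2f  mean=%.2f  var=%.2f" % (weight, mean, var))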
def gmm(k, X, run_times=10, init='kmeans'):
    """GMM from sklearn library. init = {'kmeans', 'random'}, run_times
    is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    gm = GMM(k, n_init=run_times, init_params=init)
    gm.fit(X)
    zh = gm.predict(X)
    return zh
def main():
    X, Y = get_data(10000)
    print("Number of data points:", len(Y))

    model = GaussianMixture(n_components=10)
    model.fit(X)
    M = model.means_
    R = model.predict_proba(X)

    print("Purity:", purity(Y, R)) # max is 1, higher is better
    print("DBI:", DBI(X, M, R)) # lower is better
Example #12
    def fit_conditional_parameters(self, j):
        class_wise_scores = self.get_class_wise_scores(j)
        
        class_wise_parameters = dict()
        for label in self._labels:
            gmm = GaussianMixture(n_components=1)
            gmm.fit(class_wise_scores[label].reshape(-1, 1))
            
            class_wise_parameters[label] = \
                self.Gaussian(mu=gmm.means_.flatten()[0],
                              std=np.sqrt(gmm.covariances_.flatten()[0]))

        return class_wise_parameters
Example #13
class GaussianMixture1D(object):
    """
    Simple class to work with 1D mixtures of Gaussians

    Parameters
    ----------
    means : array_like
        means of component distributions (default = 0)
    sigmas : array_like
        standard deviations of component distributions (default = 1)
    weights : array_like
        weight of component distributions (default = 1)
    """
    def __init__(self, means=0, sigmas=1, weights=1):
        data = np.array([t for t in np.broadcast(means, sigmas, weights)])

        components = data.shape[0]
        self._gmm = GaussianMixture(components, covariance_type='spherical')

        self._gmm.means_ = data[:, :1]
        self._gmm.weights_ = data[:, 2] / data[:, 2].sum()
        self._gmm.covariances_ = data[:, 1] ** 2

        self._gmm.precisions_cholesky_ = 1 / np.sqrt(self._gmm.covariances_)

        self._gmm.fit = None  # disable fit method for safety

    def sample(self, size):
        """Random sample"""
        return self._gmm.sample(size)

    def pdf(self, x):
        """Compute probability distribution"""

        if x.ndim == 1:
            x = x[:, np.newaxis]
        logprob = self._gmm.score_samples(x)
        return np.exp(logprob)

    def pdf_individual(self, x):
        """Compute probability distribution of each component"""

        if x.ndim == 1:
            x = x[:, np.newaxis]
        logprob = self._gmm.score_samples(x)
        responsibilities = self._gmm.predict_proba(x)
        return responsibilities * np.exp(logprob[:, np.newaxis])
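A usage sketch for GaussianMixture1D: build a two-component mixture from arrays of means, sigmas and weights, then evaluate pdf, pdf_individual and sample:

import numpy as np

gm1d = GaussianMixture1D(means=[-1.0, 2.0], sigmas=[0.5, 1.0], weights=[0.3, 0.7])
x = np.linspace(-4.0, 6.0, 200)
print(gm1d.pdf(x).shape)             # (200,)   total density at each x
print(gm1d.pdf_individual(x).shape)  # (200, 2) per-component densities
samples, _ = gm1d.sample(1000)       # draw 1000 points from the mixture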
Example #14
    def finish(self):
        print("Calculating mean ToT for each PMT from gaussian fits...")
        gmm = GaussianMixture()
        xs, ys = [], []
        for (dom_id, channel_id), tots in self.tot_data.iteritems():
            dom = self.db.doms.via_dom_id(dom_id)
            gmm.fit(np.array(tots)[:, np.newaxis])
            mean_tot = gmm.means_[0][0]
            xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
            ys.append(mean_tot)

        fig, ax = plt.subplots()
        ax.scatter(xs, ys, marker="+")
        ax.set_xlabel("31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
        ax.set_ylabel("ToT [ns]")
        plt.title("Mean ToT per PMT")
        plt.savefig(self.plotfilename)
  def fit(self, X, Y=None):
    if self.method == 'random':
      N = len(X)
      idx = np.random.randint(N, size=self.M)
      self.samples = X[idx]
    elif self.method == 'normal':
      # just sample from N(0,1)
      D = X.shape[1]
      self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
      X, Y = self._subsample_data(X, Y)

      print("Fitting kmeans...")
      t0 = datetime.now()
      kmeans = KMeans(n_clusters=len(set(Y)))
      kmeans.fit(X)
      print("Finished fitting kmeans, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      # we will do this by finding the distance between each point
      # and all cluster centers
      # and return which points have the smallest variance
      dists = kmeans.transform(X) # returns an N x K matrix
      variances = dists.var(axis=1)
      idx = np.argsort(variances) # smallest to largest
      idx = idx[:self.M]
      self.samples = X[idx]
    elif self.method == 'gmm':
      X, Y = self._subsample_data(X, Y)

      print("Fitting GMM")
      t0 = datetime.now()
      gmm = GaussianMixture(
        n_components=len(set(Y)),
        covariance_type='spherical',
        reg_covar=1e-6)
      gmm.fit(X)
      print("Finished fitting GMM, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      probs = gmm.predict_proba(X)
      ent = stats.entropy(probs.T) # N-length vector of entropies
      idx = np.argsort(-ent) # negate since we want biggest first
      idx = idx[:self.M]
      self.samples = X[idx]
    return self
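The 'gmm' branch above can be exercised standalone: fit a spherical mixture, take the entropy of the posterior responsibilities, and keep the M highest-entropy (most ambiguous) points. A minimal sketch with synthetic data:

import numpy as np
from scipy import stats
from sklearn.mixture import GaussianMixture

X = np.random.randn(2000, 5)
M = 100

gmm = GaussianMixture(n_components=3, covariance_type='spherical', reg_covar=1e-6)
gmm.fit(X)
probs = gmm.predict_proba(X)   # (N, K) posterior responsibilities
ent = stats.entropy(probs.T)   # one entropy value per point
idx = np.argsort(-ent)[:M]     # largest entropy first = most ambiguous
samples = X[idx]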
Example #16
    def finish(self):
        print("Calculating mean ToT for each PMT from gaussian fits...")
        gmm = GaussianMixture()
        xs, ys = [], []
        df = pd.DataFrame(self.tot_data)
        for (dom_id, channel_id), data in df.groupby(['dom_id', 'channel_id']):
            tots = data['tot']
            dom = self.db.doms.via_dom_id(dom_id)
            gmm.fit(tots[:, np.newaxis])
            mean_tot = gmm.means_[0][0]
            xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
            ys.append(mean_tot)

        fig, ax = plt.subplots()
        ax.scatter(xs, ys, marker="+")
        ax.set_xlabel("31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
        ax.set_ylabel("ToT [ns]")
        plt.title("Mean ToT per PMT")
        plt.savefig(self.plotfilename)
Example #17
    def Recognize(self, fn):
        im = Image.open(fn)
        im = util.CenterExtend(im, radius=20)

        vec = np.asarray(im.convert('L')).copy()
        Y = []
        for i in range(vec.shape[0]):
            for j in range(vec.shape[1]):
                if vec[i][j] <= 200:
                    Y.append([i, j])

        gmm = GaussianMixture(n_components=7, covariance_type='tied', reg_covar=1e2, tol=1e3, n_init=9)
        gmm.fit(Y)
        
        centers = gmm.means_

        points = []
        for i in range(7):
            scoring = 0.0
            for w_i in range(3):
                for w_j in range(3):
                    p_x = centers[i][0] -1 +w_i
                    p_y = centers[i][1] -1 +w_j

                    cr = util.crop(im, p_x, p_y, radius=20)
                    cr = cr.resize((40, 40), Image.ANTIALIAS)

                    X = np.asarray(cr.convert('L'), dtype='float')
                    X = (X.astype("float") - 180) /200

                    x0 = np.expand_dims(X, axis=0)
                    x1 = np.expand_dims(x0, axis=3)

                    global model
                    if self.model.predict(x1)[0][0] < 0.5:
                        scoring += 1

            if scoring > 4:
                points.append((centers[i][0] -20, centers[i][1] -20))
                
        return points
Example #18
    def __init__(self, means=0, sigmas=1, weights=1):
        data = np.array([t for t in np.broadcast(means, sigmas, weights)])

        components = data.shape[0]
        self._gmm = GaussianMixture(components, covariance_type='spherical')

        self._gmm.means_ = data[:, :1]
        self._gmm.weights_ = data[:, 2] / data[:, 2].sum()
        self._gmm.covariances_ = data[:, 1] ** 2

        self._gmm.precisions_cholesky_ = 1 / np.sqrt(self._gmm.covariances_)

        self._gmm.fit = None  # disable fit method for safety
Example #19
    def fit(self, X_train, y_train):
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        # from sklearn.mixture import GMM as GaussianMixture
        from sklearn.mixture import GaussianMixture

        unlabels = range(0, np.max(y_train) + 1)

        for lab in unlabels:
            if self.each_class_params is not None:
                # print 'eacl'
                # print self.each_class_params[lab]
                model = GaussianMixture(**self.each_class_params[lab])
                # print 'po gmm ', model
            elif len(self.same_params) > 0:
                model = GaussianMixture(**self.same_params)
                # print 'ewe ', model
            else:
                model = GaussianMixture()
            X_train_lab = X_train[y_train == lab]
            # logger.debug('xtr lab shape ' + str(X_train_lab))
            model.fit(X_train_lab)

            self.models.insert(lab, model)



# # exercise DBSCAN
1. run DBSCAN with different 
# ## GaussianMixture
# #https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#examples-using-sklearn-mixture-gaussianmixture

# In[71]:


from sklearn.mixture import GaussianMixture

gaussian_mixture = GaussianMixture(n_components=10).fit(x)#, covariance_type='full'
y_pred = gaussian_mixture.predict(x)


cluster_center = gaussian_mixture.means_
cluster_center.shape


# In[72]:


#https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
aic_error = list()
bic_error = list()

start_ = 2
Example #21
 def find_gaussian_clusters(self):
     self.gaussian = GaussianMixture(n_components=4).fit(self.X)
Example #22
class TorchDataGenerator(SyntheticDataGenerator):
    def __init__(self, verbose=True):
        self._implicit_coef = None
        self.model = None
        self.verbose = verbose

        self.items_view = None
        self.avg_ratings_per_user = None

        self.item_ids = None
        self.user_vectors = None
        self.item_vectors = None
        self.gmm = None

    def _build_mf(self, ds: Dataset, task: Task, components):
        task.logger.report_text("Training MF")
        u, s, v = sparsesvd(ds.rating_matrix, components)

        self.user_vectors = u.T
        self.item_vectors = np.dot(np.diag(s), v).T

    def _build_torch_model(self, ds: Dataset, task: Task, components):
        task.logger.report_text("Building torch model")

        model = MatrixFactorization(ds.n_users, ds.n_items, components)
        model.user_factors = torch.nn.Embedding(ds.n_users,
                                                components,
                                                _weight=FloatTensor(
                                                    self.user_vectors))
        model.item_factors = torch.nn.Embedding(ds.n_items,
                                                components,
                                                _weight=FloatTensor(
                                                    self.item_vectors))
        self.model = model

    def _sample_unrated_items(self, ds: Dataset, batch_u, batch_i, batch_r):
        return [np.random.randint(0, ds.n_items) for _ in range(len(batch_u))]

    def _construct_loss(self, ds: Dataset, batch_r, batch_u, batch_i,
                        logloss_weight, return_loss_components):
        gt_rating = FloatTensor([batch_r])
        u_tensor = LongTensor([batch_u])
        i_tensor = LongTensor([batch_i])

        batch_unrated_items = self._sample_unrated_items(
            ds, batch_u, batch_i, batch_r)
        batch_unrated_items_tensor = LongTensor(batch_unrated_items)

        rated_prediction, rated_items_logits = self.model(u_tensor, i_tensor)
        unrated_prediction, unrated_items_logits = self.model(
            u_tensor, batch_unrated_items_tensor)

        ones_tensor = torch.ones_like(rated_items_logits)
        zeroes_tensor = torch.zeros_like(unrated_items_logits)

        proba_loss = (((rated_items_logits - ones_tensor)**2).mean() / 2 +
                      ((unrated_items_logits - zeroes_tensor)**2).mean() / 2)
        rating_loss = ((gt_rating - rated_prediction) *
                       (gt_rating - rated_prediction)).mean()
        loss = rating_loss + proba_loss * logloss_weight

        loss_components = {}
        if return_loss_components:
            loss_components = {
                "proba": proba_loss.item(),
                "rating": rating_loss.item(),
                "total": loss.item(),
            }
        return loss, loss_components

    def _train_torch_model(self, ds: Dataset, task: Task, epochs,
                           logloss_weight, lr, batch_size):
        task.logger.report_text("Training torch model")

        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        loss_components_list = []
        batch_u, batch_i, batch_r = [], [], []

        if self.items_view is None:
            m2 = dok_matrix(ds.rating_matrix)
            self.items_view = list(m2.items())

        for e in range(epochs):
            shuffle(self.items_view)
            for j, ((u, i), r) in enumerate(tqdm(self.items_view)):
                if (len(batch_u) > 0) and (j % batch_size
                                           == 0):  # or j == len(m2.items())):
                    optimizer.zero_grad()

                    loss, loss_components = self._construct_loss(
                        ds, batch_r, batch_u, batch_i, logloss_weight,
                        self.verbose)

                    loss_components_list.append(loss_components)

                    loss.backward()
                    optimizer.step()
                    batch_u = []
                    batch_i = []
                    batch_r = []

                batch_u.append(u)
                batch_r.append(r)
                batch_i.append(i)

            if loss_components_list:
                average_metrics = {
                    component:
                    float(np.mean([x[component]
                                   for x in loss_components_list]))
                    for component in loss_components_list[0]
                }

                if self.verbose:
                    print(
                        "Epoch ", e,
                        "\t".join("%s: %f" % (k, v)
                                  for k, v in average_metrics.items()))

                for component in average_metrics:
                    task.logger.report_scalar("Loss", component,
                                              average_metrics[component], e)

    def _build_gmm(self, task: Task, gmm_clusters):
        task.logger.report_text("Building GMM")
        self.gmm = GaussianMixture(gmm_clusters, verbose=2, verbose_interval=1)
        self.gmm.fit(self.user_vectors)

    def build(self,
              task: Task,
              base_dataset: Dataset,
              epochs=50,
              components=200,
              gmm_clusters=10,
              logloss_weight=2.0,
              lr=5e-3,
              batch_size=5000):
        task.set_user_properties(
            **to_clear_ml_params(locals(), ["task", "self"]))
        self.item_ids = list(range(len(base_dataset.id_to_item)))
        self.avg_ratings_per_user = base_dataset.n_ratings / base_dataset.n_users

        if self.user_vectors is None:
            self._build_mf(base_dataset, task, components)

        self._build_torch_model(base_dataset, task, components)

        self._train_torch_model(base_dataset,
                                task,
                                epochs,
                                logloss_weight=logloss_weight,
                                lr=lr,
                                batch_size=batch_size)

        self._build_gmm(task, gmm_clusters)

    def _sample_users(self, n_users):
        user_vectors, _ = self.gmm.sample(n_users)
        ratings_per_user = np.random.exponential(self.avg_ratings_per_user,
                                                 size=n_users)

        return user_vectors, ratings_per_user

    def _sample_user_items(self, dotproducts, n_items):
        # MAGIC HERE
        # We transform arbitrary scores trained with RMSE loss
        # into probabilities
        # we can do this in a bunch of ways
        # hence, sigmoid and * 10 here
        dotproducts = dotproducts * self._implicit_coef
        logits = 1.0 / (1e-7 + np.exp(-dotproducts))
        logits = logits / np.sum(logits)

        sampled_items = np.random.choice(self.item_ids,
                                         n_items,
                                         False,
                                         p=logits)

        return sampled_items

    def _get_user_rating(self, user_vector, item):
        sampled_rating = user_vector.dot(self.item_vectors[item])
        return sampled_rating

    def generate(self,
                 task,
                 n_users=None,
                 use_actual_user_vectors=False,
                 use_actual_item_choice=False,
                 implicit_coef=15.0,
                 **kwargs):
        task.set_user_properties(
            **to_clear_ml_params(locals(), ["task", "self"]))
        self._implicit_coef = implicit_coef

        if n_users is None:
            n_users = self.user_vectors.shape[0]

        user_vectors, ratings_per_user = self._sample_users(n_users)

        n_items = self.item_vectors.shape[0]
        rating_matrix = dok_matrix((n_users, n_items))

        batches = max(1, int(n_users / 1000))
        for batch_n, user_vectors_batch in enumerate(
                tqdm(np.array_split(user_vectors, batches))):
            user_index_base = batch_n * 1000

            # Super-efficient batched matrix multiplication that exploits pytorch (=GPU)
            probas = self.model.user_choice_probas(user_vectors_batch)

            for user_index_offset in range(len(probas)):
                u = user_index_base + user_index_offset

                ratings_count = int(ratings_per_user[u])
                ratings_count = min(ratings_count, n_items)

                v = user_vectors[user_index_offset, :]

                sampled_items = self._sample_user_items(
                    probas[user_index_offset, :], ratings_count)

                for i in sampled_items:
                    rating_matrix[u, i] = self._get_user_rating(v, i)
                    rating_matrix[u, i] = np.minimum(
                        1.0, np.maximum(rating_matrix[u, i], -1.0))
        rating_matrix = csc_matrix(rating_matrix)
        ds = Dataset(rating_matrix=rating_matrix)
        return ds
Example #23
'''
Gaussian mixture clustering
'''

print(__doc__)

from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
from plot_function.cluster_plot import plot_cluster
from sklearn.preprocessing import StandardScaler

iris = load_iris()
data = iris.data
target = iris.target

train_data = StandardScaler().fit_transform(data)

gm = GaussianMixture(n_components=4)
gm.fit(train_data)

labels = gm.predict(train_data)

plot_cluster(train_data, labels)

# labels=gm.labels_
# center=gm.cluster_centers_
# print(gm.inertia_)
# plot_cluster(data,labels)
Example #24
    plt.ylabel("Log likelihood")
    plt.legend(['lowest component likelihood'])
    plt.show()
    """
    pca = FA(n_components=3)
    Z = pca.fit_transform(X)

    for k in ks:
        clust = KMeans(n_clusters=k).fit(Z)
        W = clust.predict(Z)
        ss[k - 1] = clust.inertia_

    plt.plot(ks, ss)
    plt.title("Wine Quality - KM")
    plt.xlabel("# of clusters")
    plt.ylabel("Sum of Squares")
    plt.legend(["kmeans"])
    plt.show()

    for k in ks:
        clust = GaussianMixture(n_components=k).fit(Z)
        W = clust.predict(Z)
        ll[k - 1] = clust.score(Z)

    plt.plot(ks, ll)
    plt.title("Wine Quality - EM")
    plt.xlabel("# of clusters")
    plt.ylabel("log of likelihood")
    plt.legend(["EM"])
    plt.show()
Example #25
def find_num_clusters(max_clusters):
    # SETUP #
    data = pd.read_csv('./data/scaled.csv')
    x = data.values
    range_n_clusters = range(2, max_clusters + 1)
    all_silhouette_scores = []

    # CLUSTER ITERATION #
    for n_clusters in range_n_clusters:
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.2, 1]
        plt.xlim([-0.2, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        plt.ylim([0, len(x) + (n_clusters + 1) * 10])
        # Initialize the clusterer with n_clusters value
        clusterer = GaussianMixture(n_clusters)
        cluster_labels = clusterer.fit_predict(x)
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(x, cluster_labels)
        all_silhouette_scores.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(x, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.get_cmap("Spectral")(float(i) / n_clusters)
            plt.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)
            # Label the silhouette plots with their cluster numbers at the middle
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples
        plt.title(("Silhouette analysis for GMM clustering with %d clusters" %
                   n_clusters),
                  fontsize=10,
                  fontweight='bold')
        plt.xlabel("Silhouette coefficient values")
        plt.ylabel("Cluster label")
        # The vertical line for average silhouette score of all the values
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.yticks([])  # Clear the yaxis labels / ticks
        plt.xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.savefig(f'./graphs/silhouette_{n_clusters}_clusters.png')
        plt.show()
    plt.scatter(range_n_clusters, all_silhouette_scores)
    plt.plot(range_n_clusters, all_silhouette_scores)
    plt.xticks(range_n_clusters)
    plt.title("Silhouette Scores of Clusters")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Scores")
    plt.savefig(f'./graphs/silhouette_scores.png')
    plt.show()
    data_df = pd.DataFrame(data)
    writer = pd.ExcelWriter(path)
    data_df.to_excel(writer, 'page_1', float_format='%.5f') # float_format controls the precision
    writer.save()

data_path = 'D:/ZLW_data/SAMM/flow_cut_eye'
frame_list = read_img(data_path, 0)
data_set = np.empty([0, 256*3], dtype=np.float32)
for i in range(len(frame_list)):
    count_b, count_g, count_r = calc_bgr_count(frame_list[i])
    m = bgr_count_form(count_b, count_g, count_r)
    m = np.array([m])
    data_set = np.concatenate([data_set, m])
    print(data_set.shape)

gmm = GaussianMixture(n_components=2).fit(data_set)
labels = gmm.predict(data_set)
save_data_to_excel(labels, 'D:/guass/labels.xlsx')
frame_list0 = []
frame_list1 = []
for i in range(labels.shape[0]):
    if labels[i] == 0:
        print(str(i) + ' is label 1')
        frame_list0.append(i)
for i in range(labels.shape[0]):
    if labels[i] == 1:
        print(str(i) + ' is label 0')
        frame_list1.append(i)
frame_list0 = np.transpose(np.array([frame_list0]))
frame_list1 = np.transpose(np.array([frame_list1]))
save_data_to_excel(frame_list0, 'D:/guass/frame00.xlsx')
Example #27
def test_gaussian_mixture_attributes():
    # test bad parameters
    rng = np.random.RandomState(0)
    X = rng.rand(10, 2)

    n_components_bad = 0
    gmm = GaussianMixture(n_components=n_components_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'n_components': %d "
        "Estimation requires at least one component" % n_components_bad,
        gmm.fit, X)

    # covariance_type should be in [spherical, diag, tied, full]
    covariance_type_bad = 'bad_covariance_type'
    gmm = GaussianMixture(covariance_type=covariance_type_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'covariance_type': %s "
        "'covariance_type' should be in "
        "['spherical', 'tied', 'diag', 'full']" % covariance_type_bad, gmm.fit,
        X)

    tol_bad = -1
    gmm = GaussianMixture(tol=tol_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'tol': %.5f "
        "Tolerance used by the EM must be non-negative" % tol_bad, gmm.fit, X)

    reg_covar_bad = -1
    gmm = GaussianMixture(reg_covar=reg_covar_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'reg_covar': %.5f "
        "regularization on covariance must be "
        "non-negative" % reg_covar_bad, gmm.fit, X)

    max_iter_bad = 0
    gmm = GaussianMixture(max_iter=max_iter_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'max_iter': %d "
        "Estimation requires at least one iteration" % max_iter_bad, gmm.fit,
        X)

    n_init_bad = 0
    gmm = GaussianMixture(n_init=n_init_bad)
    assert_raise_message(
        ValueError, "Invalid value for 'n_init': %d "
        "Estimation requires at least one run" % n_init_bad, gmm.fit, X)

    init_params_bad = 'bad_method'
    gmm = GaussianMixture(init_params=init_params_bad)
    assert_raise_message(
        ValueError,
        "Unimplemented initialization method '%s'" % init_params_bad, gmm.fit,
        X)

    # test good parameters
    n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1
    covariance_type, init_params = 'full', 'random'
    gmm = GaussianMixture(n_components=n_components,
                          tol=tol,
                          n_init=n_init,
                          max_iter=max_iter,
                          reg_covar=reg_covar,
                          covariance_type=covariance_type,
                          init_params=init_params).fit(X)

    assert gmm.n_components == n_components
    assert gmm.covariance_type == covariance_type
    assert gmm.tol == tol
    assert gmm.reg_covar == reg_covar
    assert gmm.max_iter == max_iter
    assert gmm.n_init == n_init
    assert gmm.init_params == init_params
### Step-1 ###
oof = np.zeros(len(train))
pred = np.zeros(len(test))
for i in range(MAX_MAGIC_NO):
    print('.', end='')
    oof_i_list = []
    pred_i_list = []
    train_i = train[magic_tr == i][:, infomative_cols[i]]
    target_i = target[magic_tr == i]
    test_i = test[magic_te == i][:, infomative_cols[i]]
    for n in range(1, MAX_COMPONENTS):
        oof_i_n = np.zeros(len(train_i))
        pred_i_n = np.zeros(len(test_i))
        gmm0 = GaussianMixture(n_components=n,
                               covariance_type='full',
                               random_state=RANDOM_SEED)
        gmm1 = GaussianMixture(n_components=n,
                               covariance_type='full',
                               random_state=RANDOM_SEED)

        for trn_idx, val_idx in kfold.split(train_i, target_i):
            trn_train = train_i[trn_idx, :]
            trn_target = target_i[trn_idx]
            val_train = train_i[val_idx, :]
            gmm0.fit(trn_train[trn_target == 0])
            gmm1.fit(trn_train[trn_target == 1])
            oof_i_n[val_idx] = gmm1.score_samples(
                val_train) - gmm0.score_samples(val_train)
            pred_i_n += (gmm1.score_samples(test_i) -
                         gmm0.score_samples(test_i)) / kfold.n_splits
GMMs will be trained separately on each class's TFIDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))
'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features.
Each row of TFIDFsummed is a feature vector on which we train a GMM.
'''
GMMS = []
for class_num in range(1, 16):
    # ATTENTION: indexes of TF go from 0 - 14
    #            whereas the class numbers go from 1 - 15
    GMMS.append(
        GaussianMixture(n_components=gmm_components).fit(
            TFIDF_class[class_num - 1]))
'''
Testing
'''
test_labels = []
with open('data/final.test') as test_file:
    testsamples = test_file.readlines()
    num_of_test_data = 0
    #count rows
    for line in testsamples:
        num_of_test_data += 1
        testwords = line.split()
        test_labels.append(testwords[0])
test_file.closed
'''
Find the term document matrix from the test data
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)

    x = np.vstack((x1, x2))
    y = np.array([0] * N1 + [1] * N2)

    '''
        spherical: circular (a single variance per component)
        diag: diagonal covariance
        tied: all components share the same covariance
        full: each component's covariance may differ
    '''
    types = ('spherical', 'diag', 'tied', 'full')
    err = np.empty(len(types))
    bic = np.empty(len(types))
    for i, type in enumerate(types):
        gmm = GaussianMixture(n_components=2, covariance_type=type, random_state=0)
        gmm.fit(x)
        err[i] = 1 - accuracy_rate(gmm.predict(x), y)
        bic[i] = gmm.bic(x)
    print('Error rate:', err.ravel())
    print('BIC:', bic.ravel())

    # plot
    xpos = np.arange(4)
    ax = plt.axes()
    # -0.3~0 || 0.7~1 || 1.7~2 || 2.7~3
    b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0')
    # 0~0.3 || 1~1.3 || 2~2.3 || 3~3.3
    b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080')
    plt.grid(True)
    bic_min, bic_max = expand(bic.min(), bic.max())
Example #31
X = mat_data['X']
y = mat_data['y'].squeeze()
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
#X_old = X
#X = np.hstack([X,X])
N, M = X.shape
C = len(classNames)
# Number of clusters
K = 10
cov_type = 'full'
# type of covariance, you can try out 'diag' as well
reps = 1
# number of fits with different initializations, best result will be kept
# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K, covariance_type=cov_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)
# extract cluster labels
cds = gmm.means_
# extract cluster centroids (means of gaussians)
covs = gmm.covariances_
# extract cluster shapes (covariances of gaussians)
if cov_type.lower() == 'diag':
    new_covs = np.zeros([K, M, M])

    count = 0
    for elem in covs:
        temp_m = np.zeros([M, M])
        new_covs[count] = np.diag(elem)
        count += 1
Example #32
#### rh

print('getting intensity values for the mask ....')

maskObj = nib.load(
    '{subjects_dir}/{subj}/mri/rh_choroid+ventricle_mask.nii.gz'.format(
        subjects_dir=subjects_dir, subj=subj))
mask = maskObj.get_data()
mask_indices = np.where(mask)
mask_indices_array = np.array(mask_indices)
mask_T1_vals = T1[mask_indices]

## GMM
X = np.reshape(mask_T1_vals, (-1, 1))
gmm = GaussianMixture(n_components=2, covariance_type='full').fit(X)
gmmb = BayesianGaussianMixture(n_components=2, covariance_type='full').fit(X)
save_segmentation(gmmb, 'rh_choroid_gmmb_mask.nii.gz')

## susan
input_img = '{subjects_dir}/{subj}/mri/rh_choroid_gmmb_mask.nii.gz'.format(
    subjects_dir=subjects_dir, subj=subj)
susan(input_img)

## read choroid_gmmb_mask_susan.nii.gz
choroid_gmmb_mask = nib.load(
    '{subjects_dir}/{subj}/mri/rh_choroid_gmmb_mask.nii.gz'.format(
        subjects_dir=subjects_dir, subj=subj))
choroid_gmmb_mask_ = choroid_gmmb_mask.get_data()

choroid_gmmb_susan = nib.load(
#PCA
LABEL_DIM = 10
x = X_gen.reshape(N, 28 * 28).detach().numpy()

from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.mixture import GaussianMixture

x_pca = PCA(n_components=2).fit_transform(x)
df_pca = pd.DataFrame(
    x_pca, columns=["principal component 1", "principal component 2"])
df_pca_labels = pd.concat(
    [df_pca, pd.DataFrame(np.array(Y_gen), columns=["labels"])], axis=1)

gmm_pred_labels = GaussianMixture(n_components=LABEL_DIM,
                                  reg_covar=1e-5).fit_predict(x)

df_pca_gmm_labels = pd.concat(
    [df_pca_labels,
     pd.DataFrame(gmm_pred_labels, columns=["gmm_labels"])],
    axis=1)

from collections import Counter

asgnd_gmm_labels = np.unique(np.array(
    df_pca_gmm_labels["gmm_labels"])).astype(int)
corr_gmm_labels = []

for i in asgnd_gmm_labels:
    most_common = Counter(df_pca_gmm_labels[df_pca_gmm_labels["gmm_labels"] ==
                                            i]["labels"]).most_common()[0][0]
        no = DF[DF['xAttack'] == C].shape[0]
        #print(no,Clus[i].shape[0])
        if (Maxi < no):
            Class = C
            Maxi = no
    impurity[i] = Maxi / DF.shape[0]
    purity[i].append(impurity[i])
print(impurity)

# In[ ]:

from sklearn.mixture import GaussianMixture

# In[ ]:

GMM = GaussianMixture(n_components=5).fit_predict(df1)
clustering_method.append('GMM')

# In[ ]:

#print(GMM)

# In[ ]:

Inp['predict'] = np.array(GMM) + 1

# In[ ]:

impurity = {}
for i in range(1, 6):
    Maxi = 0
Example #35
def visualize_clusters(n_clusters, dim):
    data = pd.read_csv('./data/scaled.csv')
    x = data.values
    clusterer = GaussianMixture(n_clusters)
    cluster_labels = clusterer.fit_predict(x)
    scaler = StandardScaler().fit(pd.read_csv('./data/cleaned.csv'))
    cluster_label_means = []
    cluster_label_stds = []
    for n in range(n_clusters):
        print(
            f'Number of Plays in Cluster {n + 1}: {len(x[cluster_labels == n])}'
        )
        means = np.average(scaler.inverse_transform(x)[cluster_labels == n],
                           axis=0).round(4)
        stds = np.std(scaler.inverse_transform(x)[cluster_labels == n],
                      axis=0).round(4)
        cluster_label_means.append(means)
        cluster_label_stds.append(stds)
    DataFrame(cluster_label_means, columns=data.columns)\
        .to_csv("./data/Cluster_Means.csv")
    DataFrame(cluster_label_stds, columns=data.columns)\
        .to_csv("./data/Cluster_STDs.csv")
    # One dimension
    if dim == 1:
        tsne_1d = TSNE(n_components=1)
        pca_1d = PCA(n_components=1)
        tcs_1d = pd.DataFrame(tsne_1d.fit_transform(x))
        pcs_1d = pd.DataFrame(pca_1d.fit_transform(x))
        tcs_1d.columns = ["TC1_1d"]
        pcs_1d.columns = ["PC1_1d"]
        plot_x_tsne = pd.concat([data, tcs_1d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_1d], axis=1, join='inner')
        plot_x_tsne["zero"] = 0
        plot_x_pca["zero"] = 0
    # Two dimensions
    elif dim == 2:
        tsne_2d = TSNE(n_components=2)
        pca_2d = PCA(n_components=2)
        tcs_2d = pd.DataFrame(tsne_2d.fit_transform(x))
        pcs_2d = pd.DataFrame(pca_2d.fit_transform(x))
        tcs_2d.columns = ["TC1_2d", "TC2_2d"]
        pcs_2d.columns = ["PC1_2d", "PC2_2d"]
        plot_x_tsne = pd.concat([data, tcs_2d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_2d], axis=1, join='inner')
    # Three dimensions
    elif dim == 3:
        tsne_3d = TSNE(n_components=3)
        pca_3d = PCA(n_components=3)
        tcs_3d = pd.DataFrame(tsne_3d.fit_transform(x))
        pcs_3d = pd.DataFrame(pca_3d.fit_transform(x))
        tcs_3d.columns = ["TC1_3d", "TC2_3d", "TC3_3d"]
        pcs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]
        plot_x_tsne = pd.concat([data, tcs_3d], axis=1, join='inner')
        plot_x_pca = pd.concat([data, pcs_3d], axis=1, join='inner')
    else:
        print("Invalid Dimension...")
        return
    graph_colors = [
        'red', 'blue', 'green', 'yellow', 'pink', 'purple', 'black',
        'lightskyblue', 'orange', 'darkred', 'salmon', 'cyan', 'lime',
        'slategray', 'teal', 'peru', 'orchid', 'crimson', 'thistle', 'lavender'
    ]
    make_visualization('T', dim, n_clusters, plot_x_tsne, cluster_labels,
                       graph_colors)
    make_visualization('P', dim, n_clusters, plot_x_pca, cluster_labels,
                       graph_colors)
            yticklabels=digits.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()

# check for accuracy of the classification
accuracy_score(y_test, labels)

##########################################################################################
########################### GMM model ####################################################
##########################################################################################
data = X_train.data
# np.random.seed(1)

# Your code here
gmm_model = GMM(n_components=10, covariance_type='full', random_state=1)
gmm_model.fit(data)
print(gmm_model.converged_)

# Extract the means as well as the covariances
# Your code here
mns = gmm_model.means_
covs = gmm_model.covariances_

# Reshape the images
im = mns.reshape(10, 8, 8)

# Don't change this code
# Figure size in inches
fig = plt.figure(figsize=(8, 3))
Example #37
of operators is for model *GaussianMixture*.
"""
import os
from timeit import timeit
import numpy as np
import matplotlib.pyplot as plt
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
from onnxruntime import InferenceSession
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from skl2onnx import to_onnx

data = load_iris()
X_train, X_test = train_test_split(data.data)
model = GaussianMixture()
model.fit(X_train)

###################################
# Default conversion
# ++++++++++++++++++

model_onnx = to_onnx(model,
                     X_train[:1].astype(np.float32),
                     options={id(model): {
                                  'score_samples': True
                              }},
                     target_opset=12)
sess = InferenceSession(model_onnx.SerializeToString())

xt = X_test[:5].astype(np.float32)
Example #38
def test_warm_start(seed):
    random_state = seed
    rng = np.random.RandomState(random_state)
    n_samples, n_features, n_components = 500, 2, 2
    X = rng.rand(n_samples, n_features)

    # Assert the warm_start give the same result for the same number of iter
    g = GaussianMixture(n_components=n_components,
                        n_init=1,
                        max_iter=2,
                        reg_covar=0,
                        random_state=random_state,
                        warm_start=False)
    h = GaussianMixture(n_components=n_components,
                        n_init=1,
                        max_iter=1,
                        reg_covar=0,
                        random_state=random_state,
                        warm_start=True)

    g.fit(X)
    score1 = h.fit(X).score(X)
    score2 = h.fit(X).score(X)

    assert_almost_equal(g.weights_, h.weights_)
    assert_almost_equal(g.means_, h.means_)
    assert_almost_equal(g.precisions_, h.precisions_)
    assert score2 > score1

    # Assert that by using warm_start we can converge to a good solution
    g = GaussianMixture(n_components=n_components,
                        n_init=1,
                        max_iter=5,
                        reg_covar=0,
                        random_state=random_state,
                        warm_start=False,
                        tol=1e-6)
    h = GaussianMixture(n_components=n_components,
                        n_init=1,
                        max_iter=5,
                        reg_covar=0,
                        random_state=random_state,
                        warm_start=True,
                        tol=1e-6)

    g.fit(X)
    assert not g.converged_

    h.fit(X)
    # depending on the data there is large variability in the number of
    # refits necessary to converge due to the complete randomness of the
    # data
    for _ in range(1000):
        h.fit(X)
        if h.converged_:
            break
    assert h.converged_
Example #39
def fit_GMM(n):
    model = GaussianMixture(n, covariance_type='full', random_state=0).fit(training_data)
    pickle.dump(model, open(outdirr+savename+str(n)+'.gmm', 'wb'))
    return
Example #40
    print("Auto-Encoder with GMM Clustering")
    k = 10  # Number of clusters

    print("Loading dataset...")
    ((x_train, y_train), (x_test,
                          y_test)) = keras.datasets.fashion_mnist.load_data()
    x_train = np.reshape(x_train, (x_train.shape[0], 784))
    x_train = x_train / 255.0
    x_test = np.reshape(x_test, (x_test.shape[0], 784))
    x_test = x_test / 255.0

    # Use auto encoder to reduce dimensionality, returns compressed rep of x_train, x_test
    cx_train, cx_test = autoencode(x_train, x_test)

    # Perform GMM clustering
    print("Training GMM...")
    gmm = GaussianMixture(n_components=k)
    gmm.fit(cx_train)
    print("Clustering training data...")
    clusterAssmentTrain = gmm.predict(cx_train)
    print("Clustering test data...")
    clusterAssmentTest = gmm.predict(cx_test)
    print("Done!")

    # Compute Metrics
    print("Training Metrics:")
    evaluate_clusters(10, clusterAssmentTrain, y_train)
    print("Testing Metrics:")
    evaluate_clusters(10, clusterAssmentTest, y_test)

    plt.show()
Example #41
from sklearn.mixture import GaussianMixture as GMM
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    X = getAdultX()
    y = getAdultY()

    tester = emtc.ExpectationMaximizationTestCluster(X,
                                                     y,
                                                     clusters=range(1, 11),
                                                     plot=True,
                                                     targetcluster=2,
                                                     stats=True)
    tester.run()

    # plot clustering
    gmm = GMM(covariance_type='diag', n_components=3)
    model = gmm.fit(X)
    labels = model.predict(X)
    # View the results
    # Set the size of the plot
    plt.figure(figsize=(14, 7))

    # Create a colormap
    colormap = np.array(['red', 'lime', 'black', 'blue', 'yellow'])
    x1 = X.iloc[:, 0]
    x2 = X.iloc[:, 1]
    plt.scatter(x=x1, y=x2, c=colormap[labels], s=40)
    plt.title('adult EM Classification')
Example #42
class OWCK(GaussianProcess_extra):
    """The Optimal Weighted Cluster Kriging/Gaussian Process class
    
    This class inherited from GaussianProcess class in sklearn library
    Most of the parameters are contained in sklearn.gaussian_process.
    
    Please check the docstring of Gaussian Process parameters in sklearn.
    Only newly introduced parameters are documented below.

    Parameters
    ----------
    n_cluster : int, optional
        The number of clusters, determines the number of the Gaussian Process
        model to build. It is the speed-up factor in OWCK.
    min_leaf_size : int, optional
        if min_leaf_size > 0, min_leaf_size is used to determine the number of clusters for
        the model tree clustering method.
    cluster_method : string, optional
        The clustering algorithm used to partition the data set.
        Built-in clustering algorithm are:
            'k-mean', 'GMM', 'fuzzy-c-mean', 'random', 'tree'
            Note that GMM and fuzzy-c-mean are fuzzy clustering algorithms.
            With these algorithms you can set the overlap you desire.
            Tree is a non-fuzzy algorithm using local models per leaf in a regression tree
            The tree algorithm is also able to update the model with new records
    overlap : float, optional
        The percentage of overlap when using a fuzzy cluster method.
        Each cluster will be of the same size.
    is_parallel : boolean, optional
        A boolean switching parallel model fitting on. If it is True, then
        all the underlying Gaussian Process model will be fitted in parallel,
        supported by MPI. Otherwise, all the models will be fitted sequentially.
        
    Attributes
    ----------
    cluster_label : the cluster label of the training set after clustering
    clusterer : the clustering algorithm used.
    models : a list of (fitted) Gaussian Process models built on each cluster.
    
    References
    ----------
    
    .. [SWKBE15] `Bas van Stein, Hao Wang, Wojtek Kowalczyk, Thomas Baeck 
        and Michael Emmerich. Optimally Weighted Cluster Kriging for Big 
        Data Regression. In 14th International Symposium, IDA 2015, pages 
        310-321, 2015`
        http://link.springer.com/chapter/10.1007%2F978-3-319-24465-5_27#
    """
    def __init__(self,
                 regr='constant',
                 corr='squared_exponential',
                 n_cluster=8,
                 min_leaf_size=0,
                 cluster_method='k-mean',
                 overlap=0.0,
                 beta0=None,
                 storage_mode='full',
                 verbose=False,
                 theta0=0.1,
                 thetaL=None,
                 thetaU=None,
                 sigma2=None,
                 optimizer='BFGS',
                 random_start=1,
                 normalize=False,
                 nugget=10. * MACHINE_EPSILON,
                 random_state=None,
                 nugget_estim=True,
                 is_parallel=False):

        super(OWCK, self).__init__(regr=regr,
                                   corr=corr,
                                   beta0=beta0,
                                   verbose=verbose,
                                   theta0=theta0,
                                   thetaL=thetaL,
                                   thetaU=thetaU,
                                   sigma2=sigma2,
                                   optimizer=optimizer,
                                   random_start=random_start,
                                   normalize=normalize,
                                   nugget=nugget,
                                   nugget_estim=nugget_estim,
                                   random_state=random_state)

        self.empty_model = GaussianProcess_extra(regr=regr,
                                                 corr=corr,
                                                 beta0=beta0,
                                                 verbose=verbose,
                                                 theta0=theta0,
                                                 thetaL=thetaL,
                                                 thetaU=thetaU,
                                                 sigma2=sigma2,
                                                 optimizer=optimizer,
                                                 random_start=random_start,
                                                 normalize=normalize,
                                                 nugget=nugget,
                                                 nugget_estim=nugget_estim,
                                                 random_state=random_state)

        self.n_cluster = n_cluster
        self.is_parallel = is_parallel
        self.verbose = verbose
        self.overlap = overlap  #overlap for fuzzy clusters
        self.min_leaf_size = min_leaf_size
        self.regr_label = regr
        self.fitted = False

        if cluster_method not in [
                'k-mean', 'GMM', 'fuzzy-c-mean', 'random', 'tree'
        ]:
            raise Exception(
                '{} clustering is not supported!'.format(cluster_method))
        else:
            self.cluster_method = cluster_method

    def __clustering(self, X, y=None):
        """
        The clustering procedure of the Optimal Weighted Clustering Gaussian 
        Process. This function should not be called externally
        """

        self.sizeX = len(X)

        if self.cluster_method == 'k-mean':
            clusterer = KMeans(n_clusters=self.n_cluster)
            clusterer.fit(X)
            self.cluster_label = clusterer.labels_
            self.clusterer = clusterer
        elif self.cluster_method == 'tree':

            if (self.min_leaf_size > 0):
                self.minsamples = self.min_leaf_size
                tree = IncrementalRegressionTree(
                    min_samples_leaf=self.min_leaf_size)
            else:
                self.minsamples = int(len(X) / (self.n_cluster))
                tree = IncrementalRegressionTree(
                    min_samples_leaf=self.minsamples)

            tree.fit(X, y)
            labels = tree.apply(X)
            clusters = np.unique(labels)
            k = len(clusters)
            if self.verbose:
                print("leafs:", k)
            self.n_cluster = k
            self.leaf_labels = np.unique(labels)
            self.cluster_label = labels
            self.clusterer = tree

        elif self.cluster_method == 'random':
            r = self.n_sample % self.n_cluster
            m = (self.n_sample - r) / self.n_cluster
            self.cluster_label = array(range(self.n_cluster) * m + range(r))
            self.clusterer = None
            shuffle(self.cluster_label)
        elif self.cluster_method == 'GMM':  #GMM from sklearn
            self.clusterer = GaussianMixture(n_components=self.n_cluster,
                                             n_init=10)
            self.clusterer.fit(X)
            self.cluster_labels_proba = self.clusterer.predict_proba(X)
            self.cluster_label = self.clusterer.predict(X)
        elif self.cluster_method == 'fuzzy-c-mean':  #Fuzzy C-means from sklearn
            cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X.T,
                                                             self.n_cluster,
                                                             2,
                                                             error=0.000005,
                                                             maxiter=10000,
                                                             init=None)
            self.clusterer = cntr  #save the centers for cmeans_predict
            self.cluster_labels_proba = u.T
            self.cluster_labels_proba = np.array(self.cluster_labels_proba)
            self.cluster_label = np.argmax(u, axis=0)
            self.cluster_label = np.array(self.cluster_label)

    def __fit(self, X, y):
        """
        The Optimal Weighted Cluster Gaussian Process model fitting method.
        Parameters
        ----------
        X : double array_like
            An array with shape (n_samples, n_features) with the input at which
            observations were made.
        y : double array_like
            An array with shape (n_samples, ) or shape (n_samples, n_targets)
            with the observations of the output to be predicted.
        Returns
        -------
        ocwk : self
            A fitted Cluster Gaussian Process model object awaiting data to 
            perform predictions.
        """
        self.n_sample, self.n_feature = X.shape

        if y.shape[0] != self.n_sample:
            raise Exception('Training input and target do not match!')

        # clustering
        self.__clustering(X, y)

        # model creation
        self.models = None
        if (self.cluster_method == 'tree'):
            self.models = [deepcopy(self) for i in self.leaf_labels]
        else:
            self.models = [deepcopy(self) for i in range(self.n_cluster)]

        for m in self.models:
            m.__class__ = GaussianProcess_extra

        self.X = X
        self.y = y

        # model fitting
        if self.is_parallel:  # parallel model fitting
            #now using parallel threading
            #
            # prepare the training set for each GP model

            if (self.cluster_method == 'k-mean'
                    or self.cluster_method == 'random'):
                idx = [self.cluster_label == i for i in range(self.n_cluster)]
            elif (self.cluster_method == 'tree'):
                idx = [
                    self.cluster_label == self.leaf_labels[i]
                    for i in range(self.n_cluster)
                ]
                if (self.verbose):
                    print "len cluster", len(idx)
            else:
                targetMemberSize = (len(self.X) /
                                    self.n_cluster) * (1.0 + self.overlap)
                idx = []

                minindex = np.argmin(self.y)
                maxindex = np.argmax(self.y)
                for i in range(self.n_cluster):
                    idx_temp = np.argsort(
                        self.cluster_labels_proba[:, i])[-int(targetMemberSize):]
                    if (minindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp, [minindex]))
                    if (maxindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp, [maxindex]))
                    idx.append(idx_temp)

            training = [(X[index, :], y[index]) for index in idx]
            training_set = zip(range(self.n_cluster),
                               deepcopy(self.models), training)

            pool = Pool(self.n_cluster)
            models = pool.map(train_modelstar, training_set)
            pool.close()
            pool.join()
            self.models = models

            #print models
            #
            '''
            raise Exception('Parallel mode has been disabled for now.')
            # spawning processes...
            #os.chdir('/home/wangronin')
            comm = MPI.COMM_SELF.Spawn(sys.executable, 
                                       args=['-m', 'owck.OWCK_slave'],
                                       maxprocs=self.n_cluster)
            
            # prepare the training set for each GP model    
                
            if (self.cluster_method=='k-mean' or self.cluster_method=='random'):
                idx = [self.cluster_label == i for i in range(self.n_cluster)]
            elif (self.cluster_method=='tree'):
                idx = [self.cluster_label == self.leaf_labels[i] for i in range(self.n_cluster)]
                if (verbose):
                    print "len cluster",len(idx)
            else:
                targetMemberSize = (len(self.X) / self.n_cluster)*(1.0+self.overlap)
                idx = []

                minindex = np.argmin(self.y)
                maxindex = np.argmax(self.y)
                for i in range(self.n_cluster):
                    idx_temp = np.argsort(self.cluster_labels_proba[:,i])[-targetMemberSize:]
                    if (minindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp,[minindex]))
                    if (maxindex not in idx_temp):
                        idx_temp = np.hstack((idx_temp,[maxindex]))
                    idx.append(idx_temp)


            training_set = [(X[index, :], y[index]) for index in idx]
            
            # scatter the models and data
            comm.scatter([(k, training_set[k]) \
                for k in range(self.n_cluster)], root=MPI.ROOT)
            comm.scatter(self.models, root=MPI.ROOT)
           
            
            # Synchronization while the slave process are performing 
            # heavy computations...
            comm.Barrier()
                
            # Gether the fitted model from the childrenn process
            # Note that 'None' is only valid in master-slave working mode
            results = comm.gather(None, root=MPI.ROOT)
            
            # keep the fitted model align with their cluster
            fitted = DataFrame([[d['index'], d['model']] \
                for d in results], columns=['index', 'model'])
            fitted.sort('index', ascending=[True], inplace=True)
            
            self.models[:] = fitted['model']
                
            # free all slave processes
            comm.Disconnect()
            '''

        else:  # sequential model fitting
            # get min and max value indexes such that no cluster gets
            # only one value instances.
            #            minindex = np.argmin(self.training_y)
            #            maxindex = np.argmax(self.training_y)
            for i in range(self.n_cluster):

                if (self.cluster_method == 'k-mean'
                        or self.cluster_method == 'random'):
                    idx = self.cluster_label == i
                elif (self.cluster_method == 'tree'):
                    idx = self.cluster_label == self.leaf_labels[i]
                else:
                    targetMemberSize = (len(self.X) /
                                        self.n_cluster) * (1.0 + self.overlap)
                    idx = []

                    minindex = np.argmin(self.y)
                    maxindex = np.argmax(self.y)
                    # TODO: fix line here
                    idx = np.argsort(
                        self.cluster_labels_proba[:,
                                                  i])[-int(targetMemberSize):]
                    if (minindex not in idx):
                        idx = np.hstack((idx, [minindex]))
                    if (maxindex not in idx):
                        idx = np.hstack((idx, [maxindex]))

                model = self.models[i]
                # TODO: discuss this will introduce overlapping samples
                #                idx[minindex] = True
                #                idx[maxindex] = True

                # dirty fix so that low nugget errors will increase the
                # nugget till the model fits
                while True:
                    try:
                        # super is needed here to call the 'fit' function in the
                        # parent class (GaussianProcess_extra)
                        if (self.cluster_method == 'tree' and self.verbose):
                            print('leaf: ', self.leaf_labels[i])
                        length_lb = 1e-10
                        length_ub = 1e2
                        X = self.X[idx, :]
                        x_lb, x_ub = X.min(0), X.max(0)
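                        # convert the length-scale bounds to theta bounds: theta ~ 1/length**2,
                        # rescaled by the span of the training data in each dimension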
                        model.thetaL = length_ub**-2. / (
                            x_ub - x_lb)**2. * np.ones(self.n_feature)
                        model.thetaU = length_lb**-2. / (
                            x_ub - x_lb)**2. * np.ones(self.n_feature)

                        model.fit(self.X[idx, :], self.y[idx])
                        break
                    except Exception as e:
                        print(e)
                        if self.verbose:
                            print('Current nugget setting is too small!' +\
                                ' It will be tuned up automatically')
                        #pdb.set_trace()
                        model.noise_var *= 10

    def gradient(self, x):
        """
        Calculate the gradient of the posterior mean and variance
        """
        check_is_fitted(self, 'X')
        x = np.atleast_2d(x)

        if self.cluster_method == 'tree':
            idx = self.clusterer.apply(x.reshape(1, -1))[0]
            active_GP_idx = np.nonzero(self.leaf_labels == idx)[0][0]
            active_GP = self.models[active_GP_idx]

            y_dx, mse_dx = active_GP.gradient(x)

        elif self.cluster_method == 'GMM':
            # TODO: implement this
            pass

        elif self.cluster_method in ['random', 'k-mean']:
            par = {}
            _ = self.predict(x, eval_MSE=False, par_out=par)

            weights = par['weights'].reshape(-1, 1)
            y = par['y'].reshape(-1, 1)
            mse = par['mse'].reshape(-1, 1)
            normalized_mse = par['mse_normalized'].reshape(-1, 1)
            U = par['U'].reshape(-1, 1)

            y_jac, mse_jac = zip(*[model.gradient(x) for model in self.models])
            y_jac, mse_jac = np.c_[y_jac], np.c_[mse_jac]

            M = (1. / normalized_mse).sum()
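            # weights_i = (1 / normalized_mse_i) / M; the Jacobian below follows from
            # the quotient rule, with d(normalized_mse_i) = mse_jac_i / U_i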
            tmp = np.dot(mse_jac, (1. / normalized_mse**2.) / U)
            weights_jacobian =  -mse_jac * normalized_mse.T ** -2. / U.T / M \
                + (np.repeat(tmp, len(weights), axis=1) / normalized_mse.T) / M ** 2.

            y_dx = np.dot(y_jac, weights) + np.dot(weights_jacobian, y)
            mse_dx = np.dot(mse_jac, weights**2.) + np.dot(
                weights.T * weights_jacobian, mse)

        return y_dx, mse_dx

    def __mse_upper_bound(self, model):
        """
        This function computes the tight upper bound of the Mean Square Error(
        Kriging variance) for the underlying Posterior Gaussian Process model, 
        whose usage should be subject to Simple or Ordinary Kriging (constant trend)
        Parameters
        ----------
        model : a fitted Gaussian Process/Kriging model, in which 'self.regr'
                should be 'constant'
        Returns
        ----------
        upper_bound : the upper bound of the Mean Squared Error
        """

        if self.regr_label != 'constant':
            raise Exception('MSE upper bound only exists for constant trend')

        C = model.C
        if C is None:
            # Light storage mode (need to recompute C, F, Ft and G)
            if model.verbose:
                print(
                    "This GaussianProcess used 'light' storage mode "
                    "at instantiation. Need to recompute "
                    "autocorrelation matrix...")
            _, par = model.reduced_likelihood_function()
            model.C = par['C']
            model.Ft = par['Ft']
            model.G = par['G']

        n_samples, n_features = model.X.shape
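        # bound for a constant trend: sum over outputs of sigma2 * (1 + 1 / G**2)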
        tmp = 1 / model.G**2

        upper_bound = np.sum(model.sigma2 * (1 + tmp))
        return upper_bound

    def __check_duplicate(self, X, y):

        # TODO: show a warning here
        X = np.atleast_2d(X)
        new_X = []
        new_Y = []
        for i, x in enumerate(X):
            idx = np.nonzero(np.all(np.isclose(self.X, x), axis=1))[0]

            if len(idx) == 0:
                new_X.append(x)
                new_Y.append(y[i])

            elif y[i] != self.y[idx]:
                raise Exception(
                    'The same input cannot have different responses!')

        return np.array(new_X), new_Y

    def updateModel(self, newX, newY):
        """
        Deprecated function, just call fit with new database.
        """
        newY = newY.reshape(-1, 1)
        #print newY.shape, self.y.shape
        X = np.r_[self.X, newX]
        y = np.r_[self.y, newY]
        self.fit(X, y)

    def update_data(self, X, y):
        self.X = X
        self.y = y

        # note that the clusters are not rebuilt
        if self.cluster_method == 'tree':
            self.cluster_label = self.clusterer.apply(self.X)

            for i, model in enumerate(self.models):
                idx = self.cluster_label == self.leaf_labels[i]
                if not np.any(idx):
                    raise Exception('No data point in cluster {}!'.format(i +
                                                                          1))
                model.update_data(self.X[idx, :], self.y[idx])
        else:
            # TODO: to implement for the rest options
            pass

        return self

    def fit(self, newX, newY, re_estimate_all=False):
        """
        Add several instances to the data and rebuild models
        newX is a 2d array of (instances,features) and newY a vector
        """
        if not hasattr(self, 'X'):
            self.__fit(newX, newY)
            return

        newX, newY = self.__check_duplicate(newX, newY)

        if self.cluster_method == 'tree':

            #first update our data
            if len(newY) != 0:
                self.X = np.r_[self.X, newX]
                self.y = np.r_[self.y, newY]
            #self.X = np.append(self.X, newX, axis=0)
            #self.y = np.append(self.y, newY)

            #check the size of the new data
            if re_estimate_all:
                #in this case build additional models
                if self.verbose:
                    print("refitting all models")
                self.__fit(self.X, self.y)
            elif len(self.X) > (self.sizeX + self.minsamples * 2.0):
                #in this case build additional models if needed
                if self.verbose:
                    print("refitting new models")
                    #print("Current tree")
                    #print(self.clusterer)
                rebuildmodels = np.unique(self.clusterer.apply(newX))
                rebuildmodelstemp = []
                rebuild_index = 0
                self.cluster_label = self.clusterer.apply(self.X)
                new_leaf_labels = []
                for i in rebuildmodels:
                    leafindex = np.where(self.leaf_labels == i)[0][0]

                    idx = self.cluster_label == i
                    #check size of model
                    if (np.sum(idx) > self.minsamples * 2.0):
                        if self.verbose:
                            print("Trying to split leaf node", i)
                        #split the leaf and fit 2 additional models
                        new_labels = []
                        if self.clusterer.split_terminal(
                                i, self.X[idx, :], self.y[idx]):
                            self.cluster_label = self.clusterer.apply(self.X)
                            new_labels = np.unique(self.cluster_label)
                            self.n_cluster = len(new_labels)
                        delete_old = False
                        for n in new_labels:
                            if n not in self.leaf_labels:
                                delete_old = True
                                new_leafindex = np.where(new_labels == n)[0][0]
                                if self.verbose:
                                    print("New model with id", new_leafindex)
                                    #print self.leaf_labels
                                new_model = deepcopy(self.empty_model)
                                self.models.append(new_model)
                                self.leaf_labels = np.append(
                                    self.leaf_labels, n)
                                #rebuildmodelstemp.append(new_leafindex)
                                new_leaf_labels.append(n)
                        if delete_old:
                            self.leaf_labels = np.delete(
                                self.leaf_labels, leafindex)
                            del (self.models[leafindex])
                            #if self.verbose:
                            #print("New tree")
                            #print(self.clusterer)
                            #print self.leaf_labels
                    else:
                        #just refit this model
                        #rebuildmodelstemp.append(leafindex)
                        new_leaf_labels.append(i)

                for n in new_leaf_labels:
                    rebuildmodelstemp.append(
                        np.where(self.leaf_labels == n)[0][0])

                rebuildmodels = np.unique(
                    np.array(rebuildmodelstemp, dtype=int))
                labels = self.clusterer.apply(self.X)
                self.cluster_label = labels
                self.leaf_labels = np.unique(labels)

                for i in rebuildmodels:

                    idx = self.cluster_label == self.leaf_labels[i]
                    if self.verbose:
                        print("updating model on position " + str(i) +
                              " attached to leaf id " +
                              str(self.leaf_labels[i]) + " and " +
                              str(sum(idx)) + " data points")
                    model = self.models[i]
                    while True:
                        try:
                            # super is needed here to call the 'fit' function in the
                            # parent class (GaussianProcess)

                            model.fit(self.X[idx, :], self.y[idx])
                            break
                        except ValueError:
                            if self.verbose:
                                print('Current nugget setting is too small!' +\
                                    ' It will be tuned up automatically')
                            model.nugget *= 10
            else:
                rebuildmodels = np.unique(self.clusterer.apply(newX))
                rebuildmodelstemp = []
                for i in rebuildmodels:
                    rebuildmodelstemp.append(
                        np.where(self.leaf_labels == i)[0][0])
                rebuildmodels = np.array(rebuildmodelstemp, dtype=int)
                labels = self.clusterer.apply(self.X)
                self.cluster_label = labels
                if self.is_parallel:  # parallel model fitting
                    idx = [
                        self.cluster_label == self.leaf_labels[i]
                        for i in rebuildmodels
                    ]
                    modelstosend = [
                        deepcopy(self.models[i]) for i in rebuildmodels
                    ]
                    training = [(self.X[index, :], self.y[index])
                                for index in idx]
                    training_set = zip(rebuildmodels, modelstosend,
                                       training)

                    pool = Pool(self.n_cluster)
                    models = pool.map(train_modelstar, training_set)
                    pool.close()
                    pool.join()
                    for i in range(len(rebuildmodels)):
                        self.models[rebuildmodels[i]] = models[i]

                else:  # is_parralel = false

                    for i in rebuildmodels:
                        if self.verbose:
                            print("updating model " + str(i))
                        idx = self.cluster_label == self.leaf_labels[i]
                        model = self.models[i]
                        while True:
                            try:
                                # super is needed here to call the 'fit' function in the
                                # parent class (GaussianProcess)

                                model.fit(self.X[idx, :], self.y[idx])
                                break
                            except ValueError:
                                if self.verbose:
                                    print('Current nugget setting is too small!' +\
                                        ' It will be tuned up automatically')
                                model.nugget *= 10
        else:
            #rebuild all models
            self.X = np.r_[self.X, newX]
            self.y = np.r_[self.y, newY]
            self.__fit(self.X, self.y)

    # TODO: implementing batch_size option to reduce the memory usage
    def predict(self, X, eval_MSE=False, par_out=None):
        """
        This function evaluates the Optimal Weighted Gaussian Process model at x.
        Parameters
        ----------
        X : array_like
            An array with shape (n_eval, n_features) giving the point(s) at
            which the prediction(s) should be made.
        eval_MSE : boolean, optional
            A boolean specifying whether the Mean Squared Error should be
            evaluated or not.
            Default assumes eval_MSE = False and evaluates only the BLUP (mean
            prediction).
        batch_size : integer, Not available yet 
            An integer giving the maximum number of points that can be
            evaluated simultaneously (depending on the available memory).
            Default is None so that all given points are evaluated at the same
            time.
        Returns
        -------
        y : array_like, shape (n_samples, ) or (n_samples, n_targets)
            An array with shape (n_eval, ) if the Gaussian Process was trained
            on an array of shape (n_samples, ) or an array with shape
            (n_eval, n_targets) if the Gaussian Process was trained on an array
            of shape (n_samples, n_targets) with the Best Linear Unbiased
            Prediction at x.
        MSE : array_like, optional (if eval_MSE == True)
            An array with shape (n_eval, ) or (n_eval, n_targets) as with y,
            with the Mean Squared Error at x.
        """

        X = np.atleast_2d(X)
        X = X.T if size(X, 1) != self.n_feature else X

        n_eval, n_feature = X.shape

        if n_feature != self.n_feature:
            raise Exception('Dimensionality does not match!')

        if self.cluster_method == 'tree':
            pred = np.zeros(n_eval)
            if eval_MSE:
                mse = np.zeros(n_eval)

            for i, x in enumerate(X):
                #                modelindex = self.clusterer
                ix = self.clusterer.apply(x.reshape(1, -1))
                model = self.models[np.where(self.leaf_labels == ix)[0][0]]

                _ = model.predict(x.reshape(1, -1), eval_MSE)

                if eval_MSE:
                    pred[i], mse[i] = _
                else:
                    pred[i] = _

            if eval_MSE:
                return pred, mse
            else:
                return pred

        elif self.cluster_method in ['random', 'k-mean']:
            # compute predictions and MSE from all underlying GP models
            # super is needed here to call the 'predict' function in the
            # parent class

            res = array([model.predict(X, eval_MSE=True) \
                for model in self.models])

            # compute the upper bound of MSE from all underlying GP models
            mse_upper_bound = array([self.__mse_upper_bound(model) \
                for model in self.models])

            if np.any(mse_upper_bound == 0):
                raise Exception('Something weird happened!')

            pred, mse = res[:, 0, :], res[:, 1, :]
            normalized_mse = mse / mse_upper_bound.reshape(-1, 1)

            # inverse of the MSE matrices
            Q_inv = [diag(1.0 / normalized_mse[:, i]) for i in range(n_eval)]

            _ones = ones(self.n_cluster)
            weight = lambda Q_inv: dot(_ones, Q_inv)
            normalizer = lambda Q_inv: dot(dot(_ones, Q_inv),
                                           _ones.reshape(-1, 1))

            # compute the weights of convex combination
            weights = array(
                [weight(q_inv) / normalizer(q_inv) for q_inv in Q_inv])
            # make sure the weights sum to 1...
            if np.any(abs(np.sum(weights, axis=1) - 1.0) > 1e-8):
                raise Exception('Computed weights do not sum to 1!')

            # convex combination of predictions from the underlying GP models
            pred_combined = array([inner(pred[:, i], weights[i, :]) \
                for i in range(n_eval)])

            if par_out is not None:
                par_out['weights'] = weights
                par_out['y'] = pred
                par_out['mse'] = mse
                par_out['mse_normalized'] = normalized_mse
                par_out['U'] = mse / normalized_mse

            # if overall MSE is needed
            if eval_MSE:
                mse_combined = array([inner(mse[:, i], weights[i, :]**2) \
                    for i in range(n_eval)])

                return pred_combined, mse_combined

            else:
                return pred_combined

        elif self.cluster_method == 'GMM':
            # TODO: implement the MSE calculation for 'GMM' approach: mixed of Gaussian processes
            pass
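
# A minimal, standalone sketch of the weighting rule used in predict() above for the
# 'random'/'k-mean' branches: each cluster model is weighted by the inverse of its
# normalized MSE, and the weights form a convex combination. The numbers below are
# toy values, purely for illustration.
import numpy as np

pred_per_model = np.array([1.0, 1.2, 0.8])     # mean prediction of each cluster GP
mse_per_model = np.array([0.04, 0.10, 0.02])   # MSE of each cluster GP
mse_bound = np.array([0.20, 0.20, 0.20])       # per-model MSE upper bound

normalized = mse_per_model / mse_bound
w = (1.0 / normalized) / np.sum(1.0 / normalized)   # weights sum to 1
combined_pred = np.dot(w, pred_per_model)
combined_mse = np.dot(w ** 2, mse_per_model)
print(combined_pred, combined_mse)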
Beispiel #43
0
class Scdv(BaseEstimator, ClusterMixin, TransformerMixin):
    """ implementation of https://dheeraj7596.github.io/SDV/"""

    # TODO accept args for idfvectorizer
    def __init__(self,
                 word_emb_func=None,
                 stop_words=None,
                 sparse_threshold_p=0.04,
                 n_components=50,
                 covariance_type="full",
                 tol=0.001,
                 reg_covar=1e-06,
                 max_iter=100,
                 n_init=1,
                 init_params="kmeans",
                 weights_init=None,
                 means_init=None,
                 precisions_init=None,
                 random_state=None,
                 warm_start=False,
                 verbose=0,
                 verbose_interval=10):
        self.semantic_soft_clustering = GaussianMixture(
            n_components=n_components, covariance_type=covariance_type,
            tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init,
            init_params=init_params, weights_init=weights_init,
            means_init=means_init, precisions_init=precisions_init,
            random_state=random_state, warm_start=warm_start,
            verbose=verbose, verbose_interval=verbose_interval)
        super(Scdv, self).__init__()

        self._tfidf_vectorizer = IdfStoredCountVectorizer(
            stop_words=stop_words, use_idf=True, smooth_idf=True)

        self.sparse_threshold_p = sparse_threshold_p
        self._sparse_threshold_ratio = None
        self.sparse_threshold = None

        self._word_emb_func = word_emb_func if word_emb_func else lambda x: x
        self._word_embs = {}

        self._word_topic_vecs = None

    def __getstate__(self):
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['_word_emb_func']
        return state

    def to_idf(self, index):
        word_index = index

        if isinstance(index, str):
            word_index = self._tfidf_vectorizer.vocabulary_[index]
        return self._tfidf_vectorizer.idf_[word_index]

    def fit(self, docs, y=None):
        logger.info("creating dictionary and computing idf...")
        self._tfidf_vectorizer.fit(docs)

        self._word_embs = OrderedDict({
            index: self._word_emb_func(word)
            for word, index in sorted(
                self._tfidf_vectorizer.vocabulary_.items(),
                key=lambda key_value: key_value[1])
        })

        word_vecs = np.vstack(list(self._word_embs.values()))

        logger.info("clustering in-vocabulary words (size: %d) ...",
                    len(word_vecs))
        self.semantic_soft_clustering.fit(word_vecs)

        logger.info("getting word-topic_vectors...")
        self._word_topic_vecs = self._to_word_topic_vectors(word_vecs)

        logger.info("computing threshold to make sparse...")
        self._compute_sparse_threshold_ratio(self.transform_into_ncdv(docs))

        logger.info("fitting has finished!!")
        return self

    def transform(self, raw_documents, should_compress=True):
        return self._make_sparse(self.transform_into_ncdv(raw_documents),
                                 should_compress)

    def fit_transform(self, raw_documents, y=None, **kwargs):
        return self.fit(raw_documents).transform(raw_documents)

    def _to_word_topic_vectors(self, word_vecs):
        semantic_cluster_probs = self.semantic_soft_clustering.predict_proba(
            word_vecs)
        # TODO replace with faster way. Can I use mode product?
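        # each row below is the flattened outer product of a word's embedding with its
        # cluster probabilities, scaled by that word's idf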
        topic_vector_dim = (word_vecs.shape[0],
                            semantic_cluster_probs.shape[1] *
                            word_vecs.shape[1])
        word_topic_vectors = np.zeros(topic_vector_dim)
        for i, word_index in zip(range(word_vecs.shape[0]),
                                 self._word_embs.keys()):
            word_vec = word_vecs[i]
            word_vec = word_vec.reshape((word_vec.shape[0], 1))
            word_cluster_prob = semantic_cluster_probs[i]
            word_cluster_prob = word_cluster_prob.reshape(
                (1, word_cluster_prob.shape[0]))
            word_topic_vectors[i, :] = (np.dot(word_vec, word_cluster_prob) *
                                        self.to_idf(word_index)).reshape(
                                            word_topic_vectors.shape[1:])
        return word_topic_vectors

    def _compute_sparse_threshold_ratio(self, non_sparse_doc_vectors):
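        # the threshold is sparse_threshold_p times the mean magnitude of the average
        # per-feature minimum and maximum (SCDV's sparsification step)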

        feature_min_value = np.mean(np.min(non_sparse_doc_vectors, axis=0))
        feature_max_value = np.mean(np.max(non_sparse_doc_vectors, axis=0))

        self._sparse_threshold_ratio = (np.abs(feature_max_value) +
                                        np.abs(feature_min_value)) / 2
        self.sparse_threshold = self.sparse_threshold_p * self._sparse_threshold_ratio

    def transform_into_ncdv(self, raw_documents):
        bag_of_words_list = self._tfidf_vectorizer.transform(raw_documents)

        doc_vectors = bag_of_words_list.dot(self._word_topic_vecs)
        return doc_vectors

    def _make_sparse(self, matrix, should_compress=True):
        matrix[np.abs(matrix) < self.sparse_threshold] = 0
        if should_compress:
            return sparse.csr_matrix(matrix)
        return matrix
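
# A small, self-contained sketch (toy numbers, not part of the class above) of the
# word-topic vector construction in _to_word_topic_vectors: the outer product of a
# word's embedding with its cluster probabilities, scaled by idf and flattened.
import numpy as np

word_vec = np.array([0.5, -1.0])           # toy 2-d word embedding
cluster_prob = np.array([0.9, 0.1, 0.0])   # toy soft assignment over 3 clusters
idf = 2.0

word_topic_vec = (np.outer(word_vec, cluster_prob) * idf).reshape(-1)
print(word_topic_vec.shape)                # (2 * 3,) -> (6,)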
# (25%) sets.
skf = StratifiedKFold(n_splits=4)
# Only take the first fold.
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
estimators = dict((cov_type,
                   GaussianMixture(n_components=n_classes,
                                   covariance_type=cov_type,
                                   max_iter=20,
                                   random_state=0))
                  for cov_type in ['spherical', 'diag', 'tied', 'full'])

n_estimators = len(estimators)

plt.figure(figsize=(3 * n_estimators // 2, 6))
plt.subplots_adjust(bottom=.01,
                    top=0.95,
                    hspace=.15,
                    wspace=.05,
                    left=.01,
                    right=.99)

for index, (name, estimator) in enumerate(estimators.items()):
    # Since we have class labels for the training data, we can
Beispiel #45
0
def gmm_information_criteria_report(
        X_mat,
        k=np.arange(1, 20),
        covar_type=['full', 'tied', 'diag', 'spherical'],
        random_seed=11238,
        out="Graph"):
    # handle_df (not shown in this excerpt) reshapes each covariance type's scores into a DataFrame

    tmp_global_aic, tmp_global_bic = [], []
    for i in covar_type:
        tmp_iter_aic, tmp_iter_bic = [], []
        for j in k:
            tmp_model = GaussianMixture(j,
                                        covariance_type=i,
                                        random_state=random_seed).fit(X_mat)
            tmp_iter_aic.append(tmp_model.aic(X_mat))
            tmp_iter_bic.append(tmp_model.bic(X_mat))
        tmp_global_aic.append(tmp_iter_aic)
        tmp_global_bic.append(tmp_iter_bic)

    tmp_get_aic = handle_df(tmp_global_aic, covar_type)
    tmp_get_bic = handle_df(tmp_global_bic, covar_type)
    tmp_get_aic_max = pd.melt(tmp_get_aic,
                              id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_get_bic_max = pd.melt(tmp_get_bic,
                              id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_top_aic = tmp_get_aic_max.head(3)
    tmp_top_bic = tmp_get_bic_max.head(3)

    if out == "Graph":
        plt.subplot(2, 1, 1)
        for colname, col in tmp_get_aic.drop(
                columns='n_components').items():
            plt.plot(col, label=colname)
        plt.scatter(tmp_top_aic['n_components'],
                    tmp_top_aic['value'],
                    edgecolors='slategrey',
                    facecolor='none',
                    lw=2,
                    label="Best hyperparams")
        plt.title('Akaike Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('AIC')
        plt.legend()

        plt.subplot(2, 1, 2)
        for colname, col in tmp_get_bic.drop(
                columns='n_components').items():
            plt.plot(col, label=colname)
        plt.scatter(tmp_top_bic['n_components'],
                    tmp_top_bic['value'],
                    edgecolors='slategrey',
                    facecolor='none',
                    lw=2,
                    label="Best hyperparams")
        plt.title('Bayesian Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('BIC')
        plt.legend()

    else:
        return tmp_get_aic_max, tmp_get_bic_max
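
# A minimal, self-contained sketch of the same selection idea without the plotting or
# the handle_df helper: scan n_components and covariance_type and report the pair with
# the lowest BIC. X_demo and the grid below are made-up illustration values.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=11238)
best = min((GaussianMixture(n_components=n, covariance_type=c,
                            random_state=11238).fit(X_demo).bic(X_demo), n, c)
           for n in range(1, 8)
           for c in ['full', 'tied', 'diag', 'spherical'])
print("lowest BIC {:.1f}: n_components={}, covariance_type={}".format(*best))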
print(X[0])
print(y)  # there are 20 classes



##
# GMM (Gaussian Mixture Model): clusters with probability distributions, giving elliptical clusters
# k-means: circular clusters
import numpy as np
import matplotlib.pylab as plt
from sklearn.datasets import make_blobs
X, y_true = make_blobs(n_samples=400, centers=4,
        cluster_std=0.60, random_state=0)
X = X[:, ::-1]
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');
probs = gmm.predict_proba(X)
print(probs[:5].round(3))
plt.show()

## The data is a mixture of Gaussians, so we cluster it with ellipses.

from matplotlib.patches import Ellipse
def draw_ellipse(position, covariance, ax=None, **kwargs):
    ax = ax or plt.gca()
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height, angle=angle, **kwargs))
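
# Illustrative usage sketch: redraw the scatter and overlay one ellipse per fitted
# component, with opacity scaled by that component's weight.
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
w_factor = 0.2 / gmm.weights_.max()
for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
    draw_ellipse(pos, covar, alpha=w * w_factor)
plt.show()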
Beispiel #47
0
    def fit(self, X, y=None):
        """
        Fits a Gaussian mixture model to the data.
        Estimate model parameters with the EM algorithm.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        
        y : array-like, shape (n_samples,), optional (default=None)
            List of labels for X if available. Used to compute
            ARI scores.

        Returns
        -------
        self
        """

        # Deal with number of clusters
        if self.max_components is None:
            lower_ncomponents = 1
            upper_ncomponents = self.min_components
        else:
            lower_ncomponents = self.min_components
            upper_ncomponents = self.max_components

        n_mixture_components = upper_ncomponents - lower_ncomponents + 1

        if upper_ncomponents > X.shape[0]:
            if self.max_components is None:
                msg = "if max_components is None then min_components must be <= "
                msg += "n_samples, but min_components = {}, n_samples = {}".format(
                    upper_ncomponents, X.shape[0])
            else:
                msg = "max_components must be <= n_samples, but max_components = "
                msg += "{}, n_samples = {}".format(upper_ncomponents,
                                                   X.shape[0])
            raise ValueError(msg)
        elif lower_ncomponents > X.shape[0]:
            msg = "min_components must be <= n_samples, but min_components = "
            msg += "{}, n_samples = {}".format(lower_ncomponents, X.shape[0])
            raise ValueError(msg)

        # Get parameters
        random_state = self.random_state

        param_grid = dict(
            covariance_type=self.covariance_type,
            n_components=range(lower_ncomponents, upper_ncomponents + 1),
            random_state=[random_state],
        )

        param_grid = list(ParameterGrid(param_grid))

        models = [[] for _ in range(n_mixture_components)]
        bics = [[] for _ in range(n_mixture_components)]
        aris = [[] for _ in range(n_mixture_components)]

        for i, params in enumerate(param_grid):
            model = GaussianMixture(**params)
            model.fit(X)
            models[i % n_mixture_components].append(model)
            bics[i % n_mixture_components].append(model.bic(X))
            if y is not None:
                predictions = model.predict(X)
                aris[i % n_mixture_components].append(
                    adjusted_rand_score(y, predictions))

        self.bic_ = pd.DataFrame(
            bics,
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )

        if y is not None:
            self.ari_ = pd.DataFrame(
                aris,
                index=np.arange(lower_ncomponents, upper_ncomponents + 1),
                columns=self.covariance_type,
            )
        else:
            self.ari_ = None

        # Get the best cov type and its index within the dataframe
        best_covariance = self.bic_.min(axis=0).idxmin()
        best_covariance_idx = self.covariance_type.index(best_covariance)

        # Get the index best component for best_covariance
        best_component = self.bic_.idxmin()[best_covariance]

        self.n_components_ = best_component
        self.covariance_type_ = best_covariance
        self.model_ = models[best_component -
                             self.min_components][best_covariance_idx]

        return self
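
# A small, standalone sketch (toy numbers) of the selection step at the end of fit():
# the column containing the lowest BIC anywhere gives the covariance type, and that
# column's argmin gives the number of components.
import pandas as pd

bic_demo = pd.DataFrame(
    [[110.0, 108.0], [95.0, 99.0], [101.0, 104.0]],
    index=[1, 2, 3],                      # candidate n_components
    columns=['full', 'diag'],             # candidate covariance types
)
best_cov = bic_demo.min(axis=0).idxmin()  # 'full' (95.0 is the global minimum)
best_n = bic_demo.idxmin()[best_cov]      # 2
print(best_cov, best_n)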
Beispiel #48
0
def n_random_integers(n, low=0, high=10):
    ''' generate random numbers with random.randint'''
    ii = []
    for i in range(n):
        ii.append(random.randint(low, high))
    return np.array(ii)

if __name__=='__main__':
    data = load_from_pickle('data','daily_array_all.pkl')
    # shape: (1440, 48)

    # cluster as gaussian mixture
    X = data
    n = 10
    gmm = GaussianMixture(n_components=n)
    gmm.fit(X)
    y = gmm.predict(X)
    probs = gmm.predict_proba(X)

    # sort results into clusters based on labels
    def clusters_from_labels(X, y):
        labels = np.unique(y)
        clusters = []
        for label in labels:
            clusters.append(X[np.where(y == label)])
        return clusters

    clusters = clusters_from_labels(X, y)
    cluster_sizes = [x.shape[0] for x in clusters]
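
    # Illustrative follow-up sketch: summarize each cluster by its mean daily profile;
    # with the (1440, 48) shape noted above this gives an (n, 48) array.
    cluster_means = np.array([c.mean(axis=0) for c in clusters])
    print(cluster_sizes)
    print(cluster_means.shape)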
Beispiel #49
0
# find cities that are prime numbers
prime_cities = eratosthenes(max(cities.CityId))
cities['prime'] = prime_cities
b = len(cities)
num = random.sample(range(1, b), 156)
num.append(0)
df=cities.iloc[num]
df = df.reset_index(drop=True)
len(np.unique(df))
plt.scatter(df.X, df.Y,  c='blue',alpha=0.5,s=1)

#------------------ clustering --------------------------

from sklearn.mixture import GaussianMixture
n_cluster=4
mclusterer = GaussianMixture(n_components=n_cluster, tol=0.01, random_state=66, verbose=1).fit(df[['X', 'Y']].values)
df['mclust'] = mclusterer.predict(df[['X', 'Y']].values)
centers = df.groupby('mclust')[['X', 'Y']].agg('mean').reset_index()
clust_c=['#630C3A', '#39C8C6', '#D3500C', '#FFB139']
colors = np.where(df["mclust"]%4==0,'#630C3A','-')
colors[df['mclust']%4==1] = '#39C8C6'
colors[df['mclust']%4==2] = '#D3500C'
colors[df['mclust']%4==3] = '#FFB139' 

    
plt.figure(figsize=(8, 5))
plt.scatter(df.X, df.Y,  color=colors,alpha=0.5,s=5)
for i in range(n_cluster):
    plt.scatter(centers.iloc[i].X, centers.iloc[i].Y, c='black', s=50) 
    #plt.scatter(zeros[0],zeros[1],c='green',s=50)
plt.show()
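
# Illustrative extension sketch: GaussianMixture also provides soft assignments, which
# can be used to flag cities that sit between two clusters.
proba = mclusterer.predict_proba(df[['X', 'Y']].values)
df['mclust_confidence'] = proba.max(axis=1)
print(df['mclust_confidence'].describe())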