Example #1
def fit_mixtures(X,mag,mbins,binwidth=0.2,seed=None,
                 keepscore=False,keepbic=False,**kwargs):
    kwargs.setdefault('n_components',25)
    kwargs.setdefault('covariance_type','full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed is not None:
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where( np.abs(mag-bincenter) < binwidth )[0]
        # debug output (disabled):
        #print('{:.2f}: {} qsos'.format(bincenter,len(ii)))
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
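# Usage sketch (an illustrative addition, not part of the original snippet): the tuple
# returned by fit_mixtures grows with the keep* flags, so callers unpack it positionally.
# Synthetic data is used here; numpy (as np) and sklearn's GaussianMixture are assumed to
# be imported as in the original module.
rng = np.random.RandomState(0)
mag = rng.uniform(17.0, 22.0, size=2000)              # synthetic magnitudes
X = rng.normal(size=(2000, 4)) + 0.1 * mag[:, None]   # synthetic feature vectors
mbins = np.arange(17.1, 22.0, 0.2)                    # magnitude bin centers
fits, bics = fit_mixtures(X, mag, mbins, keepbic=True, n_components=5)
print('lowest-BIC bin center: {:.2f}'.format(mbins[int(np.argmin(bics))]))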
Example #2
    def fit(self, data, ngauss, n_iter=5000, min_covar=1.0e-6,
            doplot=False, **keys):
        """
        data is shape
            [npoints, ndim]
        """
        from sklearn.mixture import GaussianMixture

        if len(data.shape) == 1:
            data = data[:,numpy.newaxis]

        print("ngauss:   ",ngauss)
        print("n_iter:   ",n_iter)
        print("min_covar:",min_covar)

        gmm=GaussianMixture(
            n_components=ngauss,
            max_iter=n_iter,
            reg_covar=min_covar,
            covariance_type='full',
        )

        gmm.fit(data)

        if not gmm.converged_:
            print("DID NOT CONVERGE")

        self._gmm=gmm
        self.set_mixture(gmm.weights_, gmm.means_, gmm.covariances_)

        if doplot:
            plt=self.plot_components(data=data,**keys)
            return plt
Example #3
	def learn_subset(self, search_space):
	
		#Mask undesired features
		current_array = self.vectors[:,search_space]
	
		GM = GaussianMixture(n_components = 2, 
							covariance_type = "full", 
							tol = 0.001, 
							reg_covar = 1e-06, 
							max_iter = 1000, 
							n_init = 25, 
							init_params = "kmeans", 
							weights_init = None, 
							means_init = None, 
							precisions_init = None, 
							random_state = None, 
							warm_start = False, 
							verbose = 0, 
							verbose_interval = 10
							)
							
		GM.fit(current_array)

		labels = GM.predict(current_array)
		unique, counts = np.unique(labels, return_counts = True)
		count_dict = dict(zip(unique, counts))
		
		return count_dict, labels
Example #4
 def gmm(nclusters, coords, n_init=50, n_iter=500):
     if USE_GAUSSIAN_MIXTURE:
         est = GaussianMixture(n_components=nclusters, n_init=n_init, max_iter=n_iter)
     else:
         est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
     est.fit(coords)
     return Partition(est.predict(coords))
def fit_gmm(samples, ncomponents=2):
    """Given a numpy array of floating point samples, fit a gaussian mixture model."""
    # assume samples is of shape (NSAMPLES,); unsqueeze to (NSAMPLES,1) and train a GMM:
    gmm = GaussianMixture(n_components=ncomponents)
    gmm.fit(samples.reshape(-1,1))
    # return GMM params as [(weight, mean, variance)] tuples (covariances_ holds variances for 1-D data):
    params = [(gmm.weights_[c], gmm.means_[c][0], gmm.covariances_[c][0][0]) for c in range(ncomponents)]
    return params
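# Usage sketch (an illustrative addition, not part of the original snippet): evaluating the
# fitted mixture density from the returned (weight, mean, variance) tuples; numpy and scipy
# are assumed to be available.
import numpy as np
from scipy.stats import norm

samples = np.concatenate([np.random.normal(-2.0, 0.5, 500),
                          np.random.normal(3.0, 1.0, 500)])
params = fit_gmm(samples, ncomponents=2)
grid = np.linspace(-5.0, 7.0, 200)
# take the square root because the third tuple element is a variance
density = sum(w * norm.pdf(grid, mu, np.sqrt(var)) for w, mu, var in params)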
def gmm(k, X, run_times=5):
    gm = GMM(k, n_init=run_times, init_params='kmeans')
    #gm = GMM(k)
    gm.fit(X)
    zh = gm.predict(X)
    mu = gm.means_
    cov = gm.covariances_
    return zh, mu, cov
def gmm(k, X, run_times=10, init='kmeans'):
    """GMM from sklearn library. init = {'kmeans', 'random'}, run_times
    is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    gm = GMM(k, n_init=run_times, init_params=init)
    gm.fit(X)
    zh = gm.predict(X)
    return zh
def main():
    X, Y = get_data(10000)
    print("Number of data points:", len(Y))

    model = GaussianMixture(n_components=10)
    model.fit(X)
    M = model.means_
    R = model.predict_proba(X)

    print("Purity:", purity(Y, R)) # max is 1, higher is better
    print("DBI:", DBI(X, M, R)) # lower is better
Example #9
    def fit_conditional_parameters(self, j):
        class_wise_scores = self.get_class_wise_scores(j)
        
        class_wise_parameters = dict()
        for label in self._labels:
            gmm = GaussianMixture(n_components=1)
            gmm.fit(class_wise_scores[label].reshape(-1, 1))
            
            class_wise_parameters[label] = \
                self.Gaussian(mu=gmm.means_.flatten()[0],
                              std=np.sqrt(gmm.covariances_.flatten()[0]))

        return class_wise_parameters
Example #10
    def loggausfit(self):
        self.fitDf['IRM_norm'] = self.fitDf['remanance']/self.fitDf['remanance'].max()
        xstd,distance,means,covras,weights,yfits = [],[],[],[],[],[]
        for i in range(10):
            data = self.rand_data()
            for j in range(20):
                gmm = GMM(self.fitNumber, covariance_type='full')
                model = gmm.fit(data)
                xstd.append(np.std(model.means_))
                means.append(model.means_)
                covras.append(model.covariances_)
                weights.append(model.weights_)

                sample = self.fitDf['field'].values.reshape((-1, 1))

                logprob = model.score_samples(sample)  # M_best.eval(x)
                responsibilities = model.predict_proba(sample)
                pdf = np.exp(logprob)
                pdf_individual = responsibilities * pdf[:, np.newaxis]
                pdf_norm = np.sum(pdf_individual,axis=1)/np.max(np.sum(pdf_individual,
                                                                   axis=1))
                #distance.append(np.max([abs(i-j) for i,j in zip(np.sum(pdf_individual,axis=1),p)]))
                distance.append(1 - spatial.distance.cosine(pdf_norm,self.fitDf['IRM_norm']))
                yfits.append(pdf_individual)
            del data
        df = pd.DataFrame({'xstd':xstd, 'distance':distance, 'means':means,
                           'covras':covras, 'yfits':yfits, 'weights':weights})
        df['cov_max'] = [np.min(i) for i in df['covras']]
        df = df.sort_values(by=['distance','cov_max','xstd'], ascending=[False,True,False])
        pdf_best = df['yfits'].iloc[0]
        self.means = df['means'].iloc[0]
        self.covra = df['covras'].iloc[0]#sigma**2
        self.weights = df['weights'].iloc[0]
        self.pdf_best = pdf_best/np.max(np.sum(pdf_best,axis=1))
Example #11
    def finish(self):
        print("Calculating mean ToT for each PMT from gaussian fits...")
        gmm = GaussianMixture()
        xs, ys = [], []
        for (dom_id, channel_id), tots in self.tot_data.items():
            dom = self.db.doms.via_dom_id(dom_id)
            gmm.fit(np.array(tots)[:, np.newaxis])
            mean_tot = gmm.means_[0][0]
            xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
            ys.append(mean_tot)

        fig, ax = plt.subplots()
        ax.scatter(xs, ys, marker="+")
        ax.set_xlabel("31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
        ax.set_ylabel("ToT [ns]")
        plt.title("Mean ToT per PMT")
        plt.savefig(self.plotfilename)
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dae = DeepAutoEncoder([500, 300, 2])
    dae.fit(Xtrain)
    mapping = dae.map2center(Xtrain)
    plt.scatter(mapping[:,0], mapping[:,1], c=Ytrain, s=100, alpha=0.5)
    plt.show()

    # purity measure from unsupervised machine learning pt 1
    gmm = GaussianMixture(n_components=10)
    gmm.fit(Xtrain)
    responsibilities_full = gmm.predict_proba(Xtrain)
    print "full purity:", purity(Ytrain, responsibilities_full)

    gmm.fit(mapping)
    responsibilities_reduced = gmm.predict_proba(mapping)
    print "reduced purity:", purity(Ytrain, responsibilities_reduced)
  def fit(self, X, Y=None):
    if self.method == 'random':
      N = len(X)
      idx = np.random.randint(N, size=self.M)
      self.samples = X[idx]
    elif self.method == 'normal':
      # just sample from N(0,1)
      D = X.shape[1]
      self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
      X, Y = self._subsample_data(X, Y)

      print("Fitting kmeans...")
      t0 = datetime.now()
      kmeans = KMeans(n_clusters=len(set(Y)))
      kmeans.fit(X)
      print("Finished fitting kmeans, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      # we will do this by finding the distance between each point
      # and all cluster centers
      # and return which points have the smallest variance
      dists = kmeans.transform(X) # returns an N x K matrix
      variances = dists.var(axis=1)
      idx = np.argsort(variances) # smallest to largest
      idx = idx[:self.M]
      self.samples = X[idx]
    elif self.method == 'gmm':
      X, Y = self._subsample_data(X, Y)

      print("Fitting GMM")
      t0 = datetime.now()
      gmm = GaussianMixture(
        n_components=len(set(Y)),
        covariance_type='spherical',
        reg_covar=1e-6)
      gmm.fit(X)
      print("Finished fitting GMM, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      probs = gmm.predict_proba(X)
      ent = stats.entropy(probs.T) # N-length vector of entropies
      idx = np.argsort(-ent) # negate since we want biggest first
      idx = idx[:self.M]
      self.samples = X[idx]
    return self
Example #14
    def finish(self):
        print("Calculating mean ToT for each PMT from gaussian fits...")
        gmm = GaussianMixture()
        xs, ys = [], []
        df = pd.DataFrame(self.tot_data)
        for (dom_id, channel_id), data in df.groupby(['dom_id', 'channel_id']):
            tots = data['tot']
            dom = self.db.doms.via_dom_id(dom_id)
            gmm.fit(np.asarray(tots)[:, np.newaxis])
            mean_tot = gmm.means_[0][0]
            xs.append(31 * (dom.floor - 1) + channel_id + 600 * (dom.du - 1))
            ys.append(mean_tot)

        fig, ax = plt.subplots()
        ax.scatter(xs, ys, marker="+")
        ax.set_xlabel("31$\cdot$(floor - 1) + channel_id + 600$\cdot$(DU - 1)")
        ax.set_ylabel("ToT [ns]")
        plt.title("Mean ToT per PMT")
        plt.savefig(self.plotfilename)
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dae = DeepAutoEncoder([500, 300, 2])
    dae.fit(Xtrain)
    mapping = dae.map2center(Xtrain)
    plt.scatter(mapping[:,0], mapping[:,1], c=Ytrain, s=100, alpha=0.5)
    plt.show()

    # purity measure from unsupervised machine learning pt 1
    # NOTE: this will take a long time (i.e. just leave it overnight)
    gmm = GaussianMixture(n_components=10)
    gmm.fit(Xtrain)
    print("Finished GMM training")
    responsibilities_full = gmm.predict_proba(Xtrain)
    print("full purity:", purity(Ytrain, responsibilities_full))

    gmm.fit(mapping)
    responsibilities_reduced = gmm.predict_proba(mapping)
    print("reduced purity:", purity(Ytrain, responsibilities_reduced))
Example #16
    def Recognize(self, fn):
        im = Image.open(fn)
        im = util.CenterExtend(im, radius=20)

        vec = np.asarray(im.convert('L')).copy()
        Y = []
        for i in range(vec.shape[0]):
            for j in range(vec.shape[1]):
                if vec[i][j] <= 200:
                    Y.append([i, j])

        gmm = GaussianMixture(n_components=7, covariance_type='tied', reg_covar=1e2, tol=1e3, n_init=9)
        gmm.fit(Y)
        
        centers = gmm.means_

        points = []
        for i in range(7):
            scoring = 0.0
            for w_i in range(3):
                for w_j in range(3):
                    p_x = centers[i][0] -1 +w_i
                    p_y = centers[i][1] -1 +w_j

                    cr = util.crop(im, p_x, p_y, radius=20)
                    cr = cr.resize((40, 40), Image.ANTIALIAS)

                    X = np.asarray(cr.convert('L'), dtype='float')
                    X = (X.astype("float") - 180) /200

                    x0 = np.expand_dims(X, axis=0)
                    x1 = np.expand_dims(x0, axis=3)

                    global model
                    if self.model.predict(x1)[0][0] < 0.5:
                        scoring += 1

            if scoring > 4:
                points.append((centers[i][0] -20, centers[i][1] -20))
                
        return points
Example #17
    def fit(self, X_train, y_train):
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        # from sklearn.mixture import GMM as GaussianMixture
        from sklearn.mixture import GaussianMixture

        unlabels = range(0, np.max(y_train) + 1)

        for lab in unlabels:
            if self.each_class_params is not None:
                # print 'eacl'
                # print self.each_class_params[lab]
                model = GaussianMixture(**self.each_class_params[lab])
                # print 'po gmm ', model
            elif len(self.same_params) > 0:
                model = GaussianMixture(**self.same_params)
                # print 'ewe ', model
            else:
                model = GaussianMixture()
            X_train_lab = X_train[y_train == lab]
            # logger.debug('xtr lab shape ' + str(X_train_lab))
            model.fit(X_train_lab)

            self.models.insert(lab, model)
conditions = behavioral['labels']

fmri_masked = masker.fit_transform(fmri_filename)

fmri_train, fmri_test, conditions_train, conditions_test = train_test_split(
    fmri_masked, conditions, test_size=0.2, random_state=0)

svc = SVC(kernel='linear')
svc.fit(fmri_train, conditions_train)
svm_prediction = svc.predict(fmri_test)
svm_accuracy = accuracy_score(conditions_test, svm_prediction)
print(svm_accuracy)

gnb = GaussianNB()
gnb.fit(fmri_train, conditions_train)
gnb_prediction = gnb.predict(fmri_test)
gnb_accuracy = accuracy_score(conditions_test, gnb_prediction)
print(gnb_accuracy)

kneigh = KNeighborsClassifier(n_neighbors=3)
kneigh.fit(fmri_train, conditions_train)
kneigh_prediction = kneigh.predict(fmri_test)
kneigh_accuracy = accuracy_score(conditions_test, kneigh_prediction)
print(kneigh_accuracy)

gmm = GaussianMixture(n_components=3, covariance_type='spherical', max_iter=10)
gmm.fit(fmri_train, conditions_train)
gmm_prediction = gmm.predict(fmri_test)
gmm_accuracy = accuracy_score(conditions_test, gmm_prediction)
print(gmm_accuracy)
    # used for plotting
    colors = '#A0FFA0', '#2090E0', '#FF8080'
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    plt.figure(figsize=(6, 6), facecolor='w')
    plt.suptitle('GMM/DPGMM comparison', fontsize=15)

    ax = plt.subplot(211)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print('GMM means = \n', centers)
    print('GMM covariances = \n', covs)
    y_hat = gmm.predict(x)

    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')

    clrs = list('rgbmy')
    for i, (center, cov) in enumerate(zip(centers, covs)):
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
Example #20
def main():
    target_dir = "gmm"

    train_data_file = "data/ext/train_data.npy"
    train_labels_file = "data/ext/train_labels.npy"
    test_data_file = "data/ext/test_data.npy"
    test_labels_file = "data/ext/test_labels.npy"

    #Not used atm but could train several different GMMs
    estimators = dict((cov_type,
                       GaussianMixture(n_components=30,
                                       covariance_type=cov_type,
                                       max_iter=200,
                                       random_state=0))
                      for cov_type in ['spherical', 'diag', 'tied', 'full'])

    train_data = np.load(train_data_file)
    train_labels = np.load(train_labels_file)
    test_data = np.load(test_data_file)
    test_labels = np.load(test_labels_file)
    """ Fitting of the GMMs """
    #number of different speakers
    n_classes = len(np.unique(test_labels))
    gmm = GaussianMixture(n_components=1,
                          tol=1e-3,
                          max_iter=200,
                          n_init=1,
                          verbose=1)
    gmms = []
    for i in range(0, n_classes):
        speaker_train_data = train_data[train_labels == i]
        gmm.fit(speaker_train_data)
        joblib.dump(gmm, f'{target_dir}/gmm_{i}.pkl')

    for i in range(0, n_classes):
        gmm = joblib.load(f'{target_dir}/gmm_{i}.pkl')
        gmms.append(gmm)
    """ Predict using the GMMs """
    metadata_filepath = "data/ext/metadata.json"
    test_file_dir = "data/test"
    test_file_names = os.listdir(test_file_dir)

    #load metadata json
    with open(metadata_filepath, 'r') as f:
        metadata = json.load(f)

    labels = []
    preds = []
    #Make prediction per file in test_file_dir
    for file_name in test_file_names:
        parts = file_name.split('_')  #Get speaker from filename
        if (len(parts) != 2):  #data without deltas has 2 parts
            continue

        data = np.load(f'{test_file_dir}/{file_name}')
        testscores = np.zeros((len(data), n_classes))
        #Score each sample in a file with all GMMs
        for i in range(0, n_classes):
            testscores[:, i] = gmms[i].score_samples(data)
        #Predict label(highest scoring GMM index) for each sample
        predictions = np.sum(testscores, axis=0)

        #Majority vote between predictions for the file
        prediction = predictions.argmax()

        #Gather predictions and correct labels for accuracy score
        preds.append(prediction)
        label = metadata['LABELS'][parts[0]]  #Get label matching speaker
        labels.append(label)
        print(f'pred:{prediction}, label:{label}')
    #Print accuracy score
    print(accuracy_score(labels, preds))
Example #21
    def fit(self, X, y=None):
        """
        Fits a Gaussian mixture model to the data.
        Estimate model parameters with the EM algorithm.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        y : array-like, shape (n_samples,), optional (default=None)
            List of labels for X if available. Used to compute
            ARI scores.

        Returns
        -------
        self
        """

        # Deal with number of clusters
        if self.max_components is None:
            lower_ncomponents = 1
            upper_ncomponents = self.min_components
        else:
            lower_ncomponents = self.min_components
            upper_ncomponents = self.max_components

        n_mixture_components = upper_ncomponents - lower_ncomponents + 1

        if upper_ncomponents > X.shape[0]:
            if self.max_components is None:
                msg = "if max_components is None then min_components must be >= "
                msg += "n_samples, but min_components = {}, n_samples = {}".format(
                    upper_ncomponents, X.shape[0])
            else:
                msg = "max_components must be >= n_samples, but max_components = "
                msg += "{}, n_samples = {}".format(upper_ncomponents,
                                                   X.shape[0])
            raise ValueError(msg)
        elif lower_ncomponents > X.shape[0]:
            msg = "min_components must be <= n_samples, but min_components = "
            msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
            raise ValueError(msg)

        # Get parameters
        random_state = self.random_state

        param_grid = dict(
            covariance_type=self.covariance_type,
            n_components=range(lower_ncomponents, upper_ncomponents + 1),
            tol=[self.tol],
            reg_covar=[self.reg_covar],
            max_iter=[self.max_iter],
            n_init=[self.n_init],
            init_params=[self.init_params],
            random_state=[random_state],
        )

        param_grid = list(ParameterGrid(param_grid))

        models = [[] for _ in range(n_mixture_components)]
        bics = [[] for _ in range(n_mixture_components)]
        aris = [[] for _ in range(n_mixture_components)]

        for i, params in enumerate(param_grid):
            model = GaussianMixture(**params)
            model.fit(X)
            models[i % n_mixture_components].append(model)
            bics[i % n_mixture_components].append(model.bic(X))
            if y is not None:
                predictions = model.predict(X)
                aris[i % n_mixture_components].append(
                    adjusted_rand_score(y, predictions))

        self.bic_ = pd.DataFrame(
            bics,
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )

        if y is not None:
            self.ari_ = pd.DataFrame(
                aris,
                index=np.arange(lower_ncomponents, upper_ncomponents + 1),
                columns=self.covariance_type,
            )
        else:
            self.ari_ = None

        # Get the best cov type and its index within the dataframe
        best_covariance = self.bic_.min(axis=0).idxmin()
        best_covariance_idx = self.covariance_type.index(best_covariance)

        # Get the index best component for best_covariance
        best_component = self.bic_.idxmin()[best_covariance]

        self.n_components_ = best_component
        self.covariance_type_ = best_covariance
        self.model_ = models[best_component -
                             self.min_components][best_covariance_idx]

        return self
Example #22
def plot_cluster_means(
    data_loader,
    transformer_path,
    dataset,
    output_dir,
    file_prefix,
    kmeans_clusters=2,
    em_clusters=2,
):
    if dataset == "intention":
        X_untransformed = load_intention()
    else:
        X_untransformed = load_pulsar()

    Xtransformed, y = data_loader()
    with open(transformer_path, "rb") as f:
        transformer = pickle.load(f)

    X = get_inverse_transform(transformer, Xtransformed)
    X = pd.DataFrame(X, columns=X_untransformed.columns)

    if data_loader is load_intention:
        X_plot = X[
            [
                "Administrative",
                "Administrative_Duration",
                "Informational",
                "Informational_Duration",
                "ProductRelated",
                "ProductRelated_Duration",
                "BounceRates",
                "ExitRates",
                "PageValues",
            ]
        ]

    else:
        X_plot = X

    fig, (ax1, ax2) = plt.subplots(1, 2)
    kmeans = KMeans(kmeans_clusters, random_state=1)
    em = GaussianMixture(n_components=em_clusters, random_state=1)
    kmeans.fit(X)
    em.fit(X)

    kmeans_df = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
    kmeans_df = kmeans_df[X_plot.columns]

    em_df = pd.DataFrame(em.means_, columns=X.columns)
    em_df = em_df[X_plot.columns]

    kmeans_df.plot(kind="bar", ax=ax1)
    em_df.plot(kind="bar", ax=ax2)

    ax1.set_ylabel("Mean Value")
    ax2.set_ylabel("Mean Value")

    ax1.set_xlabel("Cluster")
    ax2.set_xlabel("Cluster")

    ax1.set_title("K-Means Cluster Centers")
    ax2.set_title("EM Cluster Centers")
    ax1.get_legend().remove()
    plt.savefig(os.path.join(output_dir, f"{file_prefix}_clusterprojections.png"))
    plt.close()
Example #23
def gaussian_overlap(w1, w2):
    '''
    estimate cluster overlap from a 2-mean Gaussian mixture model

    Description  
    -------
    Estimates the overlap between 2 spike clusters by fitting with two
    multivariate Gaussians.  Implementation makes use of scikit learn 'GMM'. 

    The percent of false positive and false negative errors are estimated for 
    both classes and stored as a confusion matrix. Error rates are calculated 
    by integrating the posterior probability of a misclassification.  The 
    integral is then normalized by the number of events in the cluster of
    interest. See description of confusion matrix below.

    NOTE: The dimensionality of the data set is reduced to the top 98% of
    principal components to increase the time efficiency of the fitting
    algorithm.

    Parameters
    --------
    w1 : array-like [Event x Sample ] 
        waveforms of 1st cluster
    w2 : array-like [Event x Sample ] 
        waveforms of 2nd cluster

    Returns
    ------
    C 
        a confusion matrix

    C[0,0] - False positive fraction in cluster 1 (waveforms of neuron 2 that were assigned to neuron 1)
    C[0,1] - False negative fraction in cluster 1 (waveforms of neuron 1 that were assigned to neuron 2)
    C[1,0] - False negative fraction in cluster 2 
    C[1,1] - False positive fraction in cluster 2
    '''
    # reduce dimensionality to 98% of top Principal Components
    N1 = w1.shape[0]
    N2 = w2.shape[0]

    X = np.concatenate((w1, w2))
    pca = PCA()
    pca.fit(X)
    Xn = pca.transform(X)

    cutoff = 0.98
    num_dims = (np.cumsum(pca.explained_variance_ratio_) < cutoff).sum()

    w1 = Xn[:N1, :num_dims]
    w2 = Xn[N1:, :num_dims]

    # fit 2 multivariate gaussians
    gmm = GMM(n_components=2)
    gmm.fit(np.vstack((w1, w2)))

    # get posteriors
    pr1 = gmm.predict_proba(w1)
    pr2 = gmm.predict_proba(w2)

    # in the unlikely case that the cluster identities were flipped during the
    # fitting procedure, flip them back
    if pr1[:, 0].mean() + pr2[:, 1].mean() < 1:
        pr1 = pr1[:, [1, 0]]
        pr2 = pr2[:, [1, 0]]

    # create confusion matrix
    confusion = np.zeros((2, 2))

    confusion[0, 0] = pr1[:, 1].mean()   # probability that a member of 1 is false
    # relative proportion of spikes that were placed in cluster 2 by mistake
    confusion[0, 1] = pr2[:, 0].sum() / N1
    confusion[1, 1] = pr2[:, 0].mean()   # probability that a member of 2 was really from 1
    # relative proportion of spikes that were placed in cluster 1 by mistake
    confusion[1, 0] = pr1[:, 1].sum() / N2

    return confusion
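# Usage sketch (an illustrative addition, not part of the original snippet): two synthetic,
# partially overlapping "waveform" clusters; the off-diagonal entries of the returned
# confusion matrix grow as the clusters are moved closer together. PCA and the GMM alias
# used above are assumed to be imported as in the original module.
import numpy as np
rng = np.random.RandomState(0)
w1 = rng.normal(loc=0.0, scale=1.0, size=(500, 32))
w2 = rng.normal(loc=0.5, scale=1.0, size=(400, 32))
C = gaussian_overlap(w1, w2)
print("confusion matrix:\n", C)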
Example #24
plot_digits(digits.data)

#PCA
from sklearn.decomposition import PCA
pca = PCA(0.99, whiten=True)
data = pca.fit_transform(digits.data)
data.shape

#use AIC:
n_components = np.arange(50, 310, 10)
models = [GaussianMixture(n, covariance_type='full', random_state=0) for n in n_components]
aics = [model.fit(data).aic(data) for model in models]
plt.plot(n_components, aics);

gmm = GaussianMixture(140, covariance_type='full', random_state=0)
gmm.fit(data)
print(gmm.converged_)

#draw new data
data_new = gmm.sample(100)
data_new[0].shape

#inverse transform from the PCA
digits_new = pca.inverse_transform(data_new[0])
plot_digits(digits_new)

#Such a generative model of digits can prove very useful as a component of a Bayesian generative classifier

#Kernel density estimation (KDE) is in some senses an algorithm that takes the mixture-of-Gaussians idea to its logical extreme: it uses a mixture consisting of one Gaussian component per point, resulting in an essentially nonparametric estimator of density

#For one-dimensional data, you are probably already familiar with one simple density estimator: the histogram
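#A minimal sketch of that idea (an illustrative addition, not part of the original text):
#sklearn's KernelDensity places one Gaussian kernel on every sample and sums them.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

x = np.concatenate([np.random.normal(-2, 0.5, 300), np.random.normal(1, 1.0, 700)])
kde = KernelDensity(kernel='gaussian', bandwidth=0.3).fit(x[:, None])
grid = np.linspace(-5, 5, 500)[:, None]
log_dens = kde.score_samples(grid)              # log of the estimated density on the grid
plt.fill_between(grid[:, 0], np.exp(log_dens), alpha=0.5)
plt.hist(x, bins=40, density=True, alpha=0.3)   # histogram for comparison
plt.show()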
Example #25
#loading data-set for EM algorithm

iris = datasets.load_iris()

X = pd.DataFrame(iris.data)

Y = pd.DataFrame(iris.target)


#Defining EM Model
from sklearn.mixture import GaussianMixture
model2=GaussianMixture(n_components=3,random_state=3425)

#Training of the model

model2.fit(X)


#Predicting classes for our data

uu= model2.predict(X)

#Accuracy of EM Model

from sklearn.metrics import confusion_matrix

cmem=confusion_matrix(Y,uu)
print('The confusion matrix of the EM algorithm:\n', cmem)
#print(cm)
print('\n')
from sklearn.metrics import accuracy_score
Example #26
                        n_neighbors=4,
                        eigen_solver='arpack',
                        n_jobs=1)
result_sc = sc.fit_predict(tfidf.toarray())
# DBSCAN algorithm
db = DBSCAN(eps=0.7, min_samples=1)
result_db = db.fit_predict(tfidf.toarray())
# AgglomerativeClustering algorithm
ac = AgglomerativeClustering(n_clusters=89,
                             affinity='euclidean',
                             linkage='ward')
result_ac = ac.fit_predict(tfidf.toarray())
# GaussianMixture algorithm
gm = GaussianMixture(n_components=89,
                     covariance_type='diag',
                     max_iter=20,
                     random_state=0)
#for cov_type in ['spherical', 'diag', 'tied', 'full']
gm.fit(tfidf.toarray())
result_gm = gm.predict(tfidf.toarray())
print('K-means accuracy:', normalized_mutual_info_score(result_kmeans, label_list))
print('AffinityPropagation accuracy:',
      normalized_mutual_info_score(result_ap, label_list))
print('meanshift accuracy:', normalized_mutual_info_score(result_ms, label_list))
print('SpectralClustering accuracy:',
      normalized_mutual_info_score(result_sc, label_list))
print('DBSCAN accuracy:', normalized_mutual_info_score(result_db, label_list))
print('AgglomerativeClustering accuracy:',
      normalized_mutual_info_score(result_ac, label_list))
print('GaussianMixture accuracy:',
      normalized_mutual_info_score(result_gm, label_list))
Example #27
# In[7]:

#clustering neighbourhood

lat_long = train_test[(train_test.longitude > -74.05)
                      & (train_test.longitude < -73.875)
                      & (train_test.latitude > 40.63)
                      & (train_test.latitude < 40.87)]
cluster = lat_long[['latitude', 'longitude']]

model_gm = GaussianMixture(n_components=40,
                           covariance_type='full',
                           tol=0.01,
                           max_iter=5000,
                           random_state=7,
                           verbose=0)
pred_gm = pd.DataFrame(model_gm.fit(cluster).predict(cluster)).set_index(
    cluster.index)
pred_gm.columns = ['pred_gm']

train_test = pd.merge(train_test,
                      pred_gm,
                      how='left',
                      left_index=True,
                      right_index=True)
train_test['pred_gm'] = train_test['pred_gm'].fillna(-1)

dummy_neighbourhood = pd.get_dummies(train_test.pred_gm, prefix='dummy_nb_')

train_test = train_test.merge(dummy_neighbourhood,
                              how='left',
                              left_index=True,
Example #28
    CDBS_data.append(count)

Al_data = ndimage.rotate(np.array(CDBS_data, dtype=float), -45, reshape=False)

chn_num, photons, max_point = find_sudo_peak(Al_data, width=100)

print("CDBS data imported")
print(time.time()-start, "sec")
#######################################################################################################
"""Setting ROI"""
#######################################################################################################
# hist, bin_edges = np.histogram(Al_data, bins=60)
# bin_centers = 0.5*(bin_edges[:-1] + bin_edges[1:])

classif = GaussianMixture(n_components=7)
classif.fit(Al_data.reshape((Al_data.size, 1)))
print("Gaussian Mixture finished")
print(time.time()-start, "sec")

means_ = np.sort(np.squeeze(classif.means_))
threshold = means_[2]+2000
binary_img = Al_data > threshold

masked_Al = np.ma.masked_less_equal(Al_data, threshold)

mask_x = np.any(binary_img, axis=0)
mask_y = np.any(binary_img, axis=1)
x1 = np.argmax(mask_x)
y1 = np.argmax(mask_y)
x2 = len(mask_x) - np.argmax(mask_x[::-1])
y2 = len(mask_y) - np.argmax(mask_y[::-1])
Example #29
class TwoStageClustering:
    """
    Class to make a two-stage clustering model, where the first stage is a SOM network and the second stage
    is a clustering method such as k-means or GMM.
    """
    def __init__(self,
                 X,
                 W=None,
                 map_shape=(8, 8),
                 n_clusters=10,
                 init_lr=0.1,
                 init_response=1,
                 max_iter_SOM=10000,
                 max_iter_clus=5000,
                 clus_method="kmeans",
                 normalize_data=False,
                 seed=0):

        # data and SOM map shape
        self.X = X
        if normalize_data:
            self.X = minmax_scale(self.X, axis=0)  # column-wise
        (self.N, self.d) = np.shape(X)
        self.map_shape = map_shape
        self.M = map_shape[0] * map_shape[1]  # number of nodes in the network
        self.W = W  # the weights of the output map

        # hyperparameters
        self.max_iter_SOM = max_iter_SOM
        self.max_iter_clus = max_iter_clus
        self.seed = seed
        self.n_clusters = n_clusters
        self.init_lr = init_lr
        self.init_response = init_response

        # first stage model
        self.model_SOM = SOM(X=self.X,
                             map_shape=self.map_shape,
                             init_lr=self.init_lr,
                             init_response=self.init_response,
                             max_iter=self.max_iter_SOM,
                             seed=self.seed)

        #  second stage model
        self.clus_method = clus_method
        if self.clus_method == "kmeans":
            self.model_clus = KMeans(n_clusters=self.n_clusters,
                                     random_state=self.seed,
                                     algorithm="full",
                                     max_iter=self.max_iter_clus,
                                     n_init=10)
        else:
            self.model_clus = GaussianMixture(n_components=self.n_clusters,
                                              max_iter=self.max_iter_clus,
                                              n_init=10,
                                              init_params="random")

    def train(self, print_progress=True):
        """
        First trains the SOM network, then the second stage model with the prototypes from the SOM network.
        """

        # training first stage SOM network

        t0 = time()  # starting time training SOM
        if self.W is not None:
            self.model_SOM.map = self.W
            print(
                "The SOM is already trained! Continuing with the clusterig method..."
            )
        else:
            print(
                "Start training the two stage clustering procedure with %s..."
                % self.clus_method)
            self.model_SOM.train(print_progress=print_progress)
            self.W = self.model_SOM.map  # 3D array containing the M prototypes

        # fitting second stage clustering method
        t1 = time()  # starting time second stage clustering method
        print("Training %s clustering method..." % self.clus_method)
        self.model_clus.fit(self.W.reshape(
            (self.M, self.d)))  # reshape to a (M, d) matrix
        print(
            "%s clustering method with %d iterations finished in %.3f seconds"
            % (self.clus_method, self.max_iter_clus, time() - t1))
        print("The two stage clustering procedure with %s took %.3f" %
              (self.clus_method, time() - t0))

    def predict(self, X):
        """
        Predicts the labels of X with the two stage clustering procedure. First, get the corresponding prototype of each sample
        of X, then predict the label of the prototype with the clustering method.
        :param X: the data sample to be predicted
        :return: the predicted labels
        """
        W, indices, _ = self.model_SOM.predict(X)
        labels = self.model_clus.predict(W)
        return labels.astype(int)

    def save(self, file_name=None):
        """
        Method to save the model as a pickle file
        """
        if file_name is None:
            print("No file name is given!!!!")
            return
        dir_name = "Models/TwoStageClustering/"
        make_dir(dir_name)
        filehandler = open(dir_name + file_name + ".pkl", "wb")
        pkl.dump(self, filehandler)
        filehandler.close()
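# Usage sketch (an illustrative addition, not part of the original class); it assumes the
# SOM class and a 2-D data matrix X are available in scope:
#   model = TwoStageClustering(X, map_shape=(8, 8), n_clusters=5, clus_method="gmm")
#   model.train()
#   labels = model.predict(X)
#   model.save("two_stage_gmm")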
clus_KMeans = cluster.KMeans(n_clusters=7, random_state=161227)
clus_KMeans.fit(X)
predicted_label_KMeans = clus_KMeans.fit_predict(X)

clus_AgglomerativeClustering = cluster.AgglomerativeClustering(n_clusters=6)
clus_AgglomerativeClustering.fit(X)
predicted_label_AgglomerativeClustering = clus_AgglomerativeClustering.fit_predict(
    X)

clus_MiniBatchKMeans = cluster.MiniBatchKMeans(n_clusters=5,
                                               random_state=161227)
clus_MiniBatchKMeans.fit(X)
predicted_label_MiniBatchKMeans = clus_MiniBatchKMeans.fit_predict(X)

clus_GM = GaussianMixture(n_components=5, random_state=161227)
clus_GM.fit(X)
predicted_label_GM = clus_GM.fit_predict(X)

cluster_class = pd.DataFrame({
    'Station': station_name,
    'x': station_x,
    'y': station_y,
    'KMeans': predicted_label_KMeans,
    'AgglomerativeClustering': predicted_label_AgglomerativeClustering,
    'MiniBatchKMeans': predicted_label_MiniBatchKMeans,
    'GM': predicted_label_GM
})

cluster_class.to_csv('Cluster_Results.csv', index=False)

# plot_data(settle, predicted_label_KMeans)
Example #31
ax.set_yticks(np.arange(0, 1., 0.1))
plt.grid()
plt.scatter(X[:, 0], X[:, 1], color='yellow')
plt.xlim((0, 1))
plt.ylim((0, 1))
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Fortnite Loot Box Locations')
plt.savefig("rawdata.png")

# Use a gaussian mixture model to identify clusters of boxes
from sklearn.mixture import GaussianMixture
RANDOM_STATE = 66
N_CLUSTERS = 28
gmm = GaussianMixture(n_components=N_CLUSTERS, random_state=RANDOM_STATE)
gmm.fit(X)
predict = gmm.predict(X)
means = gmm.means_

# Plot the clusters centers that were found and overlay them
plt.scatter(means[:, 0], means[:, 1], color='blue')
plt.savefig("clusters.png")

# Remove any clusters that are not within a 30 second run of a cluster center
from scipy.spatial.distance import cdist
TIME_SECONDS = 30
UNITS_PER_SECOND = 4.34  # Calculated empirically
units_per_second_scaled = UNITS_PER_SECOND / MAX_DIMENSION
radius = units_per_second_scaled * TIME_SECONDS
center_points = np.zeros((N_CLUSTERS, 2))
Example #32
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = FastICA(n_components=i,
                        random_state=42).fit_transform(X_scaled)
    k = 10
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    Score['km'].append(km.score(reduced_X))
    Score['gmm'].append(gmm.score(reduced_X))
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
    S_vm['km'].append(metrics.v_measure_score(labels, km.predict(reduced_X)))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm.predict(reduced_X)))

#plt.legend(['Train', 'Test'], loc='lower right')
Example #33
ax.set_ylabel('Model Depth')
ax.grid(True)
fig.suptitle('RMS difference between reduced and original dataset', fontsize=12)
plt.show()


# ## GMM Classification
# We classify with a GMM the reduce dataset
# 
# Doc: http://scikit-learn.org/stable/modules/mixture.html

# In[14]:

# Set-up and train the classifier:
gmm = GaussianMixture(n_components=K,
                      covariance_type='full',
                      init_params='kmeans',
                      max_iter=1000,
                      tol=1e-6)
gmm.fit(Xr) # Training on reduced data

# Extract GMM parameters:
priors = gmm.weights_ # [K,1]
centers= gmm.means_   # [K,Nc]
covars = gmm.covariances_ # [K,Nc,Nc] if 'full'

# Classify the dataset:
LABELS = gmm.predict(Xr) # [Np,1]
POST   = gmm.predict_proba(Xr) # [Np,Nc]


# ## Time for a lot of figures

# In[15]:
Example #34

#Get the data
obs_wave, obs_flux = data[:,0], data[:,1]

#Center the x data in zero and normalized the y data to the area of the curve
n_wave = obs_wave - obs_wave[np.argmax(obs_flux)]
n_flux = obs_flux / sum(obs_flux) 

#Generate a distribution of points matching the curve
line_distribution   = np.random.choice(a = n_wave, size = 100000, p = n_flux)
number_points       = len(line_distribution)

#Run the fit
gmm = GaussianMixture(n_components = 4)
gmm.fit(np.reshape(line_distribution, (number_points, 1)))
gauss_mixt = np.array([p * norm.pdf(n_wave, mu, sd) for mu, sd, p in zip(gmm.means_.flatten(), np.sqrt(gmm.covariances_.flatten()), gmm.weights_)])
gauss_mixt_t = np.sum(gauss_mixt, axis = 0)  

#Plot the data
fig, axis = plt.subplots(1, 1, figsize=(10, 12))
axis.plot(n_wave, n_flux, label = 'Normalized observed flux')
axis.plot(n_wave, gauss_mixt_t, label = '4 components fit')
  
for i in range(len(gauss_mixt)):
    axis.plot(n_wave, gauss_mixt[i], label = 'Gaussian '+str(i), linestyle = '--')

axis.set_xlabel('normalized wavelength', fontsize = 15)
axis.set_ylabel('normalized flux', fontsize = 15)
axis.set_title('Sklearn GM fit', fontsize = 15)
Example #35
    data2 = np.random.multivariate_normal(np.array([2, 2]), cov2, 50)
    data = np.concatenate((data1, data2), axis=0)
    classes = np.array([0 for i in range(50)] + [1 for i in range(50)])
    return data, classes


x, y = generate_dataset()

plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()

n_components = 2
gauss = GaussianMixture(
    n_components=n_components, covariance_type="diag"
)  # covariance_type="diag" is appropriate only when the covariance matrix has values on the diagonal only
gauss.fit(x)

print(
    "Model converged: ", gauss.converged_
)  # The model stops iterating once the means and covariances no longer improve noticeably (within a tolerance). The value returned is True or False.

covs = gauss.covariances_
means = gauss.means_

#print("Cov:\n", covs, "\n") # Kovarianzen in der Originalausgabe
print(
    np.diag(covs[0]), "\n"
)  # Kovarianzmatrix zur ersten Normalverteilung (Datenwolke 1) -> Vergleiche Z.25
print(
    np.diag(covs[1]), "\n"
)  # Kovarianzmatrix zur zweiten Normalverteilung (Datenwolke 2) -> Vergleiche Z.27
Example #36
def main():
    print("Generating Part3 Plots")
    # Intention PCA
    intention_pca, y_intention = load_intention_PCA_reduced()
    em = GaussianMixture(2, random_state=1)

    pulsar_lle, y_pulsar = load_pulsar_LLE_reduced()
    kmeans = KMeans(2, random_state=1)

    em.fit(intention_pca)
    kmeans.fit(pulsar_lle)

    kmeans_clusters = kmeans.predict(pulsar_lle)
    em_probs = em.predict_proba(intention_pca)[:, 0]

    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(221, projection="3d")
    ax2 = fig.add_subplot(222, projection="3d")
    ax3 = fig.add_subplot(223, projection="3d")
    ax4 = fig.add_subplot(224, projection="3d")
    ax1.scatter(
        intention_pca[:, 0],
        intention_pca[:, 1],
        zs=intention_pca[:, 2],
        c=1 - em_probs,
        alpha=0.3,
    )
    ax2.scatter(
        pulsar_lle[:, 0],
        pulsar_lle[:, 1],
        zs=pulsar_lle[:, 2],
        c=1 - kmeans_clusters,
        alpha=0.3,
    )
    ax3.scatter(
        intention_pca[:, 0],
        intention_pca[:, 1],
        zs=intention_pca[:, 2],
        c=y_intention,
        alpha=0.3,
    )
    ax4.scatter(
        pulsar_lle[:, 0], pulsar_lle[:, 1], zs=pulsar_lle[:, 2], c=y_pulsar, alpha=0.3
    )

    ax1.set_xlabel("PCA Dimension 1")
    ax1.set_ylabel("PCA Dimension 2")
    ax1.set_zlabel("PCA Dimension 3")
    ax2.set_xlabel("LLE Dimension 1")
    ax2.set_ylabel("LLE Dimension 2")
    ax2.set_zlabel("LLE Dimension 3")
    ax3.set_xlabel("PCA Dimension 1")
    ax3.set_ylabel("PCA Dimension 2")
    ax3.set_zlabel("PCA Dimension 3")
    ax4.set_xlabel("LLE Dimension 1")
    ax4.set_ylabel("LLE Dimension 2")
    ax4.set_zlabel("LLE Dimension 3")

    ax1.set_title("EM-Predicted Clusters on PCA")
    ax2.set_title("K-Means Predicted Clusters on LLE")

    ax3.set_title("True Labels in PCA Embedding")
    ax4.set_title("True Labels in LLE Embedding")

    plot_dir = os.path.join("plots", "part3")
    plt.savefig(os.path.join(plot_dir, "BestClustering.png"))
    plt.close()

    intention_pca_datafile = get_datafile_path("intention", "pca")
    intention_ica_datafile = get_datafile_path("intention", "ica")
    intention_rp_datafile = get_datafile_path("intention", "rp")
    intention_lle_datafile = get_datafile_path("intention", "lle")
    pulsar_pca_datafile = get_datafile_path("pulsar", "pca")
    pulsar_ica_datafile = get_datafile_path("pulsar", "ica")
    pulsar_rp_datafile = get_datafile_path("pulsar", "rp")
    pulsar_lle_datafile = get_datafile_path("pulsar", "lle")

    save_clustering_plots(intention_pca_datafile, "intention_pca", plot_dir)
    save_clustering_plots(intention_ica_datafile, "intention_ica", plot_dir)
    save_clustering_plots(intention_rp_datafile, "intention_rp", plot_dir)
    save_clustering_plots(intention_lle_datafile, "intention_lle", plot_dir)

    save_clustering_plots(pulsar_pca_datafile, "pulsar_pca", plot_dir)
    save_clustering_plots(pulsar_ica_datafile, "pulsar_ica", plot_dir)
    save_clustering_plots(pulsar_rp_datafile, "pulsar_rp", plot_dir)
    save_clustering_plots(pulsar_lle_datafile, "pulsar_lle", plot_dir)

    print("Intention pca results")
    print_evaluation_stats(intention_pca_datafile)
    print()

    print("Intention ica results")
    print_evaluation_stats(intention_ica_datafile, kmeans_clusters=3)
    print()

    print("Intention rp results")
    print_evaluation_stats(intention_rp_datafile)
    print()

    print("Intention lle results")
    print_evaluation_stats(intention_lle_datafile)
    print()

    print("pulsar pca results")
    print_evaluation_stats(pulsar_pca_datafile)
    print()

    print("pulsar ica results")
    print_evaluation_stats(pulsar_ica_datafile)
    print()

    print("pulsar rp results")
    print_evaluation_stats(pulsar_rp_datafile)
    print()

    print("pulsar lle results")
    print_evaluation_stats(pulsar_lle_datafile)
    print()

    pulsar_PCA_X, pulsar_PCA_y = load_pulsar_PCA_reduced()
    intention_PCA_X, intention_PCA_y = load_intention_PCA_reduced()

    pulsar_ICA_X, pulsar_ICA_y = load_pulsar_ICA_reduced()
    intention_ICA_X, intention_ICA_y = load_intention_ICA_reduced()

    pulsar_RP_X, pulsar_RP_y = load_pulsar_RP_reduced()
    intention_RP_X, intention_RP_y = load_intention_RP_reduced()

    pulsar_LLE_X, pulsar_LLE_y = load_pulsar_LLE_reduced()
    intention_LLE_X, intention_LLE_y = load_intention_LLE_reduced()

    datafile = os.path.join(data_folder, "intention_pca_clustering.json")

    print_clustering_stats(intention_LLE_X, intention_LLE_y)
Example #37
 def fit_new(self, x, label):
     self.y.append(label)
     gmm = GMM(self.gmm_order)
     gmm.fit(x)
     self.gmms.append(gmm)
Example #38
    data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400)
    mu2_fact = (2, 2, 1)
    cov_fact = np.identity(3)
    data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100)
    data = np.vstack((data1, data2))
    y = np.array([True] * 400 + [False] * 100)

    if style == 'sklearn':
        g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
        """
            'full' (each component has its own general covariance matrix),
            'tied' (all components share the same general covariance matrix),
            'diag' (each component has its own diagonal covariance matrix),
            'spherical' (each component has its own single variance).
        """
        g.fit(data)
        print('class probabilities:\t', g.weights_[0])
        print('means:\n', g.means_, '\n')
        print('covariances:\n', g.covariances_, '\n')
        mu1, mu2 = g.means_
        sigma1, sigma2 = g.covariances_
    else:
        num_iter = 100
        n, d = data.shape
        # random initialization
        # mu1 = np.random.standard_normal(d)
        # print mu1
        # mu2 = np.random.standard_normal(d)
        # print mu2
        mu1 = data.min(axis=0)
        mu2 = data.max(axis=0)
Example #39
                   delimiter=',',
                   skiprows=1)
 print(data.shape)
 y, x = np.split(data, [
     1,
 ], axis=1)
 x, x_test, y, y_test = train_test_split(x,
                                         y,
                                         train_size=0.6,
                                         random_state=0)
 gmm = GaussianMixture(n_components=2,
                       covariance_type='full',
                       random_state=0)
 x_min = np.min(x, axis=0)
 x_max = np.max(x, axis=0)
 gmm.fit(x)
 print('means = \n', gmm.means_)
 print('covariances = \n', gmm.covariances_)
 y_hat = gmm.predict(x)
 y_test_hat = gmm.predict(x_test)
 change = (gmm.means_[0][0] > gmm.means_[1][0])
 if change:
     z = y_hat == 0
     y_hat[z] = 1
     y_hat[~z] = 0
     z = y_test_hat == 0
     y_test_hat[z] = 1
     y_test_hat[~z] = 0
 acc = np.mean(y_hat.ravel() == y.ravel())
 acc_test = np.mean(y_test_hat.ravel() == y_test.ravel())
 acc_str = u'training set accuracy: %.2f%%' % (acc * 100)
Example #40

kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=500, n_init=20, random_state=0)
y_pred = kmeans.fit_predict(X)
plt.scatter(X['Age'], X['Spending Score (1-100)'])
plt.ylabel("Spending Score")
plt.xlabel("Age")
plt.title("Clusters found by KMeans")
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='black')
plt.show()


from sklearn.mixture import GaussianMixture
n_clusters = 5
gmm_model = GaussianMixture(n_components=n_clusters, random_state=5)
gmm_model.fit(X)


cluster_labels = gmm_model.predict(X)
X = pd.DataFrame(X)
X['cluster'] = cluster_labels


color=['blue','green','red', 'black', 'yellow']
for k in range(0,n_clusters):
    data = X[X["cluster"]==k].copy()
    plt.scatter(data["Age"],data["Spending Score (1-100)"],c=color[k])
    
plt.title("Clusters Identified by Guassian Mixture Model")    
plt.ylabel("Spending Score (1-100)")
plt.xlabel("Age")
l = 256
im = np.zeros((l, l))
points = l*np.random.random((2, n**2))
im[(points[0]).astype(int), (points[1]).astype(int)] = 1
im = ndimage.gaussian_filter(im, sigma=l/(4.*n))

mask = (im > im.mean()).astype(np.float)


img = mask + 0.3*np.random.randn(*mask.shape)

hist, bin_edges = np.histogram(img, bins=60)
bin_centers = 0.5*(bin_edges[:-1] + bin_edges[1:])

classif = GaussianMixture(n_components=2)
classif.fit(img.reshape((img.size, 1)))

threshold = np.mean(classif.means_)
binary_img = img > threshold


plt.figure(figsize=(11,4))

plt.subplot(131)
plt.imshow(img)
plt.axis('off')
plt.subplot(132)
plt.plot(bin_centers, hist, lw=2)
plt.axvline(0.5, color='r', ls='--', lw=2)
plt.text(0.57, 0.8, 'histogram', fontsize=20, transform = plt.gca().transAxes)
plt.yticks([])
Example #42
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(
    outpath="charts/creditcards.k-means.Randomized.SilhouetteVisualizer.png")

lowest_bic = np.infty
bic = []
n_components_range = range(1, 4)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(results)
        bic.append(gmm.bic(results))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores
plt.figure(figsize=(8, 6))
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
Example #43
def main():
    parser = argparse.ArgumentParser(
        description='Train VaDE with MNIST dataset',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--epochs',
                        '-e',
                        help='Number of epochs.',
                        type=int,
                        default=20)
    parser.add_argument('--gpu',
                        '-g',
                        help='GPU id. (Negative number indicates CPU)',
                        type=int,
                        default=-1)
    parser.add_argument('--learning-rate',
                        '-l',
                        help='Learning Rate.',
                        type=float,
                        default=0.001)
    parser.add_argument('--batch-size',
                        '-b',
                        help='Batch size.',
                        type=int,
                        default=128)
    parser.add_argument('--out',
                        '-o',
                        help='Output path.',
                        type=str,
                        default='./vade_parameter.pth')
    args = parser.parse_args()

    if_use_cuda = torch.cuda.is_available() and args.gpu >= 0
    device = torch.device('cuda:{}'.format(args.gpu) if if_use_cuda else 'cpu')

    dataset = datasets.MNIST('./data',
                             train=True,
                             download=True,
                             transform=transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2,
                                              pin_memory=if_use_cuda)

    pretrain_model = AutoEncoderForPretrain(784, 10).to(device)

    optimizer = torch.optim.Adam(pretrain_model.parameters(),
                                 lr=args.learning_rate)

    for epoch in range(1, args.epochs + 1):
        train(pretrain_model, data_loader, optimizer, device, epoch)

    with torch.no_grad():
        x = torch.cat([data[0] for data in dataset]).view(-1, 784).to(device)
        z = pretrain_model.encode(x).cpu()

    pretrain_model = pretrain_model.cpu()
    state_dict = pretrain_model.state_dict()

    gmm = GaussianMixture(n_components=10, covariance_type='diag')
    gmm.fit(z)

    model = VaDE(N_CLASSES, 784, 10)
    model.load_state_dict(state_dict, strict=False)
    model._pi.data = torch.log(torch.from_numpy(gmm.weights_)).float()
    model.mu.data = torch.from_numpy(gmm.means_).float()
    model.logvar.data = torch.log(torch.from_numpy(gmm.covariances_)).float()

    torch.save(model.state_dict(), args.out)
t = PrettyTable(['Method', 'Accuracy'])

km = KMeans(k, n_init=5)
km.fit(Y)
zh_kmeans = km.labels_
x1_kmeans = X[np.where(zh_kmeans==0)][:, np.newaxis]
x2_kmeans = X[np.where(zh_kmeans==1)][:, np.newaxis]
x1_mu_kmeans, x2_mu_kmeans = km.cluster_centers_
x1_mu_kmeans, x2_mu_kmeans = x1_mu_kmeans[0], x2_mu_kmeans[0]
x1_var_kmeans, x2_var_kmeans = np.var(x1_kmeans), np.var(x2_kmeans)
acc_kmeans = metric.accuracy(z, zh_kmeans)
t.add_row(['k-means', acc_kmeans])

gm = GMM(k, n_init=5, init_params="kmeans")
gm.fit(Y)
zh_gmm = gm.predict(Y)
#x1_gmm = X[np.where(zh_gmm==0)][:, np.newaxis]
#x2_gmm = X[np.where(zh_gmm==1)][:, np.newaxis]
x1_mu_gmm, x2_mu_gmm = gm.means_
x1_mu_gmm, x2_mu_gmm = x1_mu_gmm[0], x2_mu_gmm[0]
x1_var_gmm, x2_var_gmm = gm.covariances_
x1_var_gmm, x2_var_gmm = x1_var_gmm[0][0], x2_var_gmm[0][0]
acc_gmm = metric.accuracy(z, zh_gmm)
t.add_row(['gmm', acc_gmm])

G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x-y))
zh_kgroups = wrapper.kernel_kgroups(k, Y, G)
x1_kgroups = X[np.where(zh_kgroups==0)][:, np.newaxis]
x2_kgroups = X[np.where(zh_kgroups==1)][:, np.newaxis]
acc_kgroups = metric.accuracy(z, zh_kgroups)
Example #45
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set
    of input distances using sklearn_. The resulting set of parameters can
    be used to initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM
    atoms: list of (int, str, int, str) tuples.
        The atoms that are involved in each distance are specified
        as a list of `n_distances` tuples, each of the form
        (r1, n1, r2, n2), where r1, r2 are the integer residue
        indices starting from one, and n1, n2 are the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit.
    regularization_type: str
        The type of regularization to use, options are "bic"
        and "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag"
        and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize
        a `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Notes
    -----
    There are two ways to regularize in order to prevent overfitting.

    ``regularization_type="bic"`` will use the Bayesian information
    criterion to penalize models that have more parameters. When
    using ``bic``, the final number of components in the model
    will be less than or equal to `max_components`.

    ``regularization_type="dirichlet"`` will use a Dirichlet process
    prior on the weight distributions. The final number of components
    in the model will always be equal to `max_components`, but most
    of the weights will be small.

    There are two forms for the covariance matrix, which differ in
    the number of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance
    matrix. This has few parameters, but does not capture correlations
    between input distances. Typically, choosing ``"diag"`` will
    result in a model with more components.

    ``covariance_type="full"`` will fit using a full representation
    of the covariance matrix. This captures correlations between
    input distances, but has far more parameters and is potentially
    prone to overfitting.
    """

    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    if distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")

    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_components"
        )

    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')

    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')

    if max_components < 1:
        raise ValueError("max_components must be >= 1")
    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal
        # BIC.
        models = []
        for i in range(1, max_components + 1):
            g = GaussianMixture(
                n_components=i,
                n_init=N_INIT,
                max_iter=MAX_ITER,
                covariance_type=covariance_type,
                reg_covar=REG_COVAR,
            )
            g.fit(distances)
            models.append((g.bic(distances), g))

        gmm = sorted(models, key=lambda x: x[0])[0][1]

    else:
        # Dirichlet process fit
        # use RandomSearchCV to optimize hyperparameters
        params = {
            "weight_concentration_prior": LogUniformSampler(1e-6, 10),
            "mean_precision_prior": LogUniformSampler(1, 10),
        }
        model = BayesianGaussianMixture(
            max_components,
            n_init=N_INIT,
            max_iter=MAX_ITER,
            covariance_type=covariance_type,
            reg_covar=REG_COVAR,
        )
        rs = RandomizedSearchCV(
            model,
            param_distributions=params,
            n_iter=RANDOMSEARCH_TRIALS,
            cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True),
        )
        rs.fit(distances)
        gmm = rs.best_estimator_

    # turn the vector representation of the diagonal into a full
    # precision matrix
    if covariance_type == "diag":
        precisions = gmm.precisions_
        assert len(precisions.shape) == 2
        new_precisions = []
        for i in range(precisions.shape[0]):
            new_precisions.append(np.diag(precisions[i, :]))
        precisions = np.array(new_precisions)
    else:
        precisions = gmm.precisions_

    # convert the list of atoms into the correct form
    new_atoms = []
    for r1, n1, r2, n2 in atoms:
        new_atoms.append((r1, n1))
        new_atoms.append((r2, n2))

    # Return the parameters for a GMM
    return GMMParams(
        n_components=gmm.weights_.shape[0],
        n_distances=n_distances,
        atoms=new_atoms,
        weights=gmm.weights_,
        means=gmm.means_,
        precisions=precisions,
    )
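
# A minimal usage sketch for fit_gmm above. The atom tuples and the synthetic
# distance samples are illustrative placeholders, not values from any real
# system; only the fit_gmm call itself follows the API documented above.
import numpy as np

example_atoms = [(1, "CA", 5, "CA"), (2, "CA", 9, "CA")]        # n_distances = 2
example_distances = np.random.normal(loc=[0.5, 0.8], scale=0.05,
                                     size=(500, 2))              # (n_samples, n_distances), nm
example_params = fit_gmm(max_components=4,
                         n_distances=2,
                         atoms=example_atoms,
                         distances=example_distances,
                         regularization_type="bic",
                         covariance_type="diag")
# example_params can then be passed to GMMDistanceRestraint.from_params, as
# described in the docstring above.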
Beispiel #46
0
X_train_tsne = tsne.fit_transform(X_train)
X_score_tsne = tsne.fit_transform(X_score)
# ====== lda ====== #
lda = LinearDiscriminantAnalysis(n_components=NUM_DIM)
lda.fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_score_lda = lda.transform(X_score)
# ====== plda ====== #
plda = PLDA(n_phi=NUM_DIM, random_state=SEED)
plda.fit(X_train, y_train)
X_train_plda = plda.predict_log_proba(X_train)
X_score_plda = plda.predict_log_proba(X_score)
# ====== gmm ====== #
gmm = GaussianMixture(n_components=NUM_DIM, max_iter=100, covariance_type='full',
                      random_state=SEED)
gmm.fit(X_train)
X_train_gmm = gmm._estimate_weighted_log_prob(X_train)
X_score_gmm = gmm._estimate_weighted_log_prob(X_score)
# ====== rbm ====== #
rbm = BernoulliRBM(n_components=NUM_DIM, batch_size=8, learning_rate=0.0008,
                   n_iter=8, verbose=2, random_state=SEED)
rbm.fit(X_train)
X_train_rbm = rbm.transform(X_train)
X_score_rbm = rbm.transform(X_score)
# ===========================================================================
# Deep Learning
# ===========================================================================

# ===========================================================================
# Visualize
# ===========================================================================
Beispiel #47
0
plt.close()

n_components = range(2, 31)
Cancer_EM_aic = []
Cancer_EM_bic = []
Cancer_EM_score = []
Cancer_EM_homogeneity_score = []
Cancer_EM_complete_score = []
Cancer_EM_log = []
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []

for i in n_components:
    print(i)
    EM.set_params(random_state=7641, n_components=i)
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(
        silhouette_score(Cancer_X_train, EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(
        homogeneity_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(
        completeness_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM,
                                   Cancer_X_train,
                                   Cancer_y_train,
                                   cv=5,
                                   scoring=make_scorer(my_custom_acc,
                                                       greater_is_better=True),
                                   return_train_score=True)
    # Assumed continuation: store the mean train/CV accuracy in the lists
    # declared above.
    Cancer_EM_train_acc.append(Cancer_scores['train_score'].mean())
    Cancer_EM_cv_acc.append(Cancer_scores['test_score'].mean())

    start = timer()
    zh = kernel_kmeans(k, G, Z0, W)
    end = timer()
    Zh = ztoZ(zh)
    t.add_row(["kernel k-means (k-means++)", metric.accuracy(z, zh), 
                  objective(Zh, G, W), end-start])
    
    start = timer()
    zh = kernel_kmeans(k, G, Z1, W)
    end = timer()
    Zh = ztoZ(zh)
    t.add_row(["kernel k-means (spectral)", metric.accuracy(z, zh), 
                  objective(Zh, G, W), end-start])
    
    start = timer()
    gmm = GMM(k)
    gmm.fit(X)
    zh = gmm.predict(X)
    end = timer()
    t.add_row(["GMM", metric.accuracy(z, zh), "-", end-start])
    
    start = timer()
    km = KMeans(k)
    zh = km.fit_predict(X)
    end = timer()
    t.add_row(["k-means", metric.accuracy(z, zh), "-", end-start])

    print(t)

Beispiel #49
0
def cluster_silh_plot(prefix,
                      clustermethod,
                      drmethod,
                      range_n_clusters,
                      X,
                      plotdim,
                      seed=seed):
    silhouette_avgs = []
    sample_silhouette_nvalues = []
    cluster_nlabels = []
    clusterers = []
    cluster_scores = ["method,drmethod,nclusters,score"]
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with n_clusters value and a random generator
        # seed for reproducibility.
        if clustermethod == 'GM':
            name = 'GaussianMixture'
            clusterer = GaussianMixture(n_components=n_clusters,
                                        random_state=seed)
        if clustermethod == 'KM':
            name = 'KMeans'
            clusterer = KMeans(n_clusters=n_clusters, random_state=seed)

        clusterers.append(clusterer)

        # Predict cluster labels
        cluster_labels = clusterer.fit(X).predict(X)
        cluster_nlabels.append(cluster_labels)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        silhouette_avgs.append(silhouette_avg)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        cluster_scores.append(
            "%s,%s,%d,%.10f" %
            (clustermethod, drmethod, n_clusters, silhouette_avg))

        # Compute the silhouette scores for each sample
        sample_silhouette_nvalues.append(silhouette_samples(X, cluster_labels))

    highest_score = -1
    n_clusters = None
    cluster_labels = None
    sample_silhouette_values = None
    silhouette_avg = None
    clusterer = None

    for i, v in enumerate(silhouette_avgs):
        if v > highest_score:
            n_clusters = range_n_clusters[i]
            silhouette_avg = silhouette_avgs[i]
            sample_silhouette_values = sample_silhouette_nvalues[i]
            cluster_labels = cluster_nlabels[i]
            clusterer = clusterers[i]
            highest_score = v

    print("highest silhoutte score = %.10f" % (silhouette_avg))
    print("n_clusters with highest score = %d" % (n_clusters))
    print("plotting...")

    figname = "%s-%s-%s-clusters.png" % (label.replace(
        " ", "-"), clustermethod, drmethod)
    plot_clusters_save(prefix, clustermethod, name, X, cluster_labels,
                       n_clusters, plotdim, figname)
    figname = "%s-%s-%s-%d.png" % (label.replace(
        " ", "-"), clustermethod, drmethod, n_clusters)
    plot_silh_save(prefix, clustermethod, name, n_clusters, X, cluster_labels,
                   clusterer, silhouette_avg, sample_silhouette_values,
                   figname)

    with open('%s-%s-silhscores.csv' % (prefix.replace(" ", "-"), drmethod),
              "w") as f:
        for line in cluster_scores:
            f.write("%s\n" % (line))

    return cluster_labels, silhouette_avgs
Beispiel #50
0
    for max_iter in [50, 100, 200, 300]:
        params = {'n_clusters': n_clusters, 'max_iter': max_iter}
        k_means = KMeans(**params)
        k_means.fit(df)
        df['k_means_id'] = k_means.labels_

        plt.scatter(df['x'], df['y'], c=df['k_means_id'], alpha=0.5)

        text = "K-means, n-clusters: " + str(n_clusters) + ",max_iter: " + str(
            max_iter)
        plt.title(text)
        plt.show()

#EM clustering==========================================
from sklearn.mixture import GaussianMixture

for n_components in [2, 3, 4, 5, 6]:
    for max_iter in [50, 100, 200, 300]:
        params = {'n_components': n_components, 'max_iter': max_iter}
        gmm = GaussianMixture(**params)
        gmm.fit(df)
        y_predict = gmm.predict(df)
        df['EM_id'] = y_predict

        plt.figure(figsize=(8, 8))
        plt.scatter(df['x'], df['y'], c=df['EM_id'], alpha=0.5)
        text = "EM, n_components: " + str(n_components) + ",max_iter: " + str(
            max_iter)
        plt.title(text)
        plt.show()
Beispiel #51
0
sm_init[sm_init == 0] = 1
sm_final[sm_final == 0] = 1
init = reverse_histogram(np.log10(sm_init))
final = reverse_histogram(np.log10(sm_final))


#init = np.vstack((lf_flat, hf_i_flat)).T
#final = np.vstack((lf_flat, hf_f_flat)).T

m_init = np.array([[  0.,   0.],
                   [ 54.,  10.],
                   [ 80.,  75.]])

gmm = GaussianMixture(n_components=3, means_init=m_init)

gmm.fit(init)
means_i = gmm.means_
cov_i = gmm.covariances_

m_final = np.array([[  0.,   0.],
                   [ 42.,  17.],
                   [ 80.,  75.]])
gmm = GaussianMixture(n_components=3, means_init=m_final)
gmm.fit(final)
means_f = gmm.means_
cov_f = gmm.covariances_

plot_log_histogram(histo_init, means_i, vmax=100, interp="bicubic")
pl.scatter(m_init[:,0], m_init[:,1], s=100, marker="^")
plot_log_histogram(histo_final, means_f, vmax=100, interp="bicubic")
pl.scatter(m_final[:,0], m_final[:,1], s=100, marker="^")
# Plot real cluster
plt.subplot(2, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Plot K-means cluster
plt.subplot(2, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# General EM for GMM
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns = X.columns)

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components = 3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)
plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[gmm_y], s=40)
plt.title('GMM Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
print('Observation: The EM-based GMM clustering matched the true labels more closely than K-means.')
Beispiel #53
0
def fit_gmm_to_points(points,
                      n_components,
                      mdl,
                      ps=[],
                      num_iter=100,
                      covariance_type='full',
                      min_covar=0.001,
                      init_centers=[],
                      force_radii=-1.0,
                      force_weight=-1.0,
                      mass_multiplier=1.0):
    """fit a GMM to some points. Will return the score and the Akaike score.
    Akaike information criterion for the current model fit. It is a measure
    of the relative quality of the GMM that takes into account the
    parsimony and the goodness of the fit.
    if no particles are provided, they will be created

    points:            list of coordinates (python)
    n_components:      number of gaussians to create
    mdl:               IMP Model
    ps:                list of particles to be decorated. if empty, will add
    num_iter:          number of EM iterations
    covariance_type:   covar type for the gaussians. options: 'full', 'diag', 'spherical'
    min_covar:         assign a minimum value to covariance term. That is used to have more spherical
                       shaped gaussians
    init_centers:      initial coordinates of the GMM
    force_radii:       fix the radii (spheres only)
    force_weight:      fix the weights
    mass_multiplier:   multiply the weights of all the gaussians by this value
    dirichlet:         use the DGMM fitting (can reduce number of components, takes longer)
    """


    new_sklearn = False
    try:
        from sklearn.mixture import GMM
    except ImportError:
        from sklearn.mixture import GaussianMixture
        new_sklearn = True

    print('creating GMM with n_components',n_components,'n_iter',num_iter,'covar type',covariance_type)
    if new_sklearn:
        # aic() calls size() on points, so it needs to be a numpy array, not a list
        points = np.array(points)
        weights_init = precisions_init = None
        if force_radii != -1.0:
            print('warning: radii can no longer be forced, but setting '
                  'initial values to ', force_radii)
            precisions_init = np.array([[1./force_radii]*3
                                       for i in range(n_components)])
        if force_weight != -1.0:
            print('warning: weights can no longer be forced, but setting '
                  'initial values to ', force_weight)
            weights_init = np.array([force_weight]*n_components)

        gmm = GaussianMixture(n_components=n_components,
                              max_iter=num_iter,
                              covariance_type=covariance_type,
                              weights_init=weights_init,
                              precisions_init=precisions_init,
                              means_init=None if init_centers==[]
                                              else init_centers)
    else:
        params='m'
        init_params='m'
        if force_radii==-1.0:
            params+='c'
            init_params+='c'
        else:
            covariance_type='spherical'
            print('forcing spherical with radii',force_radii)

        if force_weight==-1.0:
            params+='w'
            init_params+='w'
        else:
            print('forcing weights to be',force_weight)

        gmm = GMM(n_components=n_components, n_iter=num_iter,
                  covariance_type=covariance_type, min_covar=min_covar,
                  params=params, init_params=init_params)
        if force_weight!=-1.0:
            gmm.weights_=np.array([force_weight]*n_components)
        if force_radii!=-1.0:
            gmm.covars_=np.array([[force_radii]*3 for i in range(n_components)])
        if init_centers!=[]:
            gmm.means_=init_centers
    print('fitting')
    model=gmm.fit(points)
    score=gmm.score(points)
    akaikescore=model.aic(points)
    #print('>>> GMM score',gmm.score(points))

    ### convert format to core::Gaussian
    if new_sklearn:
        covars = gmm.covariances_
    else:
        covars = gmm.covars_
    for ng in range(n_components):
        covar=covars[ng]
        if covar.size==3:
            covar=np.diag(covar).tolist()
        else:
            covar=covar.tolist()
        center=list(gmm.means_[ng])
        weight=mass_multiplier*gmm.weights_[ng]
        if ng>=len(ps):
            ps.append(IMP.Particle(mdl))
        shape=IMP.algebra.get_gaussian_from_covariance(covar,IMP.algebra.Vector3D(center))
        g=IMP.core.Gaussian.setup_particle(ps[ng],shape)
        IMP.atom.Mass.setup_particle(ps[ng],weight)
        IMP.core.XYZR.setup_particle(ps[ng],sqrt(max(g.get_variances())))

    return (score,akaikescore)
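
# Hedged usage sketch for fit_gmm_to_points above: the coordinates are synthetic
# and the IMP model is freshly created, assuming the module-level imports that
# the function relies on (numpy, sqrt, IMP submodules) are present as in its
# original module.
import numpy as np
import IMP

sketch_mdl = IMP.Model()
sketch_points = np.random.normal(scale=2.0, size=(1000, 3)).tolist()  # made-up xyz coordinates
score, aic = fit_gmm_to_points(sketch_points, n_components=4, mdl=sketch_mdl,
                               num_iter=200)
print('mean log-likelihood:', score, 'AIC:', aic)  # lower AIC = better parsimony/fit trade-off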
# Using the elbow method to find the optimal number of clusters
wcss = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=1)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 15), wcss)
plt.title('Finding the Best K:  The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fitting K-Means to the dataset (4 clusters)
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=1)
y_kmeans = kmeans.fit_predict(X)

# Check against known classifications
cm = confusion_matrix(y_kmeans, Y)
print(Y_Results)
print(pd.DataFrame(cm))

# Run EM using GaussianMixture (4 clusters)
EM = GaussianMixture(n_components=4, random_state=1)
fit = EM.fit(X)
labels = fit.predict(X)

# Generate confusion matrix to compare to actual results
cm = confusion_matrix(labels, Y)
print(Y_Results)
print(pd.DataFrame(cm))
Beispiel #55
0
# Run DBSCAN clustering with the best eps
db = DBSCAN(eps=eps, min_samples=3).fit(XYMatrix)
db_label = np.array([i + 1 for i in db.labels_])
score = silhouette_score(XYMatrix, db_label)
print(score)
writeHTML(clusterType="DBSCAN_final", clusterLabel=db_label)

# Compare K-means clustering with GMM clustering
kmean_classes = len(np.unique(cluster_labels))

# GMM clustering with n_components = kmean_classes
gmm = GaussianMixture(n_components=kmean_classes, max_iter=20, random_state=0)
gmm.means_init = np.array(
    [XYMatrix[cluster_labels == i].mean(axis=0) for i in range(kmean_classes)])
gmm.fit(XYMatrix)
gmm_labels = gmm.predict(XYMatrix)
# Compute GMM accuracy using the K-means labels as reference
train_accuracy = np.mean(gmm_labels.ravel() == cluster_labels.ravel()) * 100
print("gmm - kmeans accuracy : ", train_accuracy)

# Remove the points DBSCAN labeled as noise
no_noise_matrix = np.array(XYMatrix[db_label != 0])
no_noise_label = np.array(db_label[db_label != 0])

dbscan_class = len(np.unique(no_noise_label))

gmm = GaussianMixture(n_components=dbscan_class, random_state=0)
gmm.means_init = np.array([
    no_noise_matrix[no_noise_label == i].mean(axis=0)
    for i in range(dbscan_class)
])
Beispiel #56
0
    def __init__(self,
                 pulse_times_A,
                 pulse_times_B,
                 units_A=1,
                 units_B=1,
                 chunk_size=5,
                 plot=False,
                 raise_exception=True):
        '''Class for converting timestamps between two recording systems
        (e.g. pyControl and an ephys system) using sync pulses with random
        inter-pulse intervals recorded on both systems. Typically these sync
        pulses are generated by pyControl using the Rsync hardware object and
        sent to other systems. To use the Rsync_aligner, instantiate it by
        providing the sync pulse times recorded by each system. Timestamps from
        either system can then be converted into the reference frame of the
        other using the A_to_B and B_to_A methods. If the hardware systems use
        different units to measure time, this must be specified using the units
        arguments when the aligner is instantiated. When the aligner is
        instantiated it works out which pulses in each reference frame
        correspond to each other by aligning short chunks of pulse sequence A
        with B, minimising the mean squared error between inter-pulse intervals.

        Arguments:

        pulse_times_A: The times when sync pulses occurred recorded by hardware system A.

        pulse_times_B: The times when sync pulses occurred recorded by hardware system B.

        units_A: The time units used by system A expressed in milliseconds.  E.g. if 
                 system A uses units of seconds the *units_A* argument is 1000.  

        units_B: The time units used by system B expressed in milliseconds.

        plot: Whether to plot information about the alignment.

        raise_exception: If *True* an RsyncError exception is raised if no match is found
                         between the sync pulse sequences.

        '''

        # Convert all units to ms.
        pulse_times_A = pulse_times_A * units_A
        pulse_times_B = pulse_times_B * units_B
        # Evaluate inter-pulse intervals
        intervals_A = np.diff(
            pulse_times_A)  # Inter-pulse intervals for sequence A
        intervals_B = np.diff(
            pulse_times_B)  # Inter-pulse intervals for sequence B
        intervals_B2 = intervals_B**2
        # Find alignments of chunks which minimise sum of squared errors.
        chunk_starts_A = np.arange(
            0,
            len(pulse_times_A) - chunk_size,
            chunk_size)  # Start indices of each chunk of sequence A.
        chunk_starts_B = np.zeros(
            chunk_starts_A.shape,
            int)  # Start indices of corresponding chunks in B.
        chunk_min_mse = np.zeros(
            chunk_starts_A.shape
        )  # Mean squared error for each chunk's best alignment.
        chunk_2nd_mse = np.zeros(
            chunk_starts_A.shape
        )  # Mean squared error for each chunk's second best (i.e. non-matching) alignment.
        ones_chunk = np.ones(chunk_size)
        for i, csA in enumerate(chunk_starts_A):
            chunk_A = intervals_A[csA:csA + chunk_size]
            mse = (np.correlate(intervals_B2, ones_chunk, mode='valid') +
                   np.sum(chunk_A**2) - 2 * np.correlate(
                       intervals_B, chunk_A, mode='valid')) / chunk_size
            chunk_starts_B[i] = np.argmin(mse)
            sorted_chunk_min_mse = np.sort(mse)
            chunk_min_mse[i] = sorted_chunk_min_mse[0]
            chunk_2nd_mse[i] = sorted_chunk_min_mse[1]
        # Assign chunks to matched and non-matched groups by fitting 2 component
        # Gaussian mixture model to log mse distribution of best + second best
        # alignments.
        log_mse = np.log(np.hstack([chunk_min_mse, chunk_2nd_mse]))
        log_mse = log_mse[np.isfinite(log_mse)].reshape(-1, 1)
        gmm = GaussianMixture(n_components=2, covariance_type='spherical')
        gmm.fit(log_mse)
        valid_matches = gmm.predict(log_mse) == np.argmin(
            gmm.means_)  # True for chunks which are valid matches.
        # Make arrays of corresponding times.
        cor_times_A = np.full(
            pulse_times_B.shape,
            np.nan)  # A pulse times corresponding to each B pulse.
        cor_times_B = np.full(
            pulse_times_A.shape,
            np.nan)  # B pulse times corresponding to each A pulse.
        for csA, csB, valid in zip(chunk_starts_A, chunk_starts_B,
                                   valid_matches):
            if valid:
                cor_times_A[csB:csB + chunk_size] = pulse_times_A[csA:csA +
                                                                  chunk_size]
                cor_times_B[csA:csA + chunk_size] = pulse_times_B[csB:csB +
                                                                  chunk_size]
        # Store pulse times, their correspondences and units.
        self.pulse_times_A = pulse_times_A
        self.pulse_times_B = pulse_times_B
        self.cor_times_A = cor_times_A
        self.cor_times_B = cor_times_B
        self.units_A = units_A
        self.units_B = units_B
        # Check quality of alignment.
        separation_OK = (
            np.abs(gmm.means_[0] - gmm.means_[1])[0]
            >  # Difference in GMM means > 3 x sum of standard deviations.
            3 * np.sum(np.sqrt(gmm.covariances_)))
        order_OK = ((np.nanmin(np.diff(cor_times_A)) > 0)
                    and (np.nanmin(np.diff(cor_times_B)) > 0)
                    )  # Corresponding times are monotonically increasing.
        if not (separation_OK and order_OK):
            if raise_exception:
                raise RsyncError(
                    'No match found between inter-pulse interval sequences.')
            else:
                print(
                    'Rsync warning: No match found between inter-pulse interval sequences.'
                )
        # Plotting
        if plot:
            plt.figure(plot if type(plot) == int else 1, figsize=[7, 9]).clf()
            plt.subplot2grid((3, 3), (0, 0), rowspan=1, colspan=2)
            plt.hist(log_mse[valid_matches], 20, color='b', label='Match')
            plt.hist(log_mse[~valid_matches], 20, color='r', label='Non-match')
            plt.legend(loc='upper center')
            plt.xlabel('Log mean squared error')
            plt.ylabel('# chunks')
            plt.subplot2grid((3, 3), (0, 2), rowspan=1, colspan=1)
            timing_errors = np.diff(cor_times_A) - np.diff(pulse_times_B)
            plt.hist(timing_errors[~np.isnan(timing_errors)], 20)
            plt.xlabel('Inter-pulse interval\ndiscrepancy (ms)')
            plt.ylabel('# pulses')
            plt.subplot2grid((3, 1), (1, 0), rowspan=2, colspan=1)
            plt.plot(pulse_times_A / units_A,
                     cor_times_B / units_B,
                     '.',
                     markersize=2)
            plt.xlim(pulse_times_A[0] / units_A, pulse_times_A[-1] / units_A)
            plt.xlabel('pulse times A')
            plt.ylabel('pulse times B')
            plt.tight_layout()
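
# Hedged usage sketch for the aligner above: the pulse trains are synthetic
# (system A in ms, system B in seconds with an offset clock and a little
# jitter); the A_to_B conversion mentioned at the end is the method described
# in the class docstring, not shown in this snippet.
import numpy as np

pulses_A_ms = np.cumsum(np.random.uniform(40.0, 60.0, size=500))                   # system A, ms
pulses_B_s = (pulses_A_ms + np.random.normal(0.0, 0.2, size=500)) / 1000.0 + 2.5   # system B, s
aligner = Rsync_aligner(pulse_times_A=pulses_A_ms,
                        pulse_times_B=pulses_B_s,
                        units_A=1,
                        units_B=1000,
                        plot=False)
# Timestamps recorded on A could then be mapped into B's reference frame with
# aligner.A_to_B(...).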
Beispiel #57
0
print('Unimodal Gaussian Fit:  Mean {:.4}, stdev {:.4}'.format(mu, sig))
plt.hist(data, bins='auto', alpha=.3, density=True)

##############################################################################
# As expected, the result is rather silly, since we are only fitting *one*
# of the two gaussians.

##############################################################################
# Fit Gaussian Mixture Model (GMM)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Assuming the data is the sum of one or more gaussians.
# Easily handles multidimensional case as well.

gmm = GaussianMixture(n_components=2, covariance_type='spherical')
gmm.fit(data)

mu1 = gmm.means_[0, 0]
mu2 = gmm.means_[1, 0]
var1, var2 = gmm.covariances_
wgt1, wgt2 = gmm.weights_
print(
    '''Fit:
      1: Mean {:.4}, var {:.4}, weight {:.4}
      2: Mean {:.4}, var {:.4}, weight {:.4}
'''.format(mu1, var1, wgt1, mu2, var2, wgt2)
)

plt.hist(data, bins='auto', alpha=.3, density=True)
plt.vlines((mu1, mu2), ymin=0, ymax=0.35, label='Fitted Means')
plt.plot(x, norm.pdf(x, mu1, np.sqrt(var1)))
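
# A small sketch of the multidimensional case mentioned above; the 2-D blobs
# below are synthetic, made up purely for illustration.
import numpy as np
from sklearn.mixture import GaussianMixture

xy = np.vstack([np.random.normal([0.0, 0.0], 0.5, size=(300, 2)),
                np.random.normal([3.0, 2.0], 0.7, size=(300, 2))])
gmm_2d = GaussianMixture(n_components=2, covariance_type='full').fit(xy)
print(gmm_2d.means_)       # one 2-D mean per component
print(gmm_2d.weights_)     # mixing weights sum to 1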
Beispiel #58
0
def GMM(distro, n_components = 2):
    GM = GaussianMixture(n_components)

    GM.fit(np.array(distro).reshape((-1, 1)))

    return GM
		n_clusters = int(clustering_params[0])
		n_iter = int(clustering_params[1])
		thresh = float(clustering_params[2])
		n_restarts = int(clustering_params[3]) 

		# Make data array to be put through the GMM - 5 components: 3 PCs, scaled energy, amplitude
		this_cluster = np.where(predictions == int(clusters[0]))[0]
		n_pc = 3
		data = np.zeros((len(this_cluster), n_pc + 2))	
		data[:,2:] = pca_slices[this_cluster,:n_pc]
		data[:,0] = energy[this_cluster]/np.max(energy[this_cluster])
		data[:,1] = np.abs(amplitudes[this_cluster])/np.max(np.abs(amplitudes[this_cluster]))

		# Cluster the data
		g = GaussianMixture(n_components = n_clusters, covariance_type = 'full', tol = thresh, max_iter = n_iter, n_init = n_restarts)
		g.fit(data)
	
		# Show the cluster plots if the solution converged
		if g.converged_:
			split_predictions = g.predict(data)
			x = np.arange(len(spike_waveforms[0])/10) + 1
			for cluster in range(n_clusters):
				split_points = np.where(split_predictions == cluster)[0]				
				# plt.figure(cluster)
				slices_dejittered = spike_waveforms[this_cluster, :]		# Waveforms and times from the chosen cluster
				times_dejittered = spike_times[this_cluster]
				times_dejittered = times_dejittered[split_points]		# Waveforms and times from the chosen split of the chosen cluster
				ISIs = np.ediff1d(np.sort(times_dejittered))/30.0
				violations1 = 100.0*float(np.sum(ISIs < 1.0)/split_points.shape[0])
				violations2 = 100.0*float(np.sum(ISIs < 2.0)/split_points.shape[0])
				fig, ax = blech_waveforms_datashader.waveforms_datashader(slices_dejittered[split_points, :], x)
Beispiel #60
0
with open("_caption.pickle", 'rb') as f:
    captions_ids_train, captions_ids_test = pickle.load(f)
# images_train = np.array(images_train)
# images_test = np.array(images_test)

######################################################
## GMM ##
images_test = np.array(images_test)
images_test = images_test.reshape((189, 64 * 64 * 3))

total_components = 50
gmm = GaussianMixture(n_components=total_components,
                      covariance_type='diag',
                      verbose=5,
                      max_iter=500)
gmm.fit(images_test)

#######################################################
## image interpolation ##

save_dir = "checkpoint"
net_rnn_name = os.path.join(save_dir, 'net_rnn.npz')
net_cnn_name = os.path.join(save_dir, 'net_cnn.npz')
net_g_name = os.path.join(save_dir, 'net_g.npz')
net_d_name = os.path.join(save_dir, 'net_d.npz')
ni = int(np.ceil(np.sqrt(batch_size)))

t_real_image = tf.placeholder('float32',
                              [batch_size, image_size, image_size, 3],
                              name='real_image')
t_wrong_image = tf.placeholder('float32',