def cause(n, k, p1, p2):
    """Sample ``n`` points from a randomly parameterised ``k``-component mixture.

    The mixture attributes are overwritten by hand with random values:

    * ``means_``   -- ``p1`` times standard-normal draws, shape ``(k, 1)``
    * ``covars_``  -- ``(|p2 * N(0, 1) + 1|) ** 2``, shape ``(k, 1)``
    * ``weights_`` -- absolute uniform draws normalised by their sum

    The value returned is whatever ``scale`` produces for ``g.sample(n)``.
    """
    mixture = GaussianMixture(k)

    # The order of the random draws below is kept fixed so that a seeded
    # global NumPy RNG yields the same parameters.
    mixture.means_ = p1 * np.random.randn(k, 1)

    spread = abs(p2 * np.random.randn(k, 1) + 1)
    mixture.covars_ = np.power(spread, 2)

    mixture.weights_ = abs(np.random.rand(k, 1))
    total_weight = sum(mixture.weights_)
    mixture.weights_ = mixture.weights_ / total_weight

    samples = mixture.sample(n)
    return scale(samples)
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    """Build a ``GaussianMixture`` whose parameters are filled in at random.

    n_mix: number of mixture components
    n_features: dimensionality of each component mean
    covariance_type: forwarded to ``GaussianMixture`` and ``make_covar_matrix``
    prng: seed or random state, normalised through ``check_random_state``

    Means are integers drawn from ``[-20, 20)``; covariances come from
    ``make_covar_matrix``; weights are ``normalized`` uniform draws.
    Returns the mixture object (it is not fitted to any data here).
    """
    rng = check_random_state(prng)
    model = GaussianMixture(n_mix, covariance_type=covariance_type)

    mean_shape = (n_mix, n_features)
    model.means_ = rng.randint(-20, 20, mean_shape)
    model.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)

    raw_weights = rng.rand(n_mix)
    model.weights_ = normalized(raw_weights)
    return model
def gmm_cause(points, k=2, p1=3, p2=4):
    """Init a root cause with a spherical-covariance Gaussian Mixture Model.

    The model is first fitted on 300 standard-normal draws, then its
    ``means_``, ``covars_`` and ``weights_`` are overwritten with random
    values scaled by ``p1`` (means) and ``p2`` (covariances).

    Returns element ``[0]`` of ``sample(points)`` flattened to one dimension.
    """
    model = GMM(k, covariance_type="spherical")

    # Fit on throw-away data before replacing the learned parameters.
    # The order of the np.random calls is significant for seeded runs.
    warmup = np.random.randn(300, 1)
    model.fit(warmup)

    model.means_ = p1 * np.random.randn(k, 1)

    spread = abs(p2 * np.random.randn(k, 1) + 1)
    model.covars_ = np.power(spread, 2)

    model.weights_ = abs(np.random.rand(k))
    model.weights_ = model.weights_ / sum(model.weights_)

    drawn = model.sample(points)
    return drawn[0].reshape(-1)
def _set_GMMs(self, statesSequence): ''' build n-state GMMs with scikit-learn's classes. copy means etc. from self.weights and self.means and self.covars ''' if statesSequence==None: sys.exit('no state sequence') numMixtures = self._get_num_mixtures(statesSequence) means = numpy.empty((self.n, numMixtures, self.numDimensions)) weights = numpy.ones((self.n, numMixtures),dtype=numpy.double) # init covars covars = [[ numpy.matrix(numpy.eye(self.numDimensions, self.numDimensions)) for j in xrange(numMixtures)] for i in xrange(self.n)] for i in range(len(statesSequence) ): state = statesSequence[i] if ParametersAlgo.FOR_MAKAM: for (numMixture, weight, mixture) in state.mixtures: weights[i,numMixture-1] = weight means[i,numMixture-1,:] = mixture.mean.vector variance_ = mixture.var.vector for k in range(len( variance_) ): covars[i][numMixture-1][k,k] = variance_[k] elif ParametersAlgo.FOR_JINGJU: gmm_ = state.mixtures for numMixture in range(gmm_.n_components): weights[i,numMixture] = gmm_.weights_[numMixture] means[i,numMixture,:] = gmm_.means_[numMixture] variance_ = gmm_.covars_[numMixture] for k in range(len( variance_) ): covars[i][numMixture][k,k] = variance_[k] ####### put into GMM models self.GMMs = numpy.empty(self.n, dtype=GMM_) for stateIdx in range(self.n): curr_GMM = GMM_(covariance_type='diag', n_components=numMixtures) curr_GMM.means_ = means[stateIdx] curr_GMM.covars_ = numpy.zeros((numMixtures, self.numDimensions)) for m_idx in range(numMixtures): for d_idx in range(self.numDimensions): a = covars[stateIdx][m_idx][d_idx,d_idx] curr_GMM.covars_[m_idx,d_idx] = a curr_GMM.weights_ = weights[stateIdx] curr_GMM.precisions_cholesky_ = _compute_precision_cholesky(curr_GMM.covars_, curr_GMM.covariance_type) self.GMMs[stateIdx] = curr_GMM
def fit_gmm_to_points(points,
                      n_components,
                      mdl,
                      ps=None,
                      num_iter=100,
                      covariance_type='full',
                      min_covar=0.001,
                      init_centers=None,
                      force_radii=-1.0,
                      force_weight=-1.0,
                      mass_multiplier=1.0):
    """fit a GMM to some points. Will return the score and the Akaike score.
    Akaike information criterion for the current model fit. It is a measure
    of the relative quality of the GMM that takes into account the
    parsimony and the goodness of the fit.
    if no particles are provided, they will be created

    points:            list of coordinates (python)
    n_components:      number of gaussians to create
    mdl:               IMP Model
    ps:                list of particles to be decorated. Particles are
                       appended to it when it holds fewer than n_components.
                       If None (default), a fresh list is used on every call.
    num_iter:          number of EM iterations
    covariance_type:   covar type for the gaussians, passed to scikit-learn.
                       options: 'full', 'diag', 'spherical'
    min_covar:         assign a minimum value to covariance term. That is used
                       to have more spherical shaped gaussians. Only passed
                       to the legacy ``GMM`` class.
    init_centers:      initial coordinates of the GMM (None or empty: let
                       scikit-learn initialise them)
    force_radii:       fix the radii (spheres only); -1.0 disables. With
                       ``GaussianMixture`` it only seeds the initial precisions.
    force_weight:      fix the weights; -1.0 disables. With
                       ``GaussianMixture`` it only seeds the initial weights.
    mass_multiplier:   multiply the weights of all the gaussians by this value

    Returns the tuple (score, akaikescore).
    """
    # A fresh list per call: a shared default list would carry particles
    # over from one call to the next.
    if ps is None:
        ps = []
    # Works for None, lists and numpy arrays alike (``== []`` does not).
    has_init_centers = init_centers is not None and len(init_centers) > 0

    new_sklearn = False
    try:
        from sklearn.mixture import GMM
    except ImportError:
        from sklearn.mixture import GaussianMixture
        new_sklearn = True

    print('creating GMM with n_components',n_components,'n_iter',num_iter,'covar type',covariance_type)
    if new_sklearn:
        # aic() calls size() on points, so it needs to be
        # a numpy array, not a list
        points = np.array(points)
        weights_init = precisions_init = None
        if force_radii != -1.0:
            print('warning: radii can no longer be forced, but setting '
                  'initial values to ', force_radii)
            # NOTE(review): an (n_components, 3) array matches the 'diag'
            # layout only -- confirm behaviour for other covariance types.
            precisions_init = np.array([[1./force_radii]*3
                                        for i in range(n_components)])
        if force_weight != -1.0:
            print('warning: weights can no longer be forced, but setting '
                  'initial values to ', force_weight)
            weights_init = np.array([force_weight]*n_components)

        gmm = GaussianMixture(
            n_components=n_components,
            max_iter=num_iter,
            covariance_type=covariance_type,
            weights_init=weights_init,
            precisions_init=precisions_init,
            means_init=init_centers if has_init_centers else None)
    else:
        # 'params' selects what EM updates, 'init_params' what gets
        # initialised: m(eans), c(ovars), w(eights). Forced quantities are
        # left out of both and set by hand below.
        params = 'm'
        init_params = 'm'
        if force_radii == -1.0:
            params += 'c'
            init_params += 'c'
        else:
            covariance_type = 'spherical'
            print('forcing spherical with radii',force_radii)

        if force_weight == -1.0:
            params += 'w'
            init_params += 'w'
        else:
            print('forcing weights to be',force_weight)

        gmm = GMM(n_components=n_components, n_iter=num_iter,
                  covariance_type=covariance_type, min_covar=min_covar,
                  params=params, init_params=init_params)
        if force_weight != -1.0:
            gmm.weights_ = np.array([force_weight]*n_components)
        if force_radii != -1.0:
            gmm.covars_ = np.array([[force_radii]*3
                                    for i in range(n_components)])
        if has_init_centers:
            gmm.means_ = init_centers

    print('fitting')
    model = gmm.fit(points)
    score = gmm.score(points)
    akaikescore = model.aic(points)

    ### convert format to core::Gaussian
    if new_sklearn:
        covars = gmm.covariances_
    else:
        covars = gmm.covars_
    for ng in range(n_components):
        covar = np.asarray(covars[ng])
        if covar.ndim == 0:
            # single variance per component (spherical fit): isotropic 3x3
            covar = (np.eye(3) * covar).tolist()
        elif covar.size == 3:
            # three per-axis variances: expand to a diagonal 3x3 matrix
            covar = np.diag(covar).tolist()
        else:
            covar = covar.tolist()

        center = list(gmm.means_[ng])
        weight = mass_multiplier*gmm.weights_[ng]
        if ng >= len(ps):
            ps.append(IMP.Particle(mdl))
        shape = IMP.algebra.get_gaussian_from_covariance(
            covar, IMP.algebra.Vector3D(center))
        g = IMP.core.Gaussian.setup_particle(ps[ng], shape)
        IMP.atom.Mass.setup_particle(ps[ng], weight)
        # radius taken from the largest variance of the gaussian
        IMP.core.XYZR.setup_particle(ps[ng], sqrt(max(g.get_variances())))

    return (score, akaikescore)
def clustering_gmm(data,
                   n_clusters,
                   tol=1e-7,
                   min_covar=None,
                   scale='logicle'):
    """
    Find clusters in an array using a Gaussian Mixture Model.

    Before clustering, `data` can be automatically rescaled as specified by
    the `scale` argument.

    Parameters
    ----------
    data : FCSData or array_like
        Data to cluster.
    n_clusters : int
        Number of clusters to find.
    tol : float, optional
        Tolerance for convergence. Directly passed to either
        ``GaussianMixture`` or ``GMM``, depending on ``scikit-learn``'s
        version.
    min_covar : float, optional
        Value added to the diagonal of each initial covariance matrix. If
        ``scikit-learn``'s version is older than 0.18, `min_covar` is also
        passed directly to ``GMM``. If None, defaults to 1e-3 for
        ``scikit-learn`` >= 0.18 and 5e-5 otherwise.
    scale : str, optional
        Rescaling applied to `data` before performing clustering. Can be
        either ``linear`` (no rescaling), ``log``, or ``logicle``.

    Returns
    -------
    labels : list
        List of length N with labels for each element in `data`, assigning
        ``data[i]`` to cluster ``labels[i]``.

    Raises
    ------
    ValueError
        If `scale` is not one of the supported values.

    Notes
    -----
    A Gaussian Mixture Model finds clusters by fitting a linear combination
    of `n_clusters` Gaussian probability density functions (pdf) to `data`
    using Expectation Maximization (EM).

    This method can be fairly sensitive to the initial parameter choice. To
    generate a reasonable set of initial conditions, `clustering_gmm`
    first divides all points in `data` into `n_clusters` groups of the
    same size based on their Euclidean distance to the minimum value. Then,
    for each group, the 50% samples farther away from the mean are
    discarded. The mean and covariance are calculated from the remaining
    samples of each group, and used as initial conditions for the GMM EM
    algorithm.

    `clustering_gmm` internally uses a `GaussianMixture` object from the
    ``scikit-learn`` library (``GMM`` if ``scikit-learn``'s version is
    lower than 0.18), with full covariance matrices for each cluster. For
    more information, consult ``scikit-learn``'s documentation.

    """
    # ``GaussianMixture`` is used from scikit-learn 0.18 on, ``GMM`` before.
    # Both the default `min_covar` and the model class depend on this.
    use_gaussian_mixture = packaging.version.parse(sklearn.__version__) \
        >= packaging.version.parse('0.18')

    # Initialize min_covar parameter
    if min_covar is None:
        min_covar = 1e-3 if use_gaussian_mixture else 5e-5

    # Copy events before rescaling
    data = data.copy()

    # Apply rescaling
    if scale == 'linear':
        # No rescaling
        pass
    elif scale == 'log':
        # Logarithm of zero and negatives is undefined. Therefore, saturate
        # every value below 1e-15 to that small positive value.
        # The machine epsilon `eps` is the smallest number such that
        # `1.0 + eps != 1.0`. For a 64-bit floating point,
        # `eps ~= 2.2e-16`, so 1e-15 is a round value just above it.
        data[data < 1e-15] = 1e-15
        # Rescale
        data = np.log10(data)
    elif scale == 'logicle':
        # Use the logicle transform class in the plot module, and transform
        # data one channel at a time.
        for ch in range(data.shape[1]):
            # We need a transformation from "data value" to "display scale"
            # units. To do so, we use an inverse logicle transformation.
            t = FlowCal.plot._LogicleTransform(data=data, channel=ch).inverted()
            data[:, ch] = t.transform_non_affine(data[:, ch],
                                                 mask_out_of_range=False)
    else:
        raise ValueError("scale {} not supported".format(scale))

    ###
    # Parameter initialization
    ###
    weights = np.tile(1.0 / n_clusters, n_clusters)
    means = []
    covars = []

    # Calculate distance to minimum value. Then, sort based on this distance.
    dist = np.sum((data - np.min(data, axis=0))**2., axis=1)
    sorted_idx = np.argsort(dist)

    # Expected number of elements per cluster
    n_per_cluster = data.shape[0] / float(n_clusters)

    # Get means and covariances per cluster
    # We will just use a fraction of ``1 - discard_frac`` of the data.
    # Data at the edges that actually corresponds to another cluster can
    # really mess up the final result.
    discard_frac = 0.5
    for i in range(n_clusters):
        il = int((i + discard_frac / 2) * n_per_cluster)
        ih = int((i + 1 - discard_frac / 2) * n_per_cluster)
        sorted_idx_cluster = sorted_idx[il:ih]
        data_cluster = data[sorted_idx_cluster]
        # Calculate means and covariances
        means.append(np.mean(data_cluster, axis=0))
        if data.shape[1] == 1:
            # np.cov returns a 0-d result for one variable; keep it 2-d
            cov = np.cov(data_cluster.T).reshape(1, 1)
        else:
            cov = np.cov(data_cluster.T)
        # Add small number to diagonal to avoid near-singular covariances
        cov += np.eye(data.shape[1]) * min_covar
        covars.append(cov)
    # Means should be an array
    means = np.array(means)

    ###
    # Run Gaussian Mixture Model Clustering
    ###

    if use_gaussian_mixture:
        # GaussianMixture uses precisions, the inverse of covariances.
        # To get the inverse, we solve the linear equation C*P = I. We also
        # use the fact that C is positive definite.
        precisions = [
            scipy.linalg.solve(c, np.eye(c.shape[0]), assume_a='pos')
            for c in covars
        ]
        precisions = np.array(precisions)

        # Initialize GaussianMixture object
        gmm = GaussianMixture(n_components=n_clusters,
                              tol=tol,
                              covariance_type='full',
                              weights_init=weights,
                              means_init=means,
                              precisions_init=precisions,
                              max_iter=500)
    else:
        # Initialize GMM object
        gmm = GMM(n_components=n_clusters,
                  tol=tol,
                  min_covar=min_covar,
                  covariance_type='full',
                  params='mc',
                  init_params='')

        # Set initial parameters. The attribute is ``weights_``; assigning
        # to ``weight_`` would only create an unused attribute.
        gmm.weights_ = weights
        gmm.means_ = means
        gmm.covars_ = covars

    # Fit
    gmm.fit(data)

    # Get labels by sampling from the responsibilities
    # This avoids the complete elimination of a cluster if two or more
    # clusters have very similar means.
    resp = gmm.predict_proba(data)
    labels = [np.random.choice(range(n_clusters), p=ri) for ri in resp]

    return labels
def fit_gmm_to_points(points,
                      n_components,
                      mdl,
                      ps=None,
                      num_iter=100,
                      covariance_type='full',
                      min_covar=0.001,
                      init_centers=None,
                      force_radii=-1.0,
                      force_weight=-1.0,
                      mass_multiplier=1.0):
    """fit a GMM to some points. Will return the score and the Akaike score.
    Akaike information criterion for the current model fit. It is a measure
    of the relative quality of the GMM that takes into account the
    parsimony and the goodness of the fit.
    if no particles are provided, they will be created

    points:            list of coordinates (python)
    n_components:      number of gaussians to create
    mdl:               IMP Model
    ps:                list of particles to be decorated. Particles are
                       appended to it when it holds fewer than n_components.
                       If None (default), a fresh list is used on every call.
    num_iter:          number of EM iterations
    covariance_type:   covar type for the gaussians, passed to scikit-learn.
                       options: 'full', 'diag', 'spherical'
    min_covar:         assign a minimum value to covariance term. That is used
                       to have more spherical shaped gaussians. Only passed
                       to the legacy ``GMM`` class.
    init_centers:      initial coordinates of the GMM (None or empty: let
                       scikit-learn initialise them)
    force_radii:       fix the radii (spheres only); -1.0 disables. With
                       ``GaussianMixture`` it only seeds the initial precisions.
    force_weight:      fix the weights; -1.0 disables. With
                       ``GaussianMixture`` it only seeds the initial weights.
    mass_multiplier:   multiply the weights of all the gaussians by this value

    Returns the tuple (score, akaikescore).
    """
    # A fresh list per call: a shared default list would carry particles
    # over from one call to the next.
    if ps is None:
        ps = []
    # Works for None, lists and numpy arrays alike (``== []`` does not).
    has_init_centers = init_centers is not None and len(init_centers) > 0

    new_sklearn = False
    try:
        from sklearn.mixture import GMM
    except ImportError:
        from sklearn.mixture import GaussianMixture
        new_sklearn = True

    print('creating GMM with n_components',n_components,'n_iter',num_iter,'covar type',covariance_type)
    if new_sklearn:
        # aic() calls size() on points, so it needs to be
        # a numpy array, not a list
        points = np.array(points)
        weights_init = precisions_init = None
        if force_radii != -1.0:
            print('warning: radii can no longer be forced, but setting '
                  'initial values to ', force_radii)
            # NOTE(review): an (n_components, 3) array matches the 'diag'
            # layout only -- confirm behaviour for other covariance types.
            precisions_init = np.array([[1./force_radii]*3
                                        for i in range(n_components)])
        if force_weight != -1.0:
            print('warning: weights can no longer be forced, but setting '
                  'initial values to ', force_weight)
            weights_init = np.array([force_weight]*n_components)

        gmm = GaussianMixture(
            n_components=n_components,
            max_iter=num_iter,
            covariance_type=covariance_type,
            weights_init=weights_init,
            precisions_init=precisions_init,
            means_init=init_centers if has_init_centers else None)
    else:
        # 'params' selects what EM updates, 'init_params' what gets
        # initialised: m(eans), c(ovars), w(eights). Forced quantities are
        # left out of both and set by hand below.
        params = 'm'
        init_params = 'm'
        if force_radii == -1.0:
            params += 'c'
            init_params += 'c'
        else:
            covariance_type = 'spherical'
            print('forcing spherical with radii',force_radii)

        if force_weight == -1.0:
            params += 'w'
            init_params += 'w'
        else:
            print('forcing weights to be',force_weight)

        gmm = GMM(n_components=n_components, n_iter=num_iter,
                  covariance_type=covariance_type, min_covar=min_covar,
                  params=params, init_params=init_params)
        if force_weight != -1.0:
            gmm.weights_ = np.array([force_weight]*n_components)
        if force_radii != -1.0:
            gmm.covars_ = np.array([[force_radii]*3
                                    for i in range(n_components)])
        if has_init_centers:
            gmm.means_ = init_centers

    print('fitting')
    model = gmm.fit(points)
    score = gmm.score(points)
    akaikescore = model.aic(points)

    ### convert format to core::Gaussian
    if new_sklearn:
        covars = gmm.covariances_
    else:
        covars = gmm.covars_
    for ng in range(n_components):
        covar = np.asarray(covars[ng])
        if covar.ndim == 0:
            # single variance per component (spherical fit): isotropic 3x3
            covar = (np.eye(3) * covar).tolist()
        elif covar.size == 3:
            # three per-axis variances: expand to a diagonal 3x3 matrix
            covar = np.diag(covar).tolist()
        else:
            covar = covar.tolist()

        center = list(gmm.means_[ng])
        weight = mass_multiplier*gmm.weights_[ng]
        if ng >= len(ps):
            ps.append(IMP.Particle(mdl))
        shape = IMP.algebra.get_gaussian_from_covariance(
            covar, IMP.algebra.Vector3D(center))
        g = IMP.core.Gaussian.setup_particle(ps[ng], shape)
        IMP.atom.Mass.setup_particle(ps[ng], weight)
        # radius taken from the largest variance of the gaussian
        IMP.core.XYZR.setup_particle(ps[ng], sqrt(max(g.get_variances())))

    return (score, akaikescore)