def setUp(self):
    """
    Set up each test with a new XDGMM object and some data.
    """
    self.xdgmm = XDGMM(n_components=3)
    self.files = []

    # Use scikit-learn GaussianMixture to sample some data points
    self.gmm = skl_GMM(n_components=3, max_iter=10,
                       covariance_type='full',
                       random_state=None)
    self.gmm.weights_ = np.array([0.3, 0.5, 0.2])
    self.gmm.means_ = np.array([np.array([0, 1]),
                                np.array([5, 4]),
                                np.array([2, 4])])
    self.gmm.covariances_ = np.array([np.diag((2, 1)),
                                      np.array([[1, 0.2], [0.2, 1]]),
                                      np.diag((0.3, 0.5))])
    self.gmm.precisions_ = np.linalg.inv(self.gmm.covariances_)
    self.gmm.precisions_cholesky_ = np.linalg.cholesky(
        self.gmm.precisions_)

    self.X = self.gmm.sample(1000)[0]

    # Build diagonal per-sample error (covariance) matrices
    errs = 0.2 * np.random.random_sample((1000, 2))
    self.Xerr = np.zeros(self.X.shape + self.X.shape[-1:])
    diag = np.arange(self.X.shape[-1])
    self.Xerr[:, diag, diag] = np.vstack([errs[:, 0]**2,
                                          errs[:, 1]**2]).T
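# A minimal, standalone sketch of the diagonal-fill trick used in
# setUp above: indexing with Xerr[:, diag, diag] writes one row of
# variances onto the diagonal of each sample's error matrix. The
# array sizes here are illustrative only.
import numpy as np

errs = 0.2 * np.random.random_sample((4, 2))  # per-feature errors
Xerr = np.zeros((4, 2, 2))                    # one 2x2 matrix per sample
diag = np.arange(2)
Xerr[:, diag, diag] = errs ** 2               # variances on the diagonal

assert np.allclose(Xerr[0], np.diag(errs[0] ** 2))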
def fit(self, X, Xerr):
    """Fit the XD model to data

    Whichever method is specified in self.method will be used.

    Results are saved in self.mu/V/weights and in the self.GMM
    object.

    Parameters
    ----------
    X: array_like, shape = (n_samples, n_features)
        Input data.
    Xerr: array_like, shape = (n_samples, n_features, n_features)
        Error on input data.
    """
    if isinstance(X, pd.DataFrame):
        if isinstance(X.columns, pd.Index):
            self.labels = np.array(X.columns)
        X = X.values

    if self.method == 'astroML':
        self.GMM.n_components = self.n_components
        self.GMM.n_iter = self.n_iter
        self.GMM.fit(X, Xerr)

        self.V = self.GMM.V
        self.mu = self.GMM.mu
        self.weights = self.GMM.alpha

    if self.method == 'Bovy':
        # Bovy's extreme_deconvolution is only imported if the
        # method is 'Bovy' (installation is somewhat more
        # complicated than astroML, and we don't want it to be
        # required).
        #
        # As with the astroML method, initialize with the
        # scikit-learn GMM initializer.
        from extreme_deconvolution import (extreme_deconvolution
                                           as bovyXD)
        from sklearn.utils import check_random_state

        tmp_gmm = skl_GMM(self.n_components, max_iter=1,
                          covariance_type='full',
                          random_state=self.random_state)
        tmp_gmm._initialize_parameters(
            X, check_random_state(self.random_state))
        self.mu = tmp_gmm.means_
        self.weights = tmp_gmm.weights_
        self.V = tmp_gmm.covariances_

        logl = bovyXD(X, Xerr, self.weights, self.mu, self.V,
                      splitnmerge=self.splitnmerge, tol=self.tol,
                      maxiter=self.n_iter, w=self.w)

        self.GMM.V = self.V
        self.GMM.mu = self.mu
        self.GMM.alpha = self.weights

    return self
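# A minimal usage sketch for fit(). The constructor keywords are
# assumptions inferred from the attributes fit() reads
# (n_components, n_iter, method); X and Xerr are mock data shaped
# as in the test setUp above.
xdgmm = XDGMM(n_components=3, n_iter=100, method='astroML')
xdgmm.fit(X, Xerr)    # X: (n_samples, 2), Xerr: (n_samples, 2, 2)
print(xdgmm.weights)  # fitted component weights
print(xdgmm.mu)       # fitted component means
print(xdgmm.V)        # fitted component covariances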
def plot_cond_model(xdgmm, cond_xdgmm, y):
    plt.clf()
    setup_text_plots(fontsize=16, usetex=True)
    fig = plt.figure(figsize=(12, 9))

    ax1 = fig.add_subplot(111)
    for i in range(xdgmm.n_components):
        draw_ellipse(xdgmm.mu[i], xdgmm.V[i], scales=[2], ax=ax1,
                     ec='None', fc='gray', alpha=0.2)

    ax1.plot([-2, 15], [y, y], color='blue', linewidth=2)
    ax1.set_xlim(-1, 13)
    ax1.set_ylim(-6, 16)
    ax1.set_xlabel('$x$', fontsize=18)
    ax1.set_ylabel('$y$', fontsize=18)

    ax2 = ax1.twinx()

    x = np.array([np.linspace(-2, 14, 1000)]).T

    gmm = skl_GMM(n_components=cond_xdgmm.n_components,
                  covariance_type='full')
    gmm.means_ = cond_xdgmm.mu
    gmm.weights_ = cond_xdgmm.weights
    gmm.covariances_ = cond_xdgmm.V
    gmm.precisions_cholesky_ = np.linalg.cholesky(
        np.linalg.inv(gmm.covariances_))

    # GaussianMixture.score_samples returns only the log
    # probabilities (unlike the deprecated sklearn GMM, which also
    # returned responsibilities)
    logprob = gmm.score_samples(x)
    pdf = np.exp(logprob)

    ax2.plot(x, pdf, color='red', linewidth=2,
             label=r'Cond. dist. of $x$ given $y=' + str(y)
                   + r'\pm 0.05$')
    ax2.legend()
    ax2.set_ylabel('Probability', fontsize=18)
    ax2.set_ylim(0, 0.52)
    ax1.set_xlim(-1, 13)
    plt.show()
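# A hypothetical call sequence for plot_cond_model(). The
# condition() call is an assumption about the XDGMM API, inferred
# from the cond_xdgmm argument above: np.nan marks the free
# dimension and y fixes the conditioned one.
y = 1.5
cond_xdgmm = xdgmm.condition(X_input=np.array([np.nan, y]))
plot_cond_model(xdgmm, cond_xdgmm, y)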
def score_samples(self, X, Xerr):
    """Return the per-sample likelihood of the data under the model

    Uses the scikit-learn GMM.score_samples method to compute the
    log probability of X under the model, and returns the posterior
    probabilities of each mixture component for each element of X.

    Each data point in X is scored separately so that the
    corresponding Xerr array can be folded into the covariance
    matrices and included in the calculation (the scikit-learn GMM
    implementation does not handle errors).

    Parameters
    ----------
    X: array_like, shape = (n_samples, n_features)
        Input data.
    Xerr: array_like, shape = (n_samples, n_features, n_features)
        Error on input data.

    Returns
    -------
    logprob: array_like, shape = (n_samples,)
        Log probabilities of each data point in X.
    responsibilities: array_like, shape = (n_samples, n_components)
        Posterior probabilities of each mixture component for each
        data point in X.
    """
    if self.V is None or self.mu is None or self.weights is None:
        raise ValueError("Model parameters not set.")

    if isinstance(X, pd.DataFrame):
        X = X.values

    tmp_GMM = skl_GMM(self.n_components, max_iter=self.n_iter,
                      covariance_type='full',
                      random_state=self.random_state)
    tmp_GMM.weights_ = self.weights
    tmp_GMM.means_ = self.mu

    # Fold the per-sample errors into the model covariances:
    # T[i, j] = Xerr[i] + V[j]
    X = X[:, np.newaxis, :]
    Xerr = Xerr[:, np.newaxis, :, :]
    T = Xerr + self.V

    logprob = []
    responsibilities = []
    for i in range(X.shape[0]):
        tmp_GMM.covariances_ = T[i]
        tmp_GMM.precisions_ = np.linalg.inv(T[i])
        tmp_GMM.precisions_cholesky_ = np.linalg.cholesky(
            tmp_GMM.precisions_)

        logprob.append(tmp_GMM.score_samples(X[i].reshape(1, -1)))
        responsibilities.append(
            tmp_GMM.predict_proba(X[i].reshape(1, -1)))

    logprob = np.array(logprob)[:, 0]
    responsibilities = np.array(responsibilities)[:, 0]

    return logprob, responsibilities
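# A minimal sketch of scoring data under a fitted model (e.g. the
# xdgmm instance from the fit() example above); shapes follow the
# docstring.
logprob, resp = xdgmm.score_samples(X, Xerr)
print(logprob.shape)     # (n_samples,)
print(resp.shape)        # (n_samples, n_components)
print(resp.sum(axis=1))  # each row of responsibilities sums to 1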