def compute_bb_nll(a, d, mu, theta):
    """Computes negative log-likelihood for the Beta-Binomial model.

    Covers the limit cases theta = 0 (Binomial) and theta = inf (Bernoulli).

    Args:
        a: Vector of successes.
        d: Vector of trials.
        mu: Mean of the distribution. Has to be the same shape as a and d.
        theta: Dispersion parameter.

    Returns:
        Negative log-likelihood.
    """
    a = atleast_2d_column(a)
    d = atleast_2d_column(d)
    mu = atleast_2d_column(mu)
    if (mu > 1).any() or (mu < 0).any():
        raise ValueError('mu has to be between 0 and 1.')
    if a.size != d.size or a.size != mu.size:
        raise ValueError('a, d and mu have to be of the same size.')
    if theta < 0:
        raise ValueError('theta has to be non-negative.')

    if theta == 0:
        # Binomial limit.
        nll = -binom(n=d, p=mu).logpmf(a).sum()
    elif np.isinf(theta):
        # Bernoulli limit: each cell reduces to a single trial (a > 0 given d > 0).
        nll = -binom(n=d > 0, p=mu).logpmf(a > 0).sum()
    else:
        alpha = reparameterize_polya_ms(np.hstack([mu, 1 - mu]), 1 / theta)
        nll = -betabinom(
            n=d,
            a=alpha[:, 0, np.newaxis],
            b=alpha[:, 1, np.newaxis]).logpmf(a).sum()
    return nll

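# Illustrative usage sketch (not part of the original module): evaluate the
# Beta-Binomial NLL on toy count data and compare it against the Binomial
# limit theta = 0. Assumes numpy/scipy are installed and that compute_bb_nll
# and the helpers it uses are importable from this module.
def _example_compute_bb_nll():
    import numpy as np
    a = np.array([3, 1, 0, 5])       # successes (e.g. alternative counts)
    d = np.array([10, 4, 2, 6])      # trials (e.g. total counts)
    mu = np.full(4, 0.5)             # balanced mean rate
    nll_bb = compute_bb_nll(a, d, mu, theta=0.1)     # overdispersed model
    nll_binom = compute_bb_nll(a, d, mu, theta=0.0)  # Binomial limit
    # With positive dispersion the likelihood is less peaked, so the two
    # values differ for the same mu.
    return nll_bb, nll_binom
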
def __init__(self, a, d, E=None, X=None):
    """Creates DaliModule.

    Stores a, d, E and X as 2-dimensional numpy arrays, keeping only
    entries with a nonzero value in d (i.e. non-zero total counts).

    Args:
        a: Counts for the alternative allele in each cell.
        d: Total counts for both alleles in each cell.
        E: Optional environment / cell-state matrix.
        X: Optional design matrix.
    """
    if a.shape[0] != d.shape[0]:
        msg = ('Dimension mismatch: a and d need '
            'to have the same number of entries!')
        raise ValueError(msg)
    if (E is not None) and (a.shape[0] != E.shape[0]):
        msg = ('Dimension mismatch: First dimension of E '
            'has to equal the number of elements in a and d!')
        raise ValueError(msg)
    if (X is not None) and (a.shape[0] != X.shape[0]):
        msg = ('Dimension mismatch: First dimension of X '
            'has to equal the number of elements in a and d!')
        raise ValueError(msg)
    if d.sum() == 0:
        raise ValueError('All counts are zero!')

    self.d = atleast_2d_column(d).astype(float)
    self.idsnonzero = (self.d > 0).flatten()
    self.d = self.d[self.idsnonzero, :]
    self.a = atleast_2d_column(a)[self.idsnonzero, :].astype(float)

    if E is not None:
        self.E = atleast_2d_column(E)[self.idsnonzero, :].astype(float)
        self.k = self.E.shape[1]
    else:
        self.E = None
        self.k = 0

    if X is not None:
        X = atleast_2d_column(X)[self.idsnonzero, :].astype(float)
        # Drop columns that are constant across cells to keep the design
        # matrix full rank.
        non_constant = ~(X[0, :, np.newaxis] == X.T).T.all(0)
        if non_constant.sum() < X.shape[1]:
            print('Warning: Removing constant columns from X.')
        if non_constant.sum() == 0:
            X = None
        else:
            X = atleast_2d_column(X[:, non_constant])
        self.X = X
    else:
        self.X = None

    self.n = self.a.shape[0]

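# Illustrative note (not part of the original module): the constructor drops
# cells with zero total counts, so all stored attributes are aligned to the
# retained cells only. model_cls is a hypothetical placeholder for a concrete
# class implementing this DaliModule interface; the exact class name is not
# shown in this excerpt.
def _example_dali_module_init(model_cls, a, d, cell_state):
    mod = model_cls(a, d, E=cell_state)
    # mod.n counts only cells with d > 0; mod.a, mod.d and mod.E are the
    # filtered inputs, stored as 2-dimensional float arrays.
    return mod.n, mod.a.shape, mod.d.shape
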
def run_model(
        model,
        A,
        D,
        init_kwargs={},
        fit_kwargs={},
        callbacks=list(),
        show_progress=True,
        verbose=True):
    """Fit a DaliModule for each column of A and D.

    For each column in D with non-zero counts, this function creates a
    model, calls model.fit() and executes additional callbacks.

    Args:
        model: Model to run on each region.
        A: cell-by-region matrix of counts for the alternative allele.
        D: cell-by-region matrix of counts for both alleles (total counts).
        init_kwargs: Additional keyword arguments for the model
            initialization, e.g. cell-state variables.
        fit_kwargs: Additional keyword arguments for model.fit().
        callbacks: List of callbacks to be executed after fitting the
            model. A callback is a function operating on DaliModule
            objects, for example to run additional computations or
            extract fitted parameters.
        show_progress: Show progress bar.
        verbose: Be verbose.

    Returns:
        List of callback return values for each region.
    """
    A = atleast_2d_column(A)
    D = atleast_2d_column(D)
    n_regions = D.shape[1]

    results = list()
    pb = trange(n_regions) if show_progress else range(n_regions)
    for i in pb:
        if D[:, i].sum() == 0:
            if verbose:
                print("Warning: Zero column in D, appending None")
            results.append([None for cb in callbacks])
            continue
        # create model for i-th column
        mod = model(A[:, i], D[:, i], **init_kwargs)
        # fit model
        mod.fit(**fit_kwargs)
        # run callbacks on fitted model
        region_results = list()
        for cb in callbacks:
            region_results.append(cb(mod))
        results.append(region_results)
    return results

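# Illustrative usage sketch (not part of the original module): run_model loops
# over regions, so any per-region logic lives in callbacks. model_cls is a
# hypothetical placeholder for a class implementing the DaliModule interface
# shown above (constructor plus fit()); the callback calls the fitted model's
# test() method, as run_tests below does via create_method_callback.
def _example_run_model(model_cls, A, D, cell_state):
    callbacks = [lambda mod: mod.test()]
    results = run_model(
        model_cls, A, D,
        init_kwargs={'E': cell_state},
        callbacks=callbacks,
        show_progress=False)
    # One list of callback results per region; None entries for all-zero
    # columns of D.
    return results
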
def create_rbf_kernel(E, lengthscale=1):
    """Creates radial basis kernel matrix."""
    E = atleast_2d_column(E)
    E_norm = np.sum(E**2, axis=-1)
    l = -1 / lengthscale**2
    K = np.exp(l * (E_norm[:, None] + E_norm[None, :] - 2 * np.dot(E, E.T)))
    return K

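# Illustrative usage sketch (not part of the original module): build an RBF
# kernel from a 2-dimensional cell-state embedding. Assumes numpy is available
# and that atleast_2d_column (used throughout this module) is in scope.
def _example_create_rbf_kernel():
    import numpy as np
    rng = np.random.default_rng(0)
    E = rng.normal(size=(100, 2))          # e.g. 2D cell-state coordinates
    K = create_rbf_kernel(E, lengthscale=2.0)
    # K is a 100-by-100 symmetric matrix with ones on the diagonal.
    return K
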
def compute_posterior(self, E=None, full_cov=False):
    """Computes the mean and variances of the posterior over latent rates."""
    E = self.E if E is None else atleast_2d_column(E)
    mu, covar = self.model.predict_f(E.astype(np.float64), full_cov=full_cov)
    if full_cov:
        covar = covar[0, :, :]
    return mu.numpy().astype(np.float32), covar.numpy().astype(np.float32)

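# Illustrative usage sketch (not part of the original module): query the
# posterior at new cell states after fitting. fitted_gp is a hypothetical
# placeholder for a fitted GP model exposing the compute_posterior method
# above (e.g. the SparseGP used by run_interpolation below).
def _example_compute_posterior(fitted_gp, new_cell_state):
    mean, var = fitted_gp.compute_posterior(E=new_cell_state)
    # mean and var are float32 arrays of posterior means and marginal
    # variances; pass full_cov=True to obtain the full posterior covariance.
    return mean, var
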
def simulate_beta_binomial(
        K, D, sigma2, theta, mu=0, invlink=logistic, seed=None):
    """Simulates from a binomial Gaussian process with Beta latent noise.

    Args:
        K: Cell-state kernel, for example as generated by
            create_linear_kernel or create_rbf_kernel.
        D: Array of total counts.
        sigma2: Kernel variance component.
        theta: Dispersion parameter. If zero, sample from a regular
            Binomial distribution instead.
        mu: Optional fixed effects on a logit scale. Defaults to zero,
            which corresponds to a binomial mean of 0.5.
        invlink: Inverse link function. Defaults to the logistic
            (inverse logit) function.
        seed: Random seed.

    Returns:
        Dictionary with alternative counts ('A'), latent Beta means
        ('beta_mean') and sampled binomial means ('binomial_mean').
    """
    D = atleast_2d_column(D)
    n, p = D.shape
    rng = np.random.default_rng(seed)

    if sigma2 == 0:
        latent = mu * np.ones((n, p))
    else:
        mu = mu * np.ones((n, 1))
        latent = _sample_normal(p, mu, sigma2 * K, rng)

    beta_mean = invlink(latent)
    if theta > 0:
        # Add Beta-distributed noise around the latent mean.
        binomial_mean = rng.beta(a=beta_mean / theta, b=(1 - beta_mean) / theta)
    else:
        binomial_mean = beta_mean
    a = rng.binomial(n=D, p=binomial_mean)
    return {'A': a, 'beta_mean': beta_mean, 'binomial_mean': binomial_mean}

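# Illustrative usage sketch (not part of the original module): simulate
# allelic counts whose rates vary smoothly over a cell-state embedding by
# combining create_rbf_kernel with simulate_beta_binomial. Total counts D
# are drawn from a Poisson purely for illustration.
def _example_simulate_beta_binomial():
    import numpy as np
    rng = np.random.default_rng(1)
    E = rng.normal(size=(200, 2))               # cell-state coordinates
    K = create_rbf_kernel(E, lengthscale=1.0)   # smooth cell-state kernel
    D = rng.poisson(lam=5, size=(200, 1)) + 1   # total counts per cell
    sim = simulate_beta_binomial(
        K, D, sigma2=0.5, theta=0.05, mu=0, seed=1)
    # sim['A'] holds simulated alternative counts; sim['binomial_mean'] the
    # per-cell allelic rates used to draw them.
    return sim
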
def run_interpolation(
        A, D, cell_state,
        kernel='Linear',
        num_inducing=800,
        maxiter=2000,
        return_prior_mean=False,
        n_cores=1):
    """Run scDALI interpolation of allelic rates for each region.

    A, D are assumed to be n-by-d, where n is the number of cells and
    d the number of regions to model.

    Args:
        A: Alternative counts for each cell and region.
        D: Total counts for each cell and region.
        cell_state: Matrix of cell states, e.g. clusters or coordinates
            in a low-dimensional cell-state space.
        kernel: Kernel function for GP interpolation, e.g. 'Linear' or 'RBF'.
        num_inducing: Number of inducing points for the GP model.
        maxiter: Max iterations for GP optimization.
        return_prior_mean: Return the estimated GP prior mean.
        n_cores: Number of cores to use.

    Returns:
        Estimated posterior means and variances for each region.
    """
    from scdali.models.gp import SparseGP

    D = atleast_2d_column(D)
    A = atleast_2d_column(A)
    if A.shape != D.shape:
        raise ValueError('A and D need to be of the same shape.')
    if cell_state is None:
        raise ValueError('Interpolation requires cell_state to be specified.')

    init_kwargs = {}
    fit_kwargs = {}
    init_kwargs['kernel'] = kernel
    init_kwargs['num_inducing'] = num_inducing
    fit_kwargs['maxiter'] = maxiter
    init_kwargs['E'] = cell_state

    n_cores = min(n_cores, D.shape[1])
    print('[scdali] Processing %d regions on %d core(s) ... '
        % (D.shape[1], n_cores), flush=True)

    callbacks = []
    callbacks.append(create_method_callback('compute_posterior', E=cell_state))
    if return_prior_mean:
        callbacks.append(create_method_callback('get_prior_mean'))

    show_progress = False if n_cores > 1 else True
    f = partial(
        run_model,
        SparseGP,
        init_kwargs=init_kwargs,
        fit_kwargs=fit_kwargs,
        callbacks=callbacks,
        show_progress=show_progress)
    results = process_parallel(f, mat_dict={'A': A, 'D': D}, n_cores=n_cores)

    out = dict()
    out['posterior_mean'] = np.asarray(
        [r[0][0].flatten() for r in results]).T
    out['posterior_var'] = np.asarray(
        [r[0][1].flatten() for r in results]).T
    if return_prior_mean:
        out['prior_mean'] = [float(r[1]) for r in results]
    return out

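# Illustrative usage sketch (not part of the original module): interpolate
# allelic rates over a cell-state embedding for a cell-by-region count
# matrix. Assumes the scdali package and its GP dependencies are installed;
# A, D and cell_state follow the shapes documented above.
def _example_run_interpolation(A, D, cell_state):
    out = run_interpolation(
        A, D, cell_state,
        kernel='RBF',
        num_inducing=200,
        maxiter=500)
    # Cell-by-region arrays of posterior means and variances of the latent
    # allelic rate.
    return out['posterior_mean'], out['posterior_var']
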
def run_tests(
        A, D, model,
        X=None,
        cell_state=None,
        return_rho=False,
        base_rate=None,
        n_cores=1):
    """Run scDALI hypothesis tests for each region.

    A, D are assumed to be n-by-d, where n is the number of cells and
    d the number of regions to model.

    Args:
        A: Alternative counts for each cell and region.
        D: Total counts for each cell and region.
        model: String indicating the model to run. Options are
            'scDALI-Joint' - a Beta-Binomial variance component score
                test for either heterogeneous or homogeneous allelic
                imbalance. Requires cell_state and base_rate.
            'scDALI-Het' - a Beta-Binomial variance component score
                test for heterogeneous allelic imbalance. Requires
                cell_state.
            'scDALI-Hom' - a Beta-Binomial variance component score
                test for homogeneous allelic imbalance. Requires
                base_rate.
        X: Optional design matrix.
        cell_state: Matrix of cell states, e.g. clusters or coordinates
            in a low-dimensional cell-state space.
        return_rho: When model is scDALI-Joint, this flag indicates
            whether to return rho, the fraction of allelic variation
            explained by global imbalance.
        base_rate: Null allelic rate.
        n_cores: Number of cores to use.

    Returns:
        p-values for each region.
    """
    D = atleast_2d_column(D)
    A = atleast_2d_column(A)
    if X is not None:
        X = atleast_2d_column(X)
    if A.shape != D.shape:
        raise ValueError('A and D need to be of the same shape.')

    try:
        m = MODELS[model]
    except KeyError:
        msg = ('Model not recognized. Choices are '
            + ', '.join(MODELS.keys()) + '.')
        raise ValueError(msg)
    if model in ['scDALI-Joint', 'scDALI-Het'] and cell_state is None:
        raise ValueError('%s requires cell_state to be specified' % model)
    if model in ['scDALI-Joint', 'scDALI-Hom'] and base_rate is None:
        raise ValueError('%s requires base_rate to be specified' % model)

    init_kwargs = {}
    fit_kwargs = {}
    if model in ['scDALI-Joint', 'scDALI-Hom']:
        init_kwargs['base_rate'] = base_rate
    if model in ['scDALI-Joint', 'scDALI-Het']:
        init_kwargs['E'] = cell_state
    init_kwargs['X'] = X

    n_cores = min(n_cores, D.shape[1])
    print('[scdali] Processing %d regions on %d core(s) ... '
        % (D.shape[1], n_cores), flush=True)

    callbacks = []
    if model == 'scDALI-Joint':
        callbacks.append(create_method_callback('test', return_rho=return_rho))
    else:
        callbacks.append(create_method_callback('test'))

    show_progress = False if n_cores > 1 else True
    f = partial(
        run_model,
        m,
        init_kwargs=init_kwargs,
        fit_kwargs=fit_kwargs,
        callbacks=callbacks,
        show_progress=show_progress)
    results = process_parallel(f, mat_dict={'A': A, 'D': D}, n_cores=n_cores)

    out = dict()
    if model == 'scDALI-Joint' and return_rho:
        out['pvalues'] = np.asarray([r[0][0] for r in results]).T
        out['rhos'] = np.asarray([r[0][1] for r in results]).T
    else:
        out['pvalues'] = np.asarray([r[0] for r in results]).T
    return out

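# Illustrative usage sketch (not part of the original module): test each
# region for heterogeneous allelic imbalance across a cell-state embedding.
# scDALI-Hom and scDALI-Joint would additionally require base_rate (e.g. 0.5
# for a balanced null).
def _example_run_tests(A, D, cell_state):
    out = run_tests(A, D, model='scDALI-Het', cell_state=cell_state)
    # One p-value per region (column of A and D).
    return out['pvalues']
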
def fit_bb_glm(a, d, X, offset=0, theta=None, maxiter=100, tol=1e-5):
    """Fits a generalized linear model with Beta-Binomial likelihood.

    Uses iteratively reweighted least squares / Fisher scoring.

    Args:
        a: Vector of successes.
        d: Vector of trials.
        X: Design matrix.
        offset: Untrainable offset parameter.
        theta: Dispersion parameter. If None, estimate alternatingly.
        maxiter: Maximum number of iterations.
        tol: Break if the mean absolute change in estimated parameters
            is below tol.

    Returns:
        Regression coefficients, estimated dispersion parameter and
        number of iterations.
    """
    from numpy_sugar.linalg import rsolve
    a = atleast_2d_column(a)
    d = atleast_2d_column(d)
    X = atleast_2d_column(X)
    y = a / d

    fit_dispersion = theta is None
    is_bernoulli = False
    if np.array_equal(y, y.astype(bool)) and fit_dispersion:
        # Binary outcomes: the Beta-Binomial degenerates to a Bernoulli
        # model, so fix theta here and restore it to infinity at the end.
        is_bernoulli = True
        d = (d > 0).astype(float)
        theta = 0
        fit_dispersion = False
    if fit_dispersion:
        data = np.hstack([a, d - a])

    beta = rsolve(X.T @ X, X.T @ y)
    converged = False
    for i in range(maxiter):
        eta = X @ beta + offset
        mu = logistic(eta)
        if fit_dispersion:
            m = np.hstack([mu, 1 - mu])
            # Gradually increase the iteration budget for the dispersion fit
            # (avoid shadowing the outer maxiter parameter).
            polya_maxiter = min(10**(i + 1), 1000)
            (s, niter) = fit_polya_precision(data=data, m=m, maxiter=polya_maxiter)
            theta = 1 / s
        gprime = 1 / ((1 - mu) * mu)
        z = eta + gprime * (y - mu) - offset
        W = d * mu * (1 - mu) * (theta + 1)
        W = W / (d * theta + 1)
        XW = (W * X).T
        beta_new = rsolve(XW @ X, XW @ z)
        if np.abs(beta - beta_new).mean() < tol:
            converged = True
            break
        beta = beta_new
    if not converged:
        print('Warning: Model did not converge. Try increasing maxiter.')
    if is_bernoulli:
        theta = np.inf
    return beta, theta, i

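# Illustrative usage sketch (not part of the original module): fit a
# Beta-Binomial GLM with an intercept plus one covariate and estimate the
# dispersion alternatingly (theta=None). Assumes numpy and numpy_sugar are
# installed and the helpers used by fit_bb_glm are in scope.
def _example_fit_bb_glm():
    import numpy as np
    rng = np.random.default_rng(2)
    n = 500
    x = rng.normal(size=(n, 1))
    X = np.hstack([np.ones((n, 1)), x])        # intercept + covariate
    d = rng.poisson(lam=20, size=(n, 1)) + 1   # trials
    p = 1 / (1 + np.exp(-(0.2 + 0.8 * x)))     # true success probability
    a = rng.binomial(n=d, p=p)                 # successes
    beta, theta, n_iter = fit_bb_glm(a, d, X, theta=None)
    # beta should approximate (0.2, 0.8); theta should be small, since the
    # data were simulated without overdispersion.
    return beta, theta, n_iter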