Exemple #1
0
def compute_bb_nll(a, d, mu, theta):
    """Evaluate the negative log-likelihood of a Beta-Binomial model.

    Two limiting cases are handled explicitly: theta == 0 falls back to a
    Binomial likelihood, and theta == inf evaluates a Binomial likelihood on
    the indicators ``d > 0`` and ``a > 0`` (i.e. a Bernoulli model).

    Args:
        a: Vector of successes.
        d: Vector of trials.
        mu: Mean of the distribution. Has to be the same size as a and d and
            lie in [0, 1].
        theta: Non-negative dispersion parameter.

    Returns:
        Negative log-likelihood, summed over all observations.

    Raises:
        ValueError: If mu is outside [0, 1], if a, d and mu differ in size,
            or if theta is negative.
    """
    a, d, mu = (atleast_2d_column(v) for v in (a, d, mu))

    # Validation order matters for which error the caller sees first.
    if ((mu > 1) | (mu < 0)).any():
        raise ValueError('mu has to be between 0 and 1.')
    if not (a.size == d.size == mu.size):
        raise ValueError('a, d and mu have to be of the same size.')
    if theta < 0:
        raise ValueError('theta has to be non-negative.')

    if theta == 0:
        # No overdispersion: plain Binomial.
        return -binom(n=d, p=mu).logpmf(a).sum()

    if np.isinf(theta):
        # Infinite dispersion: only presence/absence of counts is modelled.
        return -binom(n=d > 0, p=mu).logpmf(a > 0).sum()

    # Convert (mean, precision) to the two Beta shape parameters.
    mean_pair = np.hstack([mu, 1 - mu])
    alpha = reparameterize_polya_ms(mean_pair, 1 / theta)
    shape_a = alpha[:, 0, np.newaxis]
    shape_b = alpha[:, 1, np.newaxis]
    return -betabinom(n=d, a=shape_a, b=shape_b).logpmf(a).sum()
Exemple #2
0
    def __init__(self, a, d, E=None, X=None):
        """Creates DaliModule.

        Stores a, d, E and X as 2-dimensional float numpy arrays, keeping
        only entries with nonzero value in d (i.e. non-zero total counts).
        Constant columns are removed from X; if no column remains, X is
        stored as None.

        Args:
            a: Counts for the alternative allele in each cell.
            d: Total counts for both alleles in each cell.
            E: Optional environment / cell-state matrix.
            X: Optional design matrix.

        Raises:
            ValueError: If the first dimensions of a, d, E and X do not
                agree, or if all entries of d are zero.
        """
        n_entries = a.shape[0]
        if n_entries != d.shape[0]:
            msg = ('Dimension mismatch: a and d need '
                   'to have the same number of entries!')
            raise ValueError(msg)
        if (E is not None) and (n_entries != E.shape[0]):
            msg = ('Dimension mismatch: First dimension of E '
                   'has to equal the number of elements in a and d!')
            raise ValueError(msg)
        if (X is not None) and (n_entries != X.shape[0]):
            msg = ('Dimension mismatch: First dimension of X '
                   'has to equal the number of elements in a and d!')
            raise ValueError(msg)
        if d.sum() == 0:
            raise ValueError('All counts are zero!')

        # Boolean mask over the original entries; True where total counts > 0.
        d_full = atleast_2d_column(d).astype(float)
        self.idsnonzero = (d_full > 0).flatten()

        self.d = d_full[self.idsnonzero, :]
        self.a = atleast_2d_column(a)[self.idsnonzero, :].astype(float)

        if E is not None:
            self.E = atleast_2d_column(E)[self.idsnonzero, :].astype(float)
            self.k = self.E.shape[1]
        else:
            self.E = None
            self.k = 0

        if X is not None:
            X = atleast_2d_column(X)[self.idsnonzero, :].astype(float)
            # A column is constant when every row equals the first row.
            non_constant = ~(X == X[0, :]).all(axis=0)
            if not non_constant.all():
                print('Warning: Removing constant columns from X.')
                if non_constant.any():
                    X = atleast_2d_column(X[:, non_constant])
                else:
                    X = None
        self.X = X

        # Number of retained (non-zero count) entries.
        self.n = self.a.shape[0]
Exemple #3
0
def run_model(
        model,
        A, D,
        init_kwargs=None,
        fit_kwargs=None,
        callbacks=None,
        show_progress=True,
        verbose=True):
    """Fit a DaliModule for each column of A and D.

    For each column in D with non-zero counts, this function creates a model,
    calls model.fit() and executes additional callbacks. Columns of D that sum
    to zero are skipped and contribute a list of None placeholders instead.

    Args:
        model: Model to run on each region.
        A: cell-by-region matrix of counts for the alternative allele.
        D: cell-by-region matrix of counts for both alleles (total counts).
        init_kwargs: Additional keyword arguments for the model initialization,
            e.g. cell-state variables. Defaults to no extra arguments.
        fit_kwargs: Additional keyword arguments for model.fit(). Defaults to
            no extra arguments.
        callbacks: List of callbacks to be executed after fitting the model.
            A callback is a function operating on DaliModule objects. For
            example, to run additional functions or extract fitted parameters.
            Defaults to no callbacks.
        show_progress: Show progressbar.
        verbose: Be verbose.

    Returns:
        List with one entry per region; each entry is the list of callback
        return values for that region (None for every callback if the region
        was skipped).
    """
    # None sentinels avoid sharing mutable default objects across calls.
    init_kwargs = {} if init_kwargs is None else init_kwargs
    fit_kwargs = {} if fit_kwargs is None else fit_kwargs
    callbacks = [] if callbacks is None else callbacks

    A = atleast_2d_column(A)
    D = atleast_2d_column(D)
    n_regions = D.shape[1]

    results = []
    region_ids = trange(n_regions) if show_progress else range(n_regions)
    for i in region_ids:
        if D[:, i].sum() == 0:
            # Nothing to fit: keep the output aligned with the regions.
            if verbose:
                print("Warning: Zero column in D, appending None")
            results.append([None] * len(callbacks))
            continue

        # Create and fit the model for the i-th column.
        mod = model(A[:, i], D[:, i], **init_kwargs)
        mod.fit(**fit_kwargs)

        # Run callbacks on the fitted model.
        results.append([cb(mod) for cb in callbacks])
    return results
Exemple #4
0
def create_rbf_kernel(E, lengthscale=1):
    """Creates radial basis kernel matrix.

    Entry (i, j) is exp(-||e_i - e_j||^2 / lengthscale^2), where e_i is the
    i-th row of E. Note that there is no factor 1/2 in the exponent.

    Args:
        E: Matrix whose rows are the points to compare.
        lengthscale: Kernel lengthscale.

    Returns:
        Square kernel matrix with one row and column per row of E.
    """
    E = atleast_2d_column(E)
    row_sq_norms = (E ** 2).sum(axis=-1)
    gram = E.dot(E.T)
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>
    sq_dists = row_sq_norms[:, None] + row_sq_norms[None, :] - 2 * gram
    neg_inv_sq_scale = -1 / lengthscale**2
    return np.exp(neg_inv_sq_scale * sq_dists)
Exemple #5
0
 def compute_posterior(self, E=None, full_cov=False):
     """Computes the mean and variances of the posterior over latent rates.

     Args:
         E: Inputs at which to evaluate the posterior. Falls back to the
             stored self.E when None.
         full_cov: Forwarded to the model's predict_f. When set, only the
             first slice along the leading axis of the returned covariance
             is kept.

     Returns:
         Tuple (mean, covariance or variance), both as float32 numpy arrays.
     """
     if E is None:
         E = self.E
     else:
         E = atleast_2d_column(E)

     inputs = E.astype(np.float64)
     mu, covar = self.model.predict_f(inputs, full_cov=full_cov)
     if full_cov:
         covar = covar[0, :, :]

     post_mean = mu.numpy().astype(np.float32)
     post_cov = covar.numpy().astype(np.float32)
     return post_mean, post_cov
Exemple #6
0
def simulate_beta_binomial(K,
                           D,
                           sigma2,
                           theta,
                           mu=0,
                           invlink=logistic,
                           seed=None):
    """Simulates from binomial Gaussian process with Beta latent noise.

    Args:
        K: Cell-state kernel, for example as generated by create_linear_kernel
            or create_rbf_kernel.
        D: Array of total counts.
        sigma2: Kernel variance component. If zero, the latent values are
            fixed at mu and nothing is sampled from the Gaussian process.
        theta: Dispersion parameter. If not positive, sample from a regular
            Binomial distribution instead.
        mu: Optional fixed effects, passed through invlink. Defaults to zero.
        invlink: Inverse link function. Defaults to logistic.
        seed: Random seed.

    Returns:
        Dictionary with the alternative counts ('A'), the Beta means
        ('beta_mean') and the sampled binomial means ('binomial_mean').
    """
    D = atleast_2d_column(D)
    n_rows, n_cols = D.shape
    rng = np.random.default_rng(seed)

    if sigma2 != 0:
        # presumably draws one latent vector per column of D from
        # N(mu, sigma2 * K) -- TODO confirm against _sample_normal
        mean_vec = mu * np.ones((n_rows, 1))
        latent = _sample_normal(n_cols, mean_vec, sigma2 * K, rng)
    else:
        latent = mu * np.ones((n_rows, n_cols))

    beta_mean = invlink(latent)

    # Without dispersion the binomial mean is the Beta mean itself.
    binomial_mean = beta_mean
    if theta > 0:
        shape_a = beta_mean / theta
        shape_b = (1 - beta_mean) / theta
        binomial_mean = rng.beta(a=shape_a, b=shape_b)

    counts = rng.binomial(n=D, p=binomial_mean)
    return {
        'A': counts,
        'beta_mean': beta_mean,
        'binomial_mean': binomial_mean,
    }
Exemple #7
0
def run_interpolation(A,
                      D,
                      cell_state,
                      kernel='Linear',
                      num_inducing=800,
                      maxiter=2000,
                      return_prior_mean=False,
                      n_cores=1):
    """Run scDALI interpolation of allelic rates for each region.

    A, D are assumed to be n-by-d, where n is the number of cells and d the
    number of regions to model.

    Regions for which no model was fitted (run_model yields None placeholders
    for columns of D that sum to zero) are reported as NaN instead of raising.

    Args:
        A: Alternative counts for each cell and region.
        D: Total counts for each cell and region.
        cell_state: Matrix of cell states, e.g. clusters or coordinates
            in a low-dimensional cell-state space.
        kernel: Kernel function for GP interpolation, e.g. 'Linear' or 'RBF'.
        num_inducing: Number of inducing points for the GP model.
        maxiter: Max iterations for GP optimization.
        return_prior_mean: Return the estimated GP prior mean.
        n_cores: Number of cores to use.

    Returns:
        Dictionary with 'posterior_mean' and 'posterior_var' (one column per
        region) and, if return_prior_mean is set, 'prior_mean' (one float per
        region).

    Raises:
        ValueError: If A and D differ in shape or cell_state is None.
    """
    from scdali.models.gp import SparseGP

    D = atleast_2d_column(D)
    A = atleast_2d_column(A)

    if A.shape != D.shape:
        raise ValueError('A and D need to be of the same shape.')

    if cell_state is None:
        raise ValueError('Interpolation requires cell_state to be specified')

    init_kwargs = {
        'kernel': kernel,
        'num_inducing': num_inducing,
        'E': cell_state,
    }
    fit_kwargs = {'maxiter': maxiter}

    n_cores = min(n_cores, D.shape[1])
    print('[scdali] Processing %d regions on %d core(s) ... ' %
          (D.shape[1], n_cores),
          flush=True)

    callbacks = [create_method_callback('compute_posterior', E=cell_state)]
    if return_prior_mean:
        callbacks.append(create_method_callback('get_prior_mean'))

    # Per-worker progress bars are only shown in the single-core case.
    show_progress = n_cores <= 1
    f = partial(run_model,
                SparseGP,
                init_kwargs=init_kwargs,
                fit_kwargs=fit_kwargs,
                callbacks=callbacks,
                show_progress=show_progress)
    results = process_parallel(f, mat_dict={'A': A, 'D': D}, n_cores=n_cores)

    # Build a NaN column matching the fitted regions' output length and dtype;
    # fall back to one entry per cell if no region was fitted at all.
    fitted = [r[0] for r in results if r[0] is not None]
    if fitted:
        nan_column = np.full_like(fitted[0][0].flatten(), np.nan)
    else:
        nan_column = np.full(A.shape[0], np.nan)

    means = []
    variances = []
    for r in results:
        posterior = r[0]
        if posterior is None:
            # Region skipped by run_model (no counts): no posterior available.
            means.append(nan_column)
            variances.append(nan_column)
        else:
            means.append(posterior[0].flatten())
            variances.append(posterior[1].flatten())

    out = dict()
    out['posterior_mean'] = np.asarray(means).T
    out['posterior_var'] = np.asarray(variances).T
    if return_prior_mean:
        out['prior_mean'] = [
            np.nan if r[1] is None else float(r[1]) for r in results
        ]
    return out
Exemple #8
0
def run_tests(A,
              D,
              model,
              X=None,
              cell_state=None,
              return_rho=False,
              base_rate=None,
              n_cores=1):
    """Run scDALI hypothesis tests for each region.

    A, D are assumed to be n-by-d, where n is the number of cells and d the
    number of regions to model.

    Regions for which no model was fitted (run_model yields None placeholders
    for columns of D that sum to zero) are reported as NaN.

    Args:
        A: Alternative counts for each cell and region.
        D: Total counts for each cell and region.
        model: String indicating the model to run. Options are
            'scDALI-Joint' - a Beta-Binomial variance component score test to
                test for either heterogeneous or homogeneous allelic
                imbalance. Requires cell_state and base_rate.
            'scDALI-Het' - a Beta-Binomial variance component score test to
                test for heterogeneous allelic imbalance.
                Requires cell_state.
            'scDALI-Hom' - a Beta-Binomial variance component score test to
                test for homogeneous allelic imbalance.
                Requires base_rate.
        X: Optional design matrix.
        cell_state: Matrix of cell states, e.g. clusters or coordinates
            in a low-dimensional cell-state space.
        return_rho: When model is scDALI-Joint, this flag indicates whether to
            return rho, the fraction of allelic variation explained by global
            imbalance.
        base_rate: Null allelic rate.
        n_cores: Number of cores to use.

    Returns:
        Dictionary with 'pvalues' for each region and, when model is
        'scDALI-Joint' and return_rho is set, 'rhos'.

    Raises:
        ValueError: If A and D differ in shape, the model name is unknown, or
            a required cell_state / base_rate is missing.
    """
    D = atleast_2d_column(D)
    A = atleast_2d_column(A)

    if X is not None:
        X = atleast_2d_column(X)

    if A.shape != D.shape:
        raise ValueError('A and D need to be of the same shape.')

    try:
        m = MODELS[model]
    except KeyError:
        msg = ('Model not recognized. Choices are ' +
               ', '.join(MODELS.keys()) + '.')
        # The KeyError adds nothing for the caller; suppress the chained context.
        raise ValueError(msg) from None

    needs_cell_state = model in ['scDALI-Joint', 'scDALI-Het']
    needs_base_rate = model in ['scDALI-Joint', 'scDALI-Hom']

    if needs_cell_state and cell_state is None:
        raise ValueError('%s requires cell_state to be specified' % model)
    if needs_base_rate and base_rate is None:
        raise ValueError('%s requires base_rate to be specified' % model)

    init_kwargs = {}
    fit_kwargs = {}
    if needs_base_rate:
        init_kwargs['base_rate'] = base_rate
    if needs_cell_state:
        init_kwargs['E'] = cell_state
    init_kwargs['X'] = X

    n_cores = min(n_cores, D.shape[1])
    print('[scdali] Processing %d regions on %d core(s) ... ' %
          (D.shape[1], n_cores),
          flush=True)

    if model == 'scDALI-Joint':
        test_callback = create_method_callback('test', return_rho=return_rho)
    else:
        test_callback = create_method_callback('test')
    callbacks = [test_callback]

    # Per-worker progress bars are only shown in the single-core case.
    show_progress = n_cores <= 1
    f = partial(run_model,
                m,
                init_kwargs=init_kwargs,
                fit_kwargs=fit_kwargs,
                callbacks=callbacks,
                show_progress=show_progress)
    results = process_parallel(f, mat_dict={'A': A, 'D': D}, n_cores=n_cores)

    out = dict()
    if model == 'scDALI-Joint' and return_rho:
        # Each fitted region yields a (pvalue, rho) pair; skipped regions
        # (None placeholder) become a pair of NaNs.
        pairs = [(np.nan, np.nan) if r[0] is None else r[0] for r in results]
        out['pvalues'] = np.asarray([pair[0] for pair in pairs]).T
        out['rhos'] = np.asarray([pair[1] for pair in pairs]).T
    else:
        out['pvalues'] = np.asarray(
            [np.nan if r[0] is None else r[0] for r in results]).T
    return out
Exemple #9
0
def fit_bb_glm(a, d, X, offset=0, theta=None, maxiter=100, tol=1e-5):
    """Fits generalized linear model with Beta-Binomial likelihood.

    Uses iteratively reweighted least squares / Fisher scoring with a
    logistic mean function.

    If theta is None and all observed rates a / d are exactly 0 or 1, the data
    are treated as Bernoulli: d is replaced by the indicator d > 0, the fit is
    run with theta fixed at 0 and theta is reported as inf.

    Args:
        a: Vector of successes.
        d: Vector of trials.
        X: Design matrix.
        offset: Untrainable offset parameter.
        theta: Dispersion parameter. If None, estimate alternatingly.
        maxiter: Maximum number of iterations.
        tol: Break if mean absolute change in estimated parameters is below
            tol.

    Returns:
        Tuple of regression coefficients, dispersion parameter and the
        zero-based index of the last iteration. On convergence the returned
        coefficients are those from before the final (sub-tolerance) update.
    """
    from numpy_sugar.linalg import rsolve

    a = atleast_2d_column(a)
    d = atleast_2d_column(d)
    X = atleast_2d_column(X)

    y = a / d

    estimate_theta = theta is None

    is_bernoulli = bool(np.array_equal(y, y.astype(bool)) and estimate_theta)
    if is_bernoulli:
        # Binary outcomes: one trial wherever counts were observed, and no
        # dispersion to estimate.
        d = (d > 0).astype(float)
        theta = 0
        estimate_theta = False

    # Success / failure counts, only needed for the dispersion updates.
    counts = np.hstack([a, d - a]) if estimate_theta else None

    # Ordinary least squares start.
    beta = rsolve(X.T @ X, X.T @ y)
    for i in range(maxiter):
        eta = X @ beta + offset
        mu = logistic(eta)

        if estimate_theta:
            mean_pair = np.hstack([mu, 1 - mu])
            # Allow the inner precision fit more iterations in later rounds,
            # capped at 1000.
            inner_maxiter = min(10**(i + 1), 1000)
            s, _ = fit_polya_precision(
                data=counts, m=mean_pair, maxiter=inner_maxiter)
            theta = 1 / s

        # Working response of the IRLS step (offset removed again).
        gprime = 1 / ((1 - mu) * mu)
        z = eta + gprime * (y - mu) - offset

        # Working weights.
        weights = d * mu * (1 - mu) * (theta + 1)
        weights = weights / (d * theta + 1)

        weighted_Xt = (weights * X).T
        beta_new = rsolve(weighted_Xt @ X, weighted_Xt @ z)

        if np.abs(beta - beta_new).mean() < tol:
            break

        beta = beta_new
    else:
        # Loop ran out of iterations without meeting the tolerance.
        print('Warning: Model did not converge. Try increasing maxiter.')

    if is_bernoulli:
        theta = np.inf
    return beta, theta, i