Exemple #1
0
def estimate(y, lik, K, M=None, verbose=True):
    """
    Estimate variance components of a (generalised) linear mixed model.

    The inputs are passed through ``conform_dataset`` and checked with
    ``assert_finite``. A restricted ``LMM`` is fitted when the likelihood name is
    ``"normal"``; any other likelihood is fitted with ``GLMMExpFam``.

    Parameters
    ----------
    y : array_like
        Outcome.
    lik : str or tuple
        Likelihood specification. It is normalised with ``normalize_likelihood``
        and its first element is used as the likelihood name.
    K : array_like or None
        Covariance matrix. If the conformed ``K`` is ``None``, no decomposition is
        computed and ``None`` is handed to the model instead.
    M : array_like, optional
        Covariates. Defaults to ``None``, in which case whatever
        ``conform_dataset`` returns for ``M`` is used.
    verbose : bool, optional
        ``True`` to display progress; ``False`` otherwise. Defaults to ``True``.

    Returns
    -------
    g : float
        ``scale * (1 - delta)`` of the fitted model (presumably the variance
        attributed to ``K`` -- confirm against the glimix-core model definition).
    v : float
        Variance of the fitted model mean.
    e : float
        ``scale * delta`` of the fitted model, plus ``pi**2 / 3`` when the
        likelihood is ``"bernoulli"``.
    """
    from numpy_sugar.linalg import economic_qs
    from numpy import pi, var
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line

    lik = normalize_likelihood(lik)
    lik_name = lik[0]

    with session_block("Heritability analysis", disable=not verbose):
        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y, M=M, K=K)

        y = data["y"]
        M = data["M"]
        K = data["K"]

        assert_finite(y, M, K)

        # Without a covariance matrix there is nothing to decompose.
        QS = economic_qs(K) if K is not None else None

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        g = method.scale * (1 - method.delta)
        e = method.scale * method.delta
        if lik_name == "bernoulli":
            # pi**2 / 3 is the variance of the standard logistic distribution.
            e += pi * pi / 3

        v = var(method.mean())
        return g, v, e
Exemple #2
0
def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    """
    Estimate variance components.

    The inputs go through ``conform_dataset`` and ``assert_finite``. A restricted
    ``LMM`` is fitted for the ``"normal"`` likelihood and a ``GLMMExpFam`` for
    every other likelihood.

    Returns the tuple ``(v_g, v_v, v_e)`` where ``v_g = scale * (1 - delta)``,
    ``v_v`` is the variance of the fitted mean, and ``v_e = scale * delta``
    (increased by ``pi**2 / 3`` for the ``"bernoulli"`` likelihood).
    """
    lik = normalize_likelihood(lik)
    family = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):
        with session_line("Normalising input...", disable=quiet):
            aligned = conform_dataset(y_phe, M=marker_mat, K=kin)

        outcome, covariates, covariance = aligned["y"], aligned["M"], aligned["K"]
        assert_finite(outcome, covariates, covariance)

        # No covariance matrix means no decomposition to hand to the model.
        decomposition = None if covariance is None else economic_qs(covariance)

        if family == "normal":
            model = LMM(
                outcome.values, covariates.values, decomposition, restricted=True
            )
            model.fit(verbose=verbose)
        else:
            model = GLMMExpFam(
                outcome, lik, covariates.values, decomposition, n_int=500
            )
            model.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        v_g = model.scale * (1 - model.delta)
        v_e = model.scale * model.delta
        if family == "bernoulli":
            # pi**2 / 3 is the variance of the standard logistic distribution.
            v_e += pi * pi / 3

        v_v = var(model.mean())
        return v_g, v_v, v_e
Exemple #3
0
def st_scan(G, y, lik, K=None, M=None, verbose=True):
    r""" Single-variant association testing via generalised linear mixed models.

    It supports Normal (linear mixed model), Bernoulli, Probit, Binomial, and Poisson
    residual errors, defined by ``lik``.
    The columns of ``G`` define the candidates to be tested for association
    with the phenotype ``y``.
    The covariance matrix is set by ``K``.
    If not provided, or set to ``None``, the generalised linear model
    without random effects is assumed.
    The covariates can be set via the parameter ``M``.
    We recommend to always provide a column of ones when covariates are actually
    provided.

    Parameters
    ----------
    G : array_like
        :math:`N` individuals by :math:`S` candidate markers.
    y : array_like
        An outcome array of :math:`N` individuals.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. A list is accepted in place
        of a tuple. The likelihood name is matched case-insensitively.
    K : array_like, optional
        :math:`N`-by-:math:`N` covariance matrix (e.g., kinship coefficients).
        Set to ``None`` for a generalised linear model without random effects.
        Defaults to ``None``.
    M : array_like, optional
        `N` individuals by `S` covariates.
        It will create a :math:`N`-by-:math:`1` matrix ``M`` of ones representing the
        offset covariate if ``None`` is passed. If an array is passed, it will be used
        as is. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    :class:`limix.qtl.QTLModel`
        QTL representation.

    Raises
    ------
    ValueError
        If the outcome, the covariates, or the covariance matrix contain non-finite
        values.

    Examples
    --------
    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import st_scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> model = st_scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> model.variant_pvalues.to_dataframe()  # doctest: +FLOAT_CMP
                         pv
        candidate
        rs0        0.554444
        rs1        0.218996
        rs2        0.552200
        >>> model.variant_effsizes.to_dataframe()  # doctest: +FLOAT_CMP
                   effsizes
        candidate
        rs0       -0.130867
        rs1       -0.315078
        rs2       -0.143869
        >>> model.variant_effsizes_se.to_dataframe()  # doctest: +FLOAT_CMP
                   effsizes std
        candidate
        rs0            0.221390
        rs1            0.256327
        rs2            0.242013
        >>> model  # doctest: +FLOAT_CMP
        Variants
        --------
               effsizes  effsizes_se   pvalues
        count         3            3         3
        mean  -0.196604     0.239910  0.441880
        std    0.102807     0.017563  0.193027
        min   -0.315077     0.221389  0.218996
        25%   -0.229473     0.231701  0.385598
        50%   -0.143869     0.242013  0.552200
        75%   -0.137367     0.249170  0.553322
        max   -0.130866     0.256326  0.554443
        <BLANKLINE>
        Covariate effect sizes for H0
        -----------------------------
              age    offset
        -0.005568  0.395287

    >>> from numpy import zeros
    >>>
    >>> nsamples = 50
    >>>
    >>> X = random.randn(nsamples, 2)
    >>> G = random.randn(nsamples, 100)
    >>> K = dot(G, G.T)
    >>> ntrials = random.randint(1, 100, nsamples)
    >>> z = dot(G, random.randn(100)) / sqrt(100)
    >>>
    >>> successes = zeros(len(ntrials), int)
    >>> for i, nt in enumerate(ntrials):
    ...     for _ in range(nt):
    ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
    >>>
    >>> result = st_scan(X, successes, ("binomial", ntrials), K, verbose=False)
    >>> print(result)  # doctest: +FLOAT_CMP
    Variants
    --------
           effsizes  effsizes_se   pvalues
    count         2            2         2
    mean   0.227116     0.509575  0.478677
    std    0.567975     0.031268  0.341791
    min   -0.174503     0.487466  0.236994
    25%    0.026307     0.498520  0.357835
    50%    0.227116     0.509575  0.478677
    75%    0.427925     0.520630  0.599518
    max    0.628735     0.531685  0.720359
    <BLANKLINE>
    Covariate effect sizes for H0
    -----------------------------
       offset
     0.409570


    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar import is_all_finite
    from numpy_sugar.linalg import economic_qs

    if not isinstance(lik, (tuple, list)):
        lik = (lik,)

    lik_name = lik[0].lower()
    # ``lik`` may be a list here; tuple + list raises TypeError, so the remaining
    # elements are converted before rebuilding the likelihood tuple.
    lik = (lik_name,) + tuple(lik[1:])
    check_likelihood_name(lik_name)

    with session_block("qtl analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        if not is_all_finite(y):
            raise ValueError("Outcome must have finite values only.")

        if not is_all_finite(M):
            raise ValueError("Covariates must have finite values only.")

        if K is not None:
            if not is_all_finite(K):
                raise ValueError("Covariance matrix must have finite values only.")
            QS = economic_qs(K)
        else:
            # No covariance matrix: the models are fitted without random effects.
            QS = None

        y = normalise_extreme_values(y, lik)

        if lik_name == "normal":
            model = _perform_lmm(y.values, M, QS, G, verbose)
        else:
            model = _perform_glmm(y.values, lik, M, K, QS, G, verbose)

        if verbose:
            print(model)

        return model
Exemple #4
0
def mt_scan(G, Y, M=None, K=None, Ac=None, Asnps=None, Asnps0=None, verbose=True):
    """
    Wrapper function for multi-trait single-variant association testing
    using variants of the multi-trait linear mixed model.

    A ``GP2KronSum`` model with free-form trait covariances is optimised first;
    its covariance is then reused by ``MTLMM`` to test the candidates in ``G``.

    Parameters
    ----------
    G : array_like
        Candidate variants, handed to ``MTLMM.process`` after being conformed
        (presumably `N` samples by `S` variants -- confirm with callers).
    Y : (`N`, `P`) ndarray
        phenotype data
    M : array_like, optional
        Covariates, passed to the models as the fixed-effect design ``F`` after
        being conformed. Defaults to ``None``.
    K : (`N`, `N`) ndarray
        LMM-covariance/genetic relatedness matrix. It is required: a
        ``ValueError`` is raised when no covariance matrix is available.
    Ac : (`P`, `L`) ndarray
        trait design matrices of the different fixed effect terms.
        By default, ``Ac`` is eye(`P`).
    Asnps : (`P`, `K`) ndarray
         trait design of snp covariance. ``None`` is forwarded to ``MTLMM``
         unchanged (originally documented as equivalent to eye(`P`) -- TODO
         confirm against ``MTLMM``). It must be provided when ``Asnps0`` is set,
         since its number of columns defines the degrees of freedom of the test.
    Asnps0 : (`P`, `K`) ndarray
         trait design of snp covariance in the null model.
         By default, Asnps0 is not considered (i.e., no SNP effect in the null model).
         If specified, then three tests are considered:
         (i) Asnps!=0, (ii) Asnps0!=0, (iii) Asnps!=Asnps0
    verbose : (bool, optional):
        if True, details such as runtime are displayed.

    Returns
    -------
    pandas.DataFrame
        Columns ``pv`` and ``lrt`` when ``Asnps0`` is ``None``; otherwise
        ``pv1``, ``pv0``, ``pv``, ``lrt1``, ``lrt0``, and ``lrt``.

    Raises
    ------
    ValueError
        If no covariance matrix is available after conforming the inputs.
    """
    # Imported locally so that the block does not depend on a module-level import.
    from collections import OrderedDict
    from pandas import DataFrame
    from scipy.stats import chi2
    from numpy import eye, cov, asarray
    from scipy.linalg import eigh
    from limix_core.gp import GP2KronSum
    from limix_core.covar import FreeFormCov
    from limix_lmm.mtlmm import MTLMM

    if Ac is None:
        Ac = eye(Y.shape[1])

    # NOTE(review): the label says "single-trait" although this is the
    # multi-trait scan; left untouched because it is user-facing output.
    with session_block("single-trait association test", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):

            data = conform_dataset(Y, M, G=G, K=K)

            # case 1: multi-trait linear model
            # The check has to happen before ``asarray``: ``asarray(None)`` is an
            # object array, never ``None``, so testing the converted value would
            # never trigger.
            if data["K"] is None:
                raise ValueError("multi-trait linear model not supported")

            Y = asarray(data["y"])
            M = asarray(data["M"])
            G = asarray(data["G"])
            K = asarray(data["K"])

            # case 2: full-rank multi-trait linear model
            S_R, U_R = eigh(K)
            # The jittered eigenvalues are the ones handed to the GP; previously
            # they were computed and then discarded in favour of the raw ones.
            S_R = add_jitter(S_R)
            ntraits = Y.shape[1]
            gp = GP2KronSum(
                Y=Y,
                Cg=FreeFormCov(ntraits),
                Cn=FreeFormCov(ntraits),
                S_R=S_R,
                U_R=U_R,
                F=M,
                A=Ac,
            )
            # NOTE(review): the GP is built with ``Cg``/``Cn`` but the first
            # initialisation targets ``covar.Cr`` -- confirm the attribute name.
            gp.covar.Cr.setCovariance(0.5 * cov(Y.T))
            gp.covar.Cn.setCovariance(0.5 * cov(Y.T))
            gp.optimize(verbose=verbose)

            lmm = MTLMM(Y, F=M, A=Ac, Asnp=Asnps, covar=gp.covar)
            if Asnps0 is not None:
                lmm0 = MTLMM(Y, F=M, A=Ac, Asnp=Asnps0, covar=gp.covar)

            RV = OrderedDict()

            if Asnps0 is None:

                lmm.process(G)
                RV["pv"] = lmm.getPv()
                RV["lrt"] = lmm.getLRT()

            else:

                lmm.process(G)
                lmm0.process(G)

                # compute pv: the difference of the two likelihood-ratio statistics
                # is compared against a chi-square whose degrees of freedom are
                # the difference in the number of trait-design columns.
                lrt1 = lmm.getLRT()
                lrt0 = lmm0.getLRT()
                lrt = lrt1 - lrt0
                pv = chi2(Asnps.shape[1] - Asnps0.shape[1]).sf(lrt)

                RV["pv1"] = lmm.getPv()
                RV["pv0"] = lmm0.getPv()
                RV["pv"] = pv
                RV["lrt1"] = lrt1
                RV["lrt0"] = lrt0
                RV["lrt"] = lrt

        return DataFrame(RV)
Exemple #5
0
def scan(G,
         Y,
         lik="normal",
         K=None,
         M=None,
         idx=None,
         A=None,
         A0=None,
         A1=None,
         verbose=True):
    """
    Multi-trait association and interaction testing via linear mixed models.

    Let n, c, and p be the number of samples, covariates, and traits, respectively.
    The outcome variable Y is a n×p matrix distributed according to ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨), K₀ = C₀ ⊗ K + C₁ ⊗ I) under H₀.

    A and M are design matrices of dimensions p×p and n×c provided by the user,
    where M is the usual matrix of covariates commonly used in single-trait models.
    𝚨 is a c×p matrix of fixed-effect sizes per trait.
    C₀ and C₁ are both symmetric matrices of dimensions p×p, for which C₁ is
    guaranteed by our implementation to be of full rank.
    The parameters of the H₀ model are the matrices 𝚨, C₀, and C₁.

    The additional models H₁ and H₂ are defined as ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀), s⋅K₀)

    and ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀) + (A₁ ⊗ Gᵢ) vec(𝚩₁), s⋅K₀)

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative one:

    - H₀ vs H₁: testing for vec(𝚩₀) ≠ 𝟎 while vec(𝚩₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝚩₀) vec(𝚩₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝚩₁) ≠ 𝟎

    It supports generalized linear mixed models (GLMM) when a single trait is used.
    In this case, the following likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where 𝜇 is
    its mean. The H₀ model is defined as follows::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) for 𝐳 ∼ 𝓝(..., ...).

    g(⋅) is the corresponding canonical link function for the Bernoulli, Binomial, and
    Poisson likelihoods. The Probit likelihood, on the other hand, is a Bernoulli
    likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    Y : n×p array_like
        Rows are samples and columns are phenotypes.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be used in the
        tests.
    A : p×p array_like
        Symmetric trait-by-trait design matrix. When ``None``, the single-trait
        scan is performed; otherwise the multi-trait scan is performed.
    A0 : p×p₀ array_like, optional
        Matrix A₀, possibly a non-symmetric one. If ``None``, it defines an empty
        matrix, p₀=0. Defaults to ``None``.
    A1 : p×p₁ array_like, optional
        Matrix A₁, possibly a non-symmetric one. If ``None``, it defines an identity
        matrix, p₁=p. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.STScanResult`, :class:`limix.qtl._result.MTScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated statistics.

    Raises
    ------
    ValueError
        If ``A0`` or ``A1`` is given while ``A`` is ``None``.

    Examples
    --------
    .. doctest::

        >>> from limix.qtl import scan
        >>> from numpy import reshape, kron, eye
        >>> from numpy import concatenate
        >>> from numpy.random import RandomState
        >>> import scipy.stats as st
        >>> from limix.qc import normalise_covariance
        >>>
        >>> def vec(x):
        ...     return reshape(x, (-1,) + x.shape[2:], order="F")
        >>>
        >>> def unvec(x, shape):
        ...     return reshape(x, shape, order="F")
        >>>
        >>> random = RandomState(0)
        >>> n = 30
        >>> ntraits = 2
        >>> ncovariates = 3
        >>>
        >>> A = random.randn(ntraits, ntraits)
        >>> A = A @ A.T
        >>> M = random.randn(n, ncovariates)
        >>>
        >>> C0 = random.randn(ntraits, ntraits)
        >>> C0 = C0 @ C0.T
        >>>
        >>> C1 = random.randn(ntraits, ntraits)
        >>> C1 = C1 @ C1.T
        >>>
        >>> G = random.randn(n, 4)
        >>>
        >>> A0 = random.randn(ntraits, 1)
        >>> A1 = random.randn(ntraits, 2)
        >>> A01 = concatenate((A0, A1), axis=1)
        >>>
        >>> K = random.randn(n, n + 1)
        >>> K = normalise_covariance(K @ K.T)
        >>>
        >>> beta = vec(random.randn(ntraits, ncovariates))
        >>> alpha = vec(random.randn(A01.shape[1], G.shape[1]))
        >>>
        >>> mvn = st.multivariate_normal
        >>> m = kron(A, M) @ beta + kron(A01, G) @ alpha
        >>> Y = unvec(mvn(m, kron(C0, K) + kron(C1, eye(n))).rvs(), (n, -1))
        >>>
        >>> idx = [[0, 1], 2, [3]]
        >>> r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)

    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> result = scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> result.stats  # doctest: +FLOAT_CMP +SKIP
               null lml    alt lml    pvalue  dof
        test
        0    -48.736563 -48.561855  0.554443    1
        1    -48.736563 -47.981093  0.218996    1
        2    -48.736563 -48.559868  0.552200    1
        >>> result.alt_effsizes  # doctest: +FLOAT_CMP +SKIP
           test candidate   effsize  effsize se
        0     0       rs0 -0.130867    0.221390
        1     1       rs1 -0.315079    0.256327
        2     2       rs2 -0.143869    0.242014
        >>> print(result)  # doctest: +FLOAT_CMP +SKIP
        Null model
        ----------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶, 0.79*K + 0.00*I)
          yᵢ ~ Poisson(λᵢ=g(zᵢ)), where g(x)=eˣ
          M = ['offset' 'age']
          𝜶 = [ 0.39528617 -0.00556789]
          Log marg. lik.: -48.736563230140376
          Number of models: 1
        <BLANKLINE>
        Alt model
        ---------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶 + Gᵢ, 0.79*K + 0.00*I)
          yᵢ ~ Poisson(λᵢ=g(zᵢ)), where g(x)=eˣ
          Min. p-value: 0.21899561824721903
          First perc. p-value: 0.22565970374303942
          Max. log marg. lik.: -47.981092939974765
          99th perc. log marg. lik.: -47.9926684371547
          Number of models: 3

        >>> from numpy import zeros
        >>>
        >>> nsamples = 50
        >>>
        >>> X = random.randn(nsamples, 2)
        >>> G = random.randn(nsamples, 100)
        >>> K = dot(G, G.T)
        >>> ntrials = random.randint(1, 100, nsamples)
        >>> z = dot(G, random.randn(100)) / sqrt(100)
        >>>
        >>> successes = zeros(len(ntrials), int)
        >>> for i, nt in enumerate(ntrials):
        ...     for _ in range(nt):
        ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
        >>>
        >>> result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
        >>> print(result)  # doctest: +FLOAT_CMP +SKIP
        Null model
        ----------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶, 1.74*K + 0.15*I)
          yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ), where g(x)=1/(1+e⁻ˣ)
          M = ['offset']
          𝜶 = [0.40956947]
          Log marg. lik.: -142.9436437096321
          Number of models: 1
        <BLANKLINE>
        Alt model
        ---------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶 + Gᵢ, 1.74*K + 0.15*I)
          yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ), where g(x)=1/(1+e⁻ˣ)
          Min. p-value: 0.23699422686919802
          First perc. p-value: 0.241827874774993
          Max. log marg. lik.: -142.24445140459548
          99th perc. log marg. lik.: -142.25080258276773
          Number of models: 2

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar.linalg import economic_qs

    lik = normalize_likelihood(lik)

    # A0 and A1 only make sense relative to a trait design matrix A.
    if A is None and not (A0 is None and A1 is None):
        raise ValueError(
            "You cannot define `A0` or `A1` without defining `A`.")

    quiet = not verbose

    with session_block("QTL analysis", disable=quiet):

        with session_line("Normalising input... ", disable=quiet):
            aligned = conform_dataset(Y, M, G=G, K=K)

        Y, M, G, K = aligned["y"], aligned["M"], aligned["G"], aligned["K"]

        assert_finite(Y, M, K)

        # Without a covariance matrix there is nothing to decompose.
        QS = None if K is None else economic_qs(K)

        # The presence of a trait design matrix selects the multi-trait scan.
        if A is None:
            builder = _single_trait_scan(idx, lik, Y, M, G, QS, verbose)
        else:
            builder = _multi_trait_scan(idx, lik, Y, M, G, QS, A, A0, A1, verbose)

        result = builder.create()
        if verbose:
            print(result)

        return result
Exemple #6
0
def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose,
         dry_run, **_):
    """ Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Examples
    --------

    ... doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \
            --output-dir=results \
            --filter="phenotype: col == 'height'" \
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    # The docstring above is left untouched: the part before the form feed is
    # presumably rendered as the command-line help text -- confirm before editing.
    import sys
    from os import makedirs
    from os.path import abspath, exists, join
    import traceback
    from limix._display import session_block, banner, session_line, indent, print_exc
    from limix.qtl import scan as qtl_scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    # A dry run must not touch the file system.
    output_dir = abspath(output_dir)
    if not (dry_run or exists(output_dir)):
        makedirs(output_dir, exist_ok=True)

    # (dataset key, kind passed to ``fetch``, specification, always fetched?)
    inputs = (
        ("y", "trait", trait, True),
        ("G", "genotype", genotype, True),
        ("M", "covariate", covariate, False),
        ("K", "kinship", kinship, False),
    )

    data = {"y": None, "G": None, "K": None}
    for key, kind, spec, mandatory in inputs:
        if not mandatory and spec is None:
            continue
        loaded = fetch(kind, spec, verbose)
        data[key] = loaded
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(loaded))))

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    # Entries that are still ``None`` after matching are dropped.
    data = {name: value for name, value in data.items() if value is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids.")

    oparams = _ordered_params(ctx)

    # Maps a preprocessing parameter name to the callable and the label under
    # which it is registered in the pipeline.
    steps = {
        "where": (where_func, "where"),
        "normalize": (normalize_func, "normalize"),
        "impute": (impute_func, "impute"),
        "drop_maf": (drop_maf, "drop-maf"),
        "drop_missing": (drop_missing, "drop-missing"),
    }

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)

        # Steps are appended in the order in which the parameters were given;
        # parameters unrelated to preprocessing are ignored.
        for param in oparams:
            if param[0] in steps:
                func, label = steps[param[0]]
                pipeline.append(func, label, param[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    if "K" not in data:
        data["K"] = None

    try:
        res = qtl_scan(
            data["G"],
            data["y"],
            lik=lik,
            K=data["K"],
            M=data["M"],
            verbose=verbose,
        )
    except Exception as e:
        # NOTE(review): ``format_stack`` describes the current call stack, not the
        # traceback of ``e`` -- confirm this is what ``print_exc`` is meant to show.
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        null_path = join(output_dir, "null.csv")
        alt_path = join(output_dir, "alt.csv")
        res.to_csv(null_path, alt_path)
Exemple #7
0
def scan(G,
         Y,
         lik="normal",
         K=None,
         M=None,
         idx=None,
         A=None,
         A0=None,
         A1=None,
         verbose=True):
    """
    Multi-trait association and interaction testing via linear mixed models.

    Let n, c, and p be the number of samples, covariates, and traits, respectively.
    The outcome variable Y is a n×p matrix distributed according to ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨), K₀ = C₀ ⊗ K + C₁ ⊗ I) under H₀.

    A and M are design matrices of dimensions p×p and n×c provided by the user,
    where M is the usual matrix of covariates commonly used in single-trait models.
    𝚨 is a c×p matrix of fixed-effect sizes per trait.
    C₀ and C₁ are both symmetric matrices of dimensions p×p, for which C₁ is
    guaranteed by our implementation to be of full rank.
    The parameters of the H₀ model are the matrices 𝚨, C₀, and C₁.

    The additional models H₁ and H₂ are defined as ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀), s⋅K₀)

    and ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀) + (A₁ ⊗ Gᵢ) vec(𝚩₁), s⋅K₀)

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative one:

    - H₀ vs H₁: testing for vec(𝚩₀) ≠ 𝟎 while vec(𝚩₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝚩₀) vec(𝚩₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝚩₁) ≠ 𝟎

    It supports generalized linear mixed models (GLMM) when a single trait is used.
    In this case, the following likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where 𝜇 is
    its mean. The H₀ model is defined as follows::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) for 𝐳 ∼ 𝓝(..., ...).

    g(⋅) is the corresponding canonical link function for the Bernoulli, Binomial, and
    Poisson likelihoods. The Probit likelihood, on the other hand, is a Bernoulli
    likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    Y : n×p array_like
        Rows are samples and columns are phenotypes.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be used in the
        tests.
    A : p×p array_like
        Symmetric trait-by-trait design matrix. It must be provided whenever ``A0``
        or ``A1`` is provided.
    A0 : p×p₀ array_like, optional
        Matrix A₀, possibly a non-symmetric one. If ``None``, it defines an empty
        matrix, p₀=0. Defaults to ``None``.
    A1 : p×p₁ array_like, optional
        Matrix A₁, possibly a non-symmetric one. If ``None``, it defines an identity
        matrix, p₁=p. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.STScanResult`, :class:`limix.qtl._result.MTScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated statistics.

    Raises
    ------
    ValueError
        If ``A0`` or ``A1`` is given while ``A`` is ``None``.

    Examples
    --------
    .. doctest::

        >>> from limix.qtl import scan
        >>> from numpy import reshape, kron, eye
        >>> from numpy import concatenate
        >>> from numpy.random import RandomState
        >>> import scipy.stats as st
        >>> from limix.qc import normalise_covariance
        >>>
        >>> def vec(x):
        ...     return reshape(x, (-1,) + x.shape[2:], order="F")
        >>>
        >>> def unvec(x, shape):
        ...     return reshape(x, shape, order="F")
        >>>
        >>> random = RandomState(0)
        >>> n = 30
        >>> ntraits = 2
        >>> ncovariates = 3
        >>>
        >>> A = random.randn(ntraits, ntraits)
        >>> A = A @ A.T
        >>> M = random.randn(n, ncovariates)
        >>>
        >>> C0 = random.randn(ntraits, ntraits)
        >>> C0 = C0 @ C0.T
        >>>
        >>> C1 = random.randn(ntraits, ntraits)
        >>> C1 = C1 @ C1.T
        >>>
        >>> G = random.randn(n, 4)
        >>>
        >>> A0 = random.randn(ntraits, 1)
        >>> A1 = random.randn(ntraits, 2)
        >>> A01 = concatenate((A0, A1), axis=1)
        >>>
        >>> K = random.randn(n, n + 1)
        >>> K = normalise_covariance(K @ K.T)
        >>>
        >>> beta = vec(random.randn(ntraits, ncovariates))
        >>> alpha = vec(random.randn(A01.shape[1], G.shape[1]))
        >>>
        >>> mvn = st.multivariate_normal
        >>> m = kron(A, M) @ beta + kron(A01, G) @ alpha
        >>> Y = unvec(mvn(m, kron(C0, K) + kron(C1, eye(n))).rvs(), (n, -1))
        >>>
        >>> idx = [[0, 1], 2, [3]]
        >>> r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)

    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> result = scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> result.stats  # doctest: +FLOAT_CMP
                   lml0       lml2  dof20    scale2      pv20
        test
        0    -48.720890 -48.536860      1  0.943532  0.544063
        1    -48.720890 -47.908341      1  0.904814  0.202382
        2    -48.720890 -48.534754      1  0.943400  0.541768
        >>> print(result)  # doctest: +FLOAT_CMP
        Hypothesis 0
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶, 0.000⋅𝙺 + 0.788⋅𝙸) for yᵢ ~ Poisson(λᵢ=g(zᵢ)) and g(x)=eˣ
        <BLANKLINE>
        M     = ['offset' 'age']
        𝜶     = [ 0.39528889 -0.00556797]
        se(𝜶) = [0.50173695 0.01505240]
        lml   = -48.720890273519444
        <BLANKLINE>
        Hypothesis 2
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶 + G𝛃, s(0.000⋅𝙺 + 0.788⋅𝙸)) for yᵢ ~ Poisson(λᵢ=g(zᵢ)) and g(x)=eˣ
        <BLANKLINE>
                  lml       cov. effsizes   cand. effsizes
        --------------------------------------------------
        mean   -4.833e+01       2.393e-01       -1.966e-01
        std     3.623e-01       2.713e-01        1.028e-01
        min    -4.854e+01      -8.490e-03       -3.151e-01
        25%    -4.854e+01      -7.684e-03       -2.295e-01
        50%    -4.853e+01       2.243e-01       -1.439e-01
        75%    -4.822e+01       4.725e-01       -1.374e-01
        max    -4.791e+01       5.255e-01       -1.309e-01
        <BLANKLINE>
        Likelihood-ratio test p-values
        ------------------------------
        <BLANKLINE>
               𝓗₀ vs 𝓗₂
        ----------------
        mean   4.294e-01
        std    1.966e-01
        min    2.024e-01
        25%    3.721e-01
        50%    5.418e-01
        75%    5.429e-01
        max    5.441e-01
        >>> from numpy import zeros
        >>>
        >>> nsamples = 50
        >>>
        >>> X = random.randn(nsamples, 2)
        >>> G = random.randn(nsamples, 100)
        >>> K = dot(G, G.T)
        >>> ntrials = random.randint(1, 100, nsamples)
        >>> z = dot(G, random.randn(100)) / sqrt(100)
        >>>
        >>> successes = zeros(len(ntrials), int)
        >>> for i, nt in enumerate(ntrials):
        ...     for _ in range(nt):
        ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
        >>>
        >>> result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
        >>> print(result)  # doctest: +FLOAT_CMP
        Hypothesis 0
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶, 0.152⋅𝙺 + 1.738⋅𝙸) for yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ) and g(x)=1/(1+e⁻ˣ)
        <BLANKLINE>
        M     = ['offset']
        𝜶     = [0.40956942]
        se(𝜶) = [0.55141166]
        lml   = -142.80784719977515
        <BLANKLINE>
        Hypothesis 2
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶 + G𝛃, s(0.152⋅𝙺 + 1.738⋅𝙸)) for yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ) and g(x)=1/(1+e⁻ˣ)
        <BLANKLINE>
                  lml       cov. effsizes   cand. effsizes
        --------------------------------------------------
        mean   -1.425e+02       3.701e-01        2.271e-01
        std     4.110e-01       2.296e-02        5.680e-01
        min    -1.427e+02       3.539e-01       -1.745e-01
        25%    -1.426e+02       3.620e-01        2.631e-02
        50%    -1.425e+02       3.701e-01        2.271e-01
        75%    -1.423e+02       3.782e-01        4.279e-01
        max    -1.422e+02       3.864e-01        6.287e-01
        <BLANKLINE>
        Likelihood-ratio test p-values
        ------------------------------
        <BLANKLINE>
               𝓗₀ vs 𝓗₂
        ----------------
        mean   4.959e-01
        std    3.362e-01
        min    2.582e-01
        25%    3.771e-01
        50%    4.959e-01
        75%    6.148e-01
        max    7.336e-01

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar.linalg import economic_qs

    lik = normalize_likelihood(lik)

    # A0 and A1 only make sense in the multi-trait model, which is selected by A.
    if A is None and (A0 is not None or A1 is not None):
        raise ValueError("You cannot define `A0` or `A1` without defining `A`.")

    with session_block("QTL analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            conformed = conform_dataset(Y, M, G=G, K=K)

        # From here on, work only with the normalised versions of the inputs.
        Y, M, G, K = conformed["y"], conformed["M"], conformed["G"], conformed["K"]

        assert_finite(Y, M, K)

        # The covariance decomposition is only needed when K has been provided.
        QS = None if K is None else economic_qs(K)

        if verbose:
            print()
            _print_input_info(idx, lik, Y, M, G, K)
            print()

        # Providing A selects the multi-trait scan; otherwise single-trait.
        if A is None:
            factory = _single_trait_scan(idx, lik, Y, M, G, QS, verbose)
        else:
            factory = _multi_trait_scan(idx, lik, Y, M, G, QS, A, A0, A1, verbose)

        result = factory.create()
        if verbose:
            print()
            print(result)

        return result
Exemple #8
0
def st_iscan(G, y, K=None, M=None, E0=None, E1=None, W_R=None, verbose=True):
    r"""Single-variant association and interaction testing.

    The inputs are first matched by ``conform_dataset``. A background model is then
    selected from ``K`` and ``W_R`` (linear model, low-rank LMM, or full-rank LMM),
    and the resulting test objects are handed to ``_process``.

    Parameters
    ----------
    G : array_like
        Candidate variants. Converted with ``numpy.asarray`` before being passed to
        ``_process``.
    y : array_like
        Phenotype data.
    K : array_like, optional
        LMM-covariance/genetic relatedness matrix.
        If ``None``, a standard linear regression is considered and ``W_R`` is
        ignored. Otherwise, unless ``W_R`` is provided, its eigenvalue decomposition
        is computed with :func:`scipy.linalg.eigh` and a full-rank LMM is fitted.
    M : array_like, optional
        Covariate design matrix.
    E0 : array_like, optional
        Interaction variables to be included in both the null and the alternative
        models. It is ignored (reset to ``None``) when ``E1`` is ``None``. When
        ``E1`` is given and ``E0`` is ``None``, it defaults to a single column of
        ones, i.e. an additive genetic effect.
    E1 : array_like, optional
        Interaction variables interacting with the candidate. If ``None``, a plain
        association test is set up. Otherwise ``E0`` is prepended to it column-wise
        before calling ``_process``.
    W_R : array_like, optional
        If the LMM-covariance is low-rank, one can provide ``W_R`` such that
        ``K`` = dot(``W_R``, transpose(``W_R``)). Only used when ``K`` is not
        ``None``.
    verbose : bool, optional
        If ``True``, progress information and the selected model are displayed.

    Returns
    -------
    The value returned by ``_process(lmm, lmm0, asarray(G), E0, E1)``.
    """
    from limix_lmm.lmm import LMM
    from limix_lmm.lmm_core import LMMCore
    from limix_core.gp import GP2KronSum, GP2KronSumLR
    from limix_core.covar import FreeFormCov
    from scipy.linalg import eigh
    from numpy import ones, var, concatenate, asarray

    # Only set when an interaction test (E1 given) is requested.
    lmm0 = None

    with session_block("single-trait association test", disable=not verbose):

        # NOTE(review): no default covariate matrix is built here; presumably
        # `conform_dataset` takes care of a missing M -- confirm.

        # Everything below, including model fitting, runs inside this session line.
        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

            y = data["y"]
            M = data["M"]
            G = data["G"]
            K = data["K"]

            # case 1: linear model
            # No covariance given: W_R is not consulted in this case.
            if K is None:
                if verbose:
                    print("Model: lm")
                gp = None
                Kiy_fun = None

            # case 2: low-rank linear model
            elif W_R is not None:
                if verbose:
                    print("Model: low-rank lmm")
                gp = GP2KronSumLR(Y=y,
                                  Cn=FreeFormCov(1),
                                  G=W_R,
                                  F=M,
                                  A=ones((1, 1)))
                # Initialise both variance components at var(y) before optimising.
                gp.covar.Cr.setCovariance(var(y) * ones((1, 1)))
                gp.covar.Cn.setCovariance(var(y) * ones((1, 1)))
                gp.optimize(verbose=verbose)
                Kiy_fun = gp.covar.solve

            # case 3: full-rank linear model
            else:
                if verbose:
                    print("Model: lmm")
                # Eigendecomposition of K: eigenvalues S_R and eigenvectors U_R.
                eigh_R = eigh(K)
                S_R, U_R = eigh_R
                add_jitter(S_R)
                gp = GP2KronSum(
                    Y=y,
                    Cg=FreeFormCov(1),
                    Cn=FreeFormCov(1),
                    S_R=S_R,
                    U_R=U_R,
                    F=M,
                    A=ones((1, 1)),
                )
                # Initialise each variance component at half of var(y).
                gp.covar.Cr.setCovariance(0.5 * var(y) * ones((1, 1)))
                gp.covar.Cn.setCovariance(0.5 * var(y) * ones((1, 1)))
                gp.optimize(verbose=verbose)
                Kiy_fun = gp.covar.solve

            if E1 is None:
                # Plain association test: any user-provided E0 is discarded.
                lmm = LMM(y, M, Kiy_fun)
                E1 = None
                E0 = None
            else:
                lmm = LMMCore(y, M, Kiy_fun)
                if E0 is None:
                    E0 = ones([y.shape[0], 1])
                # NOTE(review): this is true as soon as ANY entry of E0 equals 1,
                # not only when E0 is all ones; presumably meant to detect the
                # additive (all-ones) case -- confirm against `_process`.
                if (E0 == 1).sum():
                    lmm0 = LMM(y, M, Kiy_fun)
                else:
                    lmm0 = LMMCore(y, M, Kiy_fun)
                # The alternative model interacts the candidate with [E0, E1].
                E1 = concatenate([E0, E1], 1)

    return _process(lmm, lmm0, asarray(G), E0, E1)
Exemple #9
0
def iscan(G, y, lik="normal", K=None, M=None, idx=None, E0=None, E1=None, verbose=True):
    r"""
    Single-trait association with interaction test via generalized linear mixed models.

    The general formula for normally distributed traits is

    .. math::

        𝐲 = 𝙼𝛂 + (𝙶⊙𝙴₀)𝛃₀ + (𝙶⊙𝙴₁)𝛃₁ + 𝐮 + 𝛆,\\
        \text{where}~~ 𝐮∼𝓝(𝟎, 𝓋₀𝙺) ~~\text{and}~~ 𝛆∼𝓝(𝟎, 𝓋₁𝙸).

    The operator ⊙ works as follows:

    .. math::

        𝙰⊙𝙱 = [𝙰₀𝙱₀ ~~...~~ 𝙰₀𝙱ₙ ~~ 𝙰₁𝙱₀ ~~...~~ 𝙰₁𝙱ₙ ~~...~~ 𝙰ₘ𝙱ₙ]

    The covariates are encoded in matrix 𝙼 while the candidate set is encoded in matrix
    𝙶. The parameters are the effect sizes 𝛂, 𝛃₀, and 𝛃₁, and the variances 𝓋₀ and 𝓋₁.

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative one:

    - H₀ vs H₁: testing for vec(𝛃₀) ≠ 𝟎 while vec(𝛃₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝛃₀) vec(𝛃₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝛃₁) ≠ 𝟎

    It also supports generalized linear mixed models (GLMM). In this case, the following
    likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where 𝜇 is
    its mean. The H₀ model is defined as follows:

    .. math::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) ~~\text{for}~~ 𝐳 ∼ 𝓝(𝙼𝛂 + (𝙶⊙𝙴₀)𝛃₀ + (𝙶⊙𝙴₁)𝛃₁, 𝓋₀𝙺 + 𝓋₁𝙸).

    g(⋅) is the corresponding canonical link function for the Bernoulli, Binomial, and
    Poisson likelihoods. The Probit likelihood, on the other hand, is a Bernoulli
    likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    y : array_like
        Outcome of a single trait; rows are samples.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be used in the
        tests. If ``None``, every column of ``G`` is tested on its own.
    E0 : array_like
        Matrix representing the first environment. If ``None``, it defaults to an
        empty matrix with n rows and no columns.
    E1 : array_like
        Matrix representing the second environment. If ``None``, it defaults to a
        single column of ones.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.IScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated statistics.

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar.linalg import economic_qs
    from xarray import concat
    from numpy import asarray, empty, ones

    lik = normalize_likelihood(lik)
    lik_name = lik[0]

    with session_block("QTL analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):

            data = conform_dataset(y, M, G=G, K=K)

        Y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        # Validate and size everything from the normalised outcome `Y`, not from
        # the raw `y` argument: the raw input may lack `.shape` (e.g. a list) and
        # may not have the sample set that was matched across Y, M, G, and K.
        assert_finite(Y, M, K)
        nsamples = Y.shape[0]

        if E1 is None:
            E1 = ones((nsamples, 1))

        if E0 is None:
            E0 = empty((nsamples, 0))

        E0 = _asarray(E0, "env0", ["sample", "env"])
        E1 = _asarray(E1, "env1", ["sample", "env"])
        # Environments of the alternative hypothesis H₂: columns of E0 then E1.
        E01 = concat([E0, E1], dim="env")

        if K is not None:
            QS = economic_qs(K)
        else:
            QS = None

        if lik_name == "normal":
            scanner, v0, v1 = _lmm(Y.values.ravel(), M.values, QS, verbose)
        else:
            scanner, v0, v1 = _glmm(Y.values.ravel(), lik, M.values, QS, verbose)

        r = IScanResultFactory(
            lik_name,
            Y.trait,
            M.covariate,
            G.candidate,
            E0.env,
            E1.env,
            scanner.null_lml,
            scanner.null_beta,
            scanner.null_beta_se,
            v0,
            v1,
        )

        if idx is None:

            assert E1.shape[1] > 0
            idx = range(G.shape[1])

            # Without E0 columns, H₁ is obtained for all candidates in one pass
            # and sliced per candidate inside the loop below.
            has_env0 = E0.shape[1] > 0
            if not has_env0:
                r1 = scanner.fast_scan(G, verbose)

            for i in idx:
                i = _2d_sel(i)
                g = asarray(G[:, i], float)

                if has_env0:
                    r1 = scanner.scan(g, E0)
                    h1 = _normalise_scan_names(r1)
                else:
                    h1 = _normalise_scan_names({k: v[i] for k, v in r1.items()})
                    h1["covariate_effsizes"] = h1["covariate_effsizes"].ravel()
                    h1["covariate_effsizes_se"] = h1["covariate_effsizes_se"].ravel()

                r2 = scanner.scan(g, E01)
                h2 = _normalise_scan_names(r2)
                r.add_test(i, h1, h2)
        else:
            # User-defined candidate sets: both hypotheses are fitted per set.
            for i in idx:
                i = _2d_sel(i)
                g = asarray(G[:, i], float)

                r1 = scanner.scan(g, E0)
                r2 = scanner.scan(g, E01)

                h1 = _normalise_scan_names(r1)
                h2 = _normalise_scan_names(r2)
                r.add_test(i, h1, h2)

        r = r.create()
        if verbose:
            print(r)

        return r
Exemple #10
0
def st_sscan(G, y, E, M=None, tests=None, verbose=True):
    """Mixed-model with genetic effect heterogeneity.

    Parameters
    ----------
    G : array_like
        Candidate variants; each column is tested on its own.
    y : array_like
        Phenotype data.
    E : array_like
        Environments data. It is passed to ``StructLMM`` both as its second
        positional argument and as ``W``.
    M : array_like, optional
        Covariate design matrix.
    tests : list, optional
        Which tests are performed.
        Element list values are ``'inter'`` and ``'assoc'``.
        By default, only the interaction test is considered.
    verbose : bool, optional
        If ``True``, progress information is displayed.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns ``pvi`` (interaction test) and ``pva``
        (association test). The column of a test that was not requested is left
        filled with zeros.

    Notes
    -----
    The association test uses the fixed list of ``rho`` values
    ``[0, 0.1**2, 0.2**2, 0.3**2, 0.4**2, 0.5**2, 0.5, 1.]``; the interaction test
    uses ``rho=0`` only.
    """
    from struct_lmm import StructLMM
    from numpy import zeros, hstack, asarray
    from pandas import DataFrame

    rhos = [0.0, 0.1**2, 0.2**2, 0.3**2, 0.4**2, 0.5**2, 0.5, 1.0]

    with session_block("struct-lmm analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            conformed = conform_dataset(y, M, G=G, K=None)

        y, M, G = conformed["y"], conformed["M"], conformed["G"]

        if tests is None:
            tests = ["inter"]

        # Decide once which tests run instead of re-checking for every candidate.
        run_inter = "inter" in tests
        run_assoc = "assoc" in tests

        if run_inter:
            slmi = StructLMM(asarray(y, float), E, W=E, rho_list=[0])

        if run_assoc:
            # The association null model does not depend on the candidate,
            # so it is fitted a single time up front.
            slmm = StructLMM(asarray(y, float), E, W=E, rho_list=rhos)
            slmm.fit_null(F=asarray(M, float), verbose=False)

        ncandidates = G.shape[1]
        pv_inter = zeros(ncandidates)
        pv_assoc = zeros(ncandidates)
        for col in range(ncandidates):
            candidate = asarray(G[:, [col]], float)

            if run_inter:
                # The interaction null model includes the candidate as a
                # covariate, so it is refitted for each candidate.
                slmi.fit_null(F=hstack((M, candidate)), verbose=False)
                pv_inter[col] = slmi.score_2_dof(candidate)

            if run_assoc:
                pv_assoc[col] = slmm.score_2_dof(candidate)

    return DataFrame(OrderedDict([("pvi", pv_inter), ("pva", pv_assoc)]))