Example #1
from numpy import ndarray, zeros
from numpy.random import RandomState
from numpy.testing import assert_, assert_allclose

import dask.array as da
import xarray as xr

from limix.qc import normalise_covariance


def test_qc_kinship_dataarray():
    random = RandomState(0)
    X = random.randn(3, 5)
    K = X.dot(X.T)
    K = da.from_array(K, chunks=2)
    K = xr.DataArray(K)

    K1 = zeros((3, 3))

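    # normalise_covariance keeps the DataArray type of its input; passing a
    # preallocated ndarray via out= fills that buffer and returns it.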
    K0 = normalise_covariance(K)
    K2 = normalise_covariance(K, out=K1)

    Kf = [
        [2.5990890007787586, -0.1951278087849671, 0.5472860002747189],
        [-0.1951278087849671, 0.4202620710126438, 0.2642930556468809],
        [0.5472860002747189, 0.2642930556468809, 0.5971001753452302],
    ]

    assert_allclose(K0, Kf)
    assert_(isinstance(K0, xr.DataArray))

    assert_allclose(K0, K1)
    assert_(isinstance(K1, ndarray))
    assert_(isinstance(K2, ndarray))

    assert_allclose(K0, K2)
    assert_(K2 is K1)
Example #2
from numpy import concatenate, eye, kron
from numpy.random import RandomState

from limix.qc import normalise_covariance
from limix.qtl import scan

# vec, unvec and mvn are small helpers from the limix test suite (matrix
# vectorisation/unvectorisation and multivariate-normal sampling); their
# definitions are assumed here.


def test_qtl_scan_three_hypotheses_mt():
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T

    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)

    A0 = random.randn(ntraits, 1)
    A1 = random.randn(ntraits, 2)
    A01 = concatenate((A0, A1), axis=1)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A01.shape[1], G.shape[1]))

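    # Draw the multi-trait phenotypes: mean from covariates and candidates,
    # covariance kron(C0, K) + kron(C1, I).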
    m = kron(A, M) @ beta + kron(A01, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)
    str(r)
Example #3
from numpy import dot, eye, ones
from numpy.random import RandomState
from numpy.testing import assert_allclose

from limix.qc import normalise_covariance

# GLMMComposer is assumed to be importable from limix.glmm.
from limix.glmm import GLMMComposer


def test_glmm_composer():
    random = RandomState(0)
    nsamples = 50

    glmm = GLMMComposer(nsamples)

    glmm.fixed_effects.append_offset()

    X0 = random.randn(nsamples)
    glmm.fixed_effects.append(X0)
    glmm.fixed_effects[0].offset = 1
    glmm.fixed_effects[1].effsizes = [1]

    assert_allclose(glmm.fixed_effects.mean.value() - X0, ones(nsamples))

    X12 = random.randn(nsamples, 2)
    glmm.fixed_effects.append(X12)

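    # Random-effect covariances: two genetic components plus an iid noise term.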
    G0 = random.randn(nsamples, 100)
    K0 = normalise_covariance(dot(G0, G0.T))
    glmm.covariance_matrices.append(K0)

    G1 = random.randn(nsamples, 100)
    K1 = normalise_covariance(dot(G1, G1.T))
    glmm.covariance_matrices.append(K1)

    glmm.covariance_matrices.append_iid_noise()
    glmm.covariance_matrices[0].scale = 1
    glmm.covariance_matrices[1].scale = 0
    glmm.covariance_matrices[2].scale = 1
    K = glmm.covariance_matrices.cov.value()
    assert_allclose(K, K0 + eye(nsamples))

    y = random.randn(nsamples)
    glmm.y = y

    glmm.fit(verbose=False)

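    # y is pure noise here, so both genetic scales shrink towards zero and the
    # iid noise component absorbs the variance.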
    assert_allclose(glmm.covariance_matrices[0].scale, 0, atol=1e-6)
    assert_allclose(glmm.covariance_matrices[1].scale, 0, atol=1e-6)
    assert_allclose(glmm.covariance_matrices[2].scale, 1.099905167170892, atol=1e-6)

    assert_allclose(glmm.lml(), -73.32753446649403, atol=1e-6)
Example #4
def _train_gblup(y, Z, X, include_ses=False, p_threshold=0.01):
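    # logging, numpy (as np), pyfocus and the _fit_cis_herit helper are assumed
    # to be available at module level in pyfocus; they are not re-imported here.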
    log = logging.getLogger(pyfocus.LOG)

    try:
        from limix.qc import normalise_covariance
    except ImportError as ie:
        log.error(
            "Training submodule requires limix>=2.0.0 and sklearn to be installed."
        )
        raise
    from numpy.linalg import multi_dot as mdot
    from scipy.linalg import pinvh

    log.debug("Initializing GBLUP model")

    attrs = dict()

    # estimate heritability using limix
    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)
    yresid = y - np.dot(X, fixed_betas)

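    # Skip genes whose cis heritability is not nominally significant.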
    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(
            pval, p_threshold))
        return None

    attrs["h2g"] = s2u / (fe_var + s2u + s2e)
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    # sample size and number of cis SNPs
    n, p = Z.shape

    # ridge solution (i.e. rrBLUP)
    # this will be slower than normal GBLUP when p > n but is a little bit more flexible
    ZtZpDinv = pinvh(np.dot(Z.T, Z) + np.eye(p) * (s2e / s2u))
    betas = mdot([ZtZpDinv, Z.T, yresid])

    if include_ses:
        # TODO: come back to this with matrix operations rather than list comprehensions
        # jack-knife standard-errors over the fast leave-one-out estimates using rrBLUP
        """
        h = np.array([mdot([Z[i], ZtZpDinv, Z[i]]) for i in range(n)])
        e = yresid - np.dot(Z, betas)
        beta_jk = [betas - np.dot(ZtZpDinv, Z[i] * e[i]) / (1 - h[i]) for i in range(n)]
        ses = np.sqrt(np.mean(beta_jk, axis=0) * (n - 1))
        """
        ses = None
    else:
        ses = None

    return betas, ses, attrs
Example #5
def _train_enet(y, Z, X, include_ses=False, p_threshold=0.01):
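    # logging, numpy (as np), pyfocus and _fit_cis_herit are assumed to be
    # available at module level in pyfocus; they are not re-imported here.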
    log = logging.getLogger(pyfocus.LOG)
    try:
        from limix.qc import normalise_covariance
        from sklearn.linear_model import ElasticNetCV
    except ImportError as ie:
        log.error(
            "Training submodule requires limix>=2.0.0 and sklearn to be installed."
        )
        raise
    from scipy.linalg import lstsq

    log.debug("Initializing ElasticNet model")

    n = len(y)
    attrs = dict()

    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)
    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(
            pval, p_threshold))
        return None

    h2g = s2u / (s2u + s2e + fe_var)

    attrs["h2g"] = h2g
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    # we only want to penalize SNP effects and not covariate effects...
    fixed_betas, sum_resid, ranks, svals = lstsq(X, y)
    yresid = y - np.dot(X, fixed_betas)

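    # Five-fold cross-validated elastic net on the residualised phenotype.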
    enet = ElasticNetCV(l1_ratio=0.5, fit_intercept=True, cv=5)
    enet.fit(Z, yresid)
    betas = enet.coef_

    attrs["r2"] = enet.score(Z, yresid)
    attrs["resid.var"] = sum((yresid - enet.predict(Z))**2) / (n - 1)

    if include_ses:
        # TODO: bootstrap?
        ses = None
    else:
        ses = None

    return betas, ses, attrs
Example #6
from numpy import exp, eye
from numpy.random import RandomState
import scipy.stats as st

from limix.qc import normalise_covariance
from limix.qtl import scan

# mvn is a small helper from the limix test suite that draws from a
# multivariate normal; its definition is assumed here.


def _test_qtl_scan_st(lik):
    random = RandomState(0)
    n = 30
    ncovariates = 3

    M = random.randn(n, ncovariates)

    v0 = random.rand()
    v1 = random.rand()

    G = random.randn(n, 4)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = random.randn(ncovariates)
    alpha = random.randn(G.shape[1])

    m = M @ beta + G @ alpha
    y = mvn(random, m, v0 * K + v1 * eye(n))

    idx = [[0, 1], 2, [3]]

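    # Map the latent Gaussian outcome onto the requested likelihood.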
    if lik == "poisson":
        y = random.poisson(exp(y))
    elif lik == "bernoulli":
        y = random.binomial(1, 1 / (1 + exp(-y)))
    elif lik == "probit":
        y = random.binomial(1, st.norm.cdf(y))
    elif lik == "binomial":
        ntrials = random.randint(0, 30, len(y))
        y = random.binomial(ntrials, 1 / (1 + exp(-y)))
        lik = (lik, ntrials)

    r = scan(G, y, lik=lik, idx=idx, K=K, M=M, verbose=False)
    str(r)
    str(r.stats.head())
    str(r.effsizes["h2"].head())
    str(r.h0.trait)
    str(r.h0.likelihood)
    str(r.h0.lml)
    str(r.h0.effsizes)
    str(r.h0.variances)
Example #7
from numpy import eye, kron
from numpy.random import RandomState
from numpy.testing import assert_array_equal
from xarray import DataArray

from limix.qc import normalise_covariance
from limix.qtl import scan

# vec, unvec and mvn are small helpers from the limix test suite; their
# definitions are assumed here.


def test_qtl_scan_two_hypotheses_mt_A0A1_none():
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T

    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)

    A1 = eye(ntraits)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A1.shape[1], G.shape[1]))

    m = kron(A, M) @ beta + kron(A1, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))
    Y = DataArray(Y, dims=["sample", "trait"], coords={"trait": ["WA", "Cx"]})

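    # scan is called without A0/A1 (the "A0A1_none" case in the test name).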
    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, verbose=False)
    df = r.effsizes["h2"]
    df = df[df["test"] == 0]
    assert_array_equal(df["trait"], ["WA"] * 3 + ["Cx"] * 3 + [None] * 4)
    assert_array_equal(
        df["env"], [None] * 6 + ["env1_WA", "env1_WA", "env1_Cx", "env1_Cx"]
    )
    str(r)
Example #8
# Pull a kinship matrix computed in R into numpy via rpy2 (an R object named
# K_matrix is assumed to exist in the R session).
import numpy as np
from rpy2.robjects import default_converter, pandas2ri, r
from rpy2.robjects.conversion import localconverter

with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_K = r('K_matrix')
    K_data = np.array(pd_K)

# Or build the kinship matrix with limix's linear_kinship
from limix.stats import linear_kinship
K = linear_kinship(SNP_data, verbose=True)
K_data = K

# Another way to build the kinship matrix: genotype cross-product plus normalisation
from numpy import dot
from limix.qc import normalise_covariance
X = SNP_data
K = dot(X, X.T)
Kn = normalise_covariance(K)

# Missing-value threshold (declared here but not applied in this snippet)
Miss_Tol = .3
# Minor allele frequency threshold
MAF_Tol = .05
# Estimate allele frequencies from the genotype matrix (samples x SNPs, 0/1/2 coding)
SNPsum = np.nansum(SNP_data, axis=0)
nInd = np.sum(~np.isnan(SNP_data), axis=0)
freq_hat = np.array(SNPsum, dtype="float") / (2 * nInd)
# Keep SNPs whose minor allele frequency exceeds the threshold
mask = (freq_hat > MAF_Tol) & (freq_hat < (1 - MAF_Tol))
SNP_data = SNP_data[:, mask]
SNP_names = SNP_names[mask]  # SNP_names is one-dimensional, so a single boolean index suffices
MAF = freq_hat[mask]
Example #9
def _train_lasso(y, Z, X, include_ses=False, p_threshold=0.01):
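    # logging, numpy (as np), pyfocus and _fit_cis_herit are assumed to be
    # available at module level in pyfocus; they are not re-imported here.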
    log = logging.getLogger(pyfocus.LOG)
    try:
        from limix.qc import normalise_covariance
        from sklearn.linear_model import Lasso
    except ImportError as ie:
        log.error(
            "Training submodule requires limix>=2.0.0 and sklearn to be installed."
        )
        raise
    from scipy.linalg import lstsq

    log.debug("Initializing LASSO model")

    n = len(y)
    attrs = dict()

    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)
    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(
            pval, p_threshold))
        return None

    h2g = s2u / (s2u + s2e + fe_var)

    attrs["h2g"] = h2g
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    # we only want to penalize SNP effects and not covariate effects...
    fixed_betas, sum_resid, ranks, svals = lstsq(X, y)
    yresid = y - np.dot(X, fixed_betas)

    # PLINK-style LASSO
    lambda_max = np.linalg.norm(Z.T.dot(yresid), np.inf) / float(n)

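    # Estimate the smallest useful penalty from the typical maximum correlation
    # between the SNPs and a pure-noise outcome.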
    def _gen_e():
        e = np.random.normal(size=n)
        return np.linalg.norm(Z.T.dot(e), np.inf)

    min_tmp = np.median([_gen_e() for _ in range(1000)])
    sige = np.sqrt(1.0 - h2g + (1.0 / float(n)))
    lambda_min = (sige / n) * min_tmp

    # 100 values spaced logarithmically from lambda-min to lambda-max
    alphas = np.exp(np.linspace(np.log(lambda_min), np.log(lambda_max), 100))

    # fit LASSO solution using coordinate descent, updating with consecutively smaller penalties
    lasso = Lasso(fit_intercept=True, warm_start=True)
    for penalty in reversed(alphas):
        lasso.set_params(alpha=penalty)
        lasso.fit(Z, yresid)

    betas = lasso.coef_

    attrs["r2"] = lasso.score(Z, yresid)
    attrs["resid.var"] = sum((yresid - lasso.predict(Z))**2) / (n - 1)

    if include_ses:
        # TODO: bootstrap?
        ses = None
    else:
        ses = None

    return betas, ses, attrs