Example 1
def test_qtl_scan_lmm_repeat_samples_by_index():
    random = RandomState(0)
    nsamples = 30
    samples = ["sample{}".format(i) for i in range(nsamples)]

    G = random.randn(nsamples, 100)
    G = DataFrame(data=G, index=samples)

    K = linear_kinship(G.values[:, 0:80], verbose=False)
    K = DataFrame(data=K, index=samples, columns=samples)

    y0 = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)
    y1 = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)
    y = concatenate((y0, y1))
    y = DataFrame(data=y, index=samples + samples)

    M = G.values[:, :5]
    X = G.values[:, 68:70]
    M = DataFrame(data=M, index=samples)
    X = DataFrame(data=X, index=samples)

    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv.values[0], 0.9920306566395604, rtol=1e-6)

    ix_best_snp = argmin(array(result.stats["pv20"]))

    M = concatenate((M, X.loc[:, [ix_best_snp]]), axis=1)
    M = DataFrame(data=M, index=samples)

    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv[ix_best_snp], 1.0, rtol=1e-6)
    assert_allclose(pv.values[0], 0.6684700834450028, rtol=1e-6)
Example 2
def test_qtl_scan_lmm_different_samples_order():
    random = RandomState(0)
    nsamples = 50
    samples = ["sample{}".format(i) for i in range(nsamples)]

    G = random.randn(nsamples, 100)
    G = DataFrame(data=G, index=samples)

    K = linear_kinship(G.values[:, 0:80], verbose=False)
    K = DataFrame(data=K, index=samples, columns=samples)

    y = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)
    y = DataFrame(data=y, index=samples)

    M = G.values[:, :5]
    X = G.values[:, 68:70]
    M = DataFrame(data=M, index=samples)
    X = DataFrame(data=X, index=samples)

    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv.values[1], 0.10807353644788478, rtol=1e-6)
    X.sort_index(inplace=True, ascending=False)
    X = DataFrame(X.values, index=X.index.values)
    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv.values[1], 0.10807353644788478, rtol=1e-6)
Example 3
def test_qtl_scan_three_hypotheses_mt():
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T

    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)

    A0 = random.randn(ntraits, 1)
    A1 = random.randn(ntraits, 2)
    A01 = concatenate((A0, A1), axis=1)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A01.shape[1], G.shape[1]))

    m = kron(A, M) @ beta + kron(A01, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)
    str(r)
Example 4
def test_qtl_scan_glmm_wrong_dimensions():
    random = RandomState(0)
    nsamples = 25

    X = random.randn(nsamples, 2)
    G = random.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = random.randint(1, 100, nsamples)
    z = dot(G, random.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for i, nt in enumerate(ntrials):
        for _ in range(nt):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    M = random.randn(49, 2)  # deliberately mismatched: 49 rows instead of nsamples
    scan(X, successes, ("binomial", ntrials), K, M=M, verbose=False)
Example 5
def test_qtl_scan_lmm():
    random = RandomState(0)
    nsamples = 50

    G = random.randn(50, 100)
    K = linear_kinship(G[:, 0:80], verbose=False)

    y = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)

    M = G[:, :5]
    X = G[:, 68:70]

    result = scan(X, y, lik="normal", K=K, M=M, verbose=False)

    pv = result.stats["pv20"]

    ix_best_snp = argmin(array(pv))
    M = concatenate((M, X[:, [ix_best_snp]]), axis=1)
    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv[ix_best_snp], 1.0, atol=1e-6)
Example 6
def test_qtl_finite():
    random = RandomState(0)
    nsamples = 20

    X = random.randn(nsamples, 2)
    G = random.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = random.randint(1, 100, nsamples)
    z = dot(G, random.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for i, nt in enumerate(ntrials):
        for _ in range(nt):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    successes = successes.astype(float)
    ntrials = ntrials.astype(float)

    successes[0] = nan
    with pytest.raises(ValueError):
        scan(X, successes, ("binomial", ntrials), K, verbose=False)
    successes[0] = 1.0

    K[0, 0] = nan
    with pytest.raises(ValueError):
        scan(X, successes, ("binomial", ntrials), K, verbose=False)
    K[0, 0] = 1.0

    X[0, 0] = nan
    with pytest.raises(ValueError):
        scan(X, successes, ("binomial", ntrials), K, verbose=False)
    X[0, 0] = 1.0
Example 7
def test_qtl_scan_lm():
    random = RandomState(0)
    nsamples = 25

    G = random.randn(nsamples, 100)

    y = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)

    M = G[:, :5]
    X = G[:, 5:]
    result = scan(X, y, "normal", M=M, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv[:2], [0.02625506841465465, 0.9162689001409643], rtol=1e-5)
Example 8
def do_gwas(self, geno_mat=None):
    from limix.qtl import scan
    print("Starting GWAS...")
    if geno_mat is None:
        geno_mat = self.geno_matrix.T
    if self.kinship is not None:
        res = scan(geno_mat, self.pheno_list.values, "normal", K=self.kinship.values, verbose=False)
    else:
        res = scan(self.geno_matrix.T.values, self.pheno_list.values, "normal", K=None, verbose=False)
    res_p = res.stats
    res_p.index = self.SNPinfo.rsid
    res_p.loc[:, 'rsid'] = self.SNPinfo.rsid
    res_p.loc[:, 'chrom'] = self.SNPinfo.chrom
    res_p.loc[:, 'position'] = self.SNPinfo.position
    betas = np.array(res.effsizes['h2'].effsize[res.effsizes['h2'].effect_type == 'candidate'])
    se = np.array(res.effsizes['h2'].effsize_se[res.effsizes['h2'].effect_type == 'candidate'])
    res_p.loc[:, 'beta'] = betas
    res_p.loc[:, 'se'] = se
    res_p.loc[:, 'z_score'] = betas / se
    self.res_p = res_p
    return res_p
Example 9
def test_qtl_scan_lmm_nokinship():
    random = RandomState(0)
    nsamples = 50

    G = random.randn(50, 100)
    K = linear_kinship(G[:, 0:80], verbose=False)

    y = dot(G, random.randn(100)) / sqrt(100) + 0.2 * random.randn(nsamples)

    M = G[:, :5]
    X = G[:, 68:70]

    result = scan(X, y, "normal", K, M=M, verbose=False)
    pv = result.stats["pv20"].values
    assert_allclose(pv[:2], [8.159539103135342e-05, 0.10807353641893498], atol=1e-5)
Example 10
def get_qtl_maps(self, covs=None):
    filter_nanaccs_ix = self.get_filter_accs_nans()
    if covs is None:
        covs = np.ones(self.genos.shape[0])
    else:  ### Need to also filter the accessions where covs has nan
        assert type(covs) is np.ndarray
        filter_nanaccs_ix = np.intersect1d(
            filter_nanaccs_ix,
            np.where(np.isfinite(np.array(covs)))[0])
    if type(self.pheno) is pd.Series:
        lm = scan(self.genos[filter_nanaccs_ix, :],
                  np.array(self.pheno.iloc[filter_nanaccs_ix]),
                  covs=covs[filter_nanaccs_ix])
        if len(np.where(np.isfinite(lm.getPv()[0]))[0]) == 0:
            return None
        return lm
    else:
        lm = []
        for cl in self.pheno:
            lm.append(
                scan(self.genos[filter_nanaccs_ix, :],
                     np.array(self.pheno[cl][filter_nanaccs_ix]),
                     covs=covs[filter_nanaccs_ix]))
        return lm  ## returns a list of per-trait results
Example 11
def test_qtl_scan_glmm_bernoulli_nokinship():
    random = RandomState(0)
    nsamples = 25

    X = random.randn(nsamples, 2)
    G = random.randn(nsamples, 100)
    ntrials = random.randint(1, 2, nsamples)
    z = dot(G, random.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for i, nt in enumerate(ntrials):
        for _ in range(nt):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    result = scan(X, successes, "bernoulli", verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv, [0.3399067917883736, 0.8269568797830423], rtol=1e-5)
Example 12
def test_qtl_scan_glmm_binomial():
    random = RandomState(0)
    nsamples = 25

    X = random.randn(nsamples, 2)
    G = random.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = random.randint(1, 100, nsamples)
    z = dot(G, random.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for i, nt in enumerate(ntrials):
        for _ in range(nt):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv, [0.9315770010211236, 0.8457015828837173], atol=1e-6, rtol=1e-6)
Example 13
def test_qtl_scan_gmm_binomial():
    random = RandomState(0)
    nsamples = 25

    X = random.randn(nsamples, 2)
    ntrials = random.randint(1, nsamples, nsamples)
    z = dot(X, random.randn(2))

    successes = zeros(len(ntrials), int)
    for i in range(len(ntrials)):
        for _ in range(ntrials[i]):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    result = scan(X, successes, ("binomial", ntrials), verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(
        pv, [2.4604711379400065e-06, 0.01823278752006871], rtol=1e-5, atol=1e-5
    )
Example 14
def test_qtl_scan_glmm_bernoulli():
    random = RandomState(0)
    nsamples = 25

    X = random.randn(nsamples, 2)
    G = random.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = random.randint(1, 2, nsamples)
    z = dot(G, random.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for i, nt in enumerate(ntrials):
        for _ in range(nt):
            successes[i] += int(z[i] + 0.5 * random.randn() > 0)

    result = scan(X, successes, "bernoulli", K, verbose=False)
    pv = result.stats["pv20"]
    assert_allclose(pv, [0.3399326545917558, 0.8269454251659921], rtol=1e-5)
Example 15
def _test_qtl_scan_st(lik):
    random = RandomState(0)
    n = 30
    ncovariates = 3

    M = random.randn(n, ncovariates)

    v0 = random.rand()
    v1 = random.rand()

    G = random.randn(n, 4)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = random.randn(ncovariates)
    alpha = random.randn(G.shape[1])

    m = M @ beta + G @ alpha
    y = mvn(random, m, v0 * K + v1 * eye(n))

    idx = [[0, 1], 2, [3]]

    if lik == "poisson":
        y = random.poisson(exp(y))
    elif lik == "bernoulli":
        y = random.binomial(1, 1 / (1 + exp(-y)))
    elif lik == "probit":
        y = random.binomial(1, st.norm.cdf(y))
    elif lik == "binomial":
        ntrials = random.randint(0, 30, len(y))
        y = random.binomial(ntrials, 1 / (1 + exp(-y)))
        lik = (lik, ntrials)

    r = scan(G, y, lik=lik, idx=idx, K=K, M=M, verbose=False)
    str(r)
    str(r.stats.head())
    str(r.effsizes["h2"].head())
    str(r.h0.trait)
    str(r.h0.likelihood)
    str(r.h0.lml)
    str(r.h0.effsizes)
    str(r.h0.variances)
Example 16
def test_qtl_scan_two_hypotheses_mt_A0A1_none():
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T

    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)

    A1 = eye(ntraits)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A1.shape[1], G.shape[1]))

    m = kron(A, M) @ beta + kron(A1, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))
    Y = DataArray(Y, dims=["sample", "trait"], coords={"trait": ["WA", "Cx"]})

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, verbose=False)
    df = r.effsizes["h2"]
    df = df[df["test"] == 0]
    assert_array_equal(df["trait"], ["WA"] * 3 + ["Cx"] * 3 + [None] * 4)
    assert_array_equal(
        df["env"], [None] * 6 + ["env1_WA", "env1_WA", "env1_Cx", "env1_Cx"]
    )
    str(r)
Example 17
# select kinship only for phenotyped and genotyped accessions
acn_indices = [
    np.where(kin_hdf['accessions'][:] == acn)[0][0] for acn in pheno.index
]
acn_indices.sort()
K = kin_hdf['kinship'][acn_indices, :][:, acn_indices]
kin_hdf.close()

# get phenotype in correct order
pheno = pheno.loc[acn_order]
Y = pheno.to_numpy()

### MTMM TESTS ###
A = np.matrix('0 1; 1 0')
A0 = np.ones((len(traits), 1))
A1 = np.eye(len(traits))
# M = np.repeat(1, Y.shape[0])

r = scan(G, Y, K=K, lik='normal', A=A, A0=A0, A1=A1, verbose=True)

# save results
# link chromosome and positions to p-values and effect sizes
geno_hdf = h5py.File(genoFile, 'r')
chrIdx = geno_hdf['positions'].attrs['chr_regions']
chrom = [bisect(chrIdx[:, 1], snpIdx) + 1 for snpIdx in SNP_indices]
positions = geno_hdf['positions'][:]
pos = [positions[snp] for snp in SNP_indices]

# G effect only
pv10 = r.stats.pv10.tolist()
# G + GxE
pv20 = r.stats.pv20.tolist()
# GxE effect only
pv21 = r.stats.pv21.tolist()
Example 18
fam.drop(fam.index[ind], inplace=True)
# and from Kp, which gives K accession info
Kp.drop(Kp.index[ind], inplace=True)
Kp.drop(Kp.columns[ind], axis=1, inplace=True)


GP_check = fam.index == pheno_ids.index
PK_check = pheno_ids.index == Kp.index
# A mismatch in either index means the files are out of order.
if np.count_nonzero(GP_check == False) != 0 or np.count_nonzero(PK_check == False) != 0:
    print("CAUTION: Not all data files are in the same order!")
if np.count_nonzero(GP_check == False) == 0 and np.count_nonzero(PK_check == False) == 0:
    print("All input files in same order")

######################################
### 7. Run marginal GWAS in limix

r = scan(G=G, Y=Y, K=K, lik='normal', M=None, verbose=True)

####################################
### 8. Output results

chrom = bim[['chrom']]
pos = bim[['pos']]
#extract pvals
pvalues = r.stats.pv20.tolist()
#extract effect sizes
effsizes = r.effsizes['h2']['effsize'][r.effsizes['h2']['effect_type'] == 'candidate'].tolist()

gwas_results = np.c_[chrom, pos, pvalues, effsizes]
gwas_results = pd.DataFrame(data=gwas_results, index=None, columns=["chrom", "pos", "pv", "GVE"])
gwas_results.to_csv(output_file, index = False)
Example 19
kin_hdf = h5py.File(args.kinship, 'r')

# select kinship only for phenotyped and genotyped accessions
acn_indices = [
    np.where(kin_hdf['accessions'][:] == acn)[0][0] for acn in pheno.index
]
acn_indices.sort()
K = kin_hdf['kinship'][acn_indices, :][:, acn_indices]
kin_hdf.close()

# get phenotype in correct order
pheno = pheno.loc[acn_order]
Y = pheno.to_numpy()

# scan
r = scan(G, Y, K=K, lik="normal", M=None, verbose=True)

# save results
# link chromosome and positions to p-values and effect sizes
geno_hdf = h5py.File(args.genotype, 'r')
chrIdx = geno_hdf['positions'].attrs['chr_regions']
chrom = [bisect(chrIdx[:, 1], snpIdx) + 1 for snpIdx in SNP_indices]
positions = geno_hdf['positions'][:]
pos = [positions[snp] for snp in SNP_indices]
pvalues = r.stats.pv20.tolist()
effsizes = r.effsizes['h2']['effsize'][r.effsizes['h2']['effect_type'] ==
                                       'candidate'].to_list()
# multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf); index 3 is the Bonferroni-corrected alpha.
Bonferroni = multitest.multipletests(pvalues, alpha=0.05, method='fdr_bh')[3]

gwas_tuples = list(zip(chrom, pos, pvalues))
gwas_results = pd.DataFrame(gwas_tuples, columns=['chrom', 'pos', 'pv'])
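
A quick note on the chromosome-lookup step used above (and in Example 17): chr_regions stores, for each chromosome, the start and end of its SNP-index range, so bisecting the column of end indices gives the 0-based chromosome number and adding 1 makes it 1-based. A minimal, self-contained sketch with a made-up chr_regions array (not the real HDF5 layout):

from bisect import bisect
import numpy as np

# Hypothetical chr_regions: each row is the [start, end) SNP-index range of one chromosome.
chr_regions = np.array([[0, 100], [100, 250], [250, 400]])
snp_indices = [5, 120, 399]
chrom = [bisect(chr_regions[:, 1], idx) + 1 for idx in snp_indices]
print(chrom)  # [1, 2, 3]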
Example 20
def run_lm_st(inputs):
    for snp in inputs.geno.get_snps_iterator(is_chunked=True):
        lm_chunk = scan(np.array(snp[:, inputs.accinds], dtype=int).T,
                        np.array(inputs.pheno.values), test=inputs.test)
        yield lm_chunk
Example 21
def run_glmm_st(inputs):
    for snp in inputs.geno.get_snps_iterator(is_chunked=True):
        lmm_chunk = scan(np.array(snp[:, inputs.accinds], dtype=int).T,
                         np.array(inputs.pheno.values), lik=inputs.pheno_type,
                         K=inputs.kin, test=inputs.test, searchDelta=False)
        yield lmm_chunk
Example 22
def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose,
         dry_run, **_):
    """ Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Examples
    --------

    .. doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \
            --output-dir=results \
            --filter="phenotype: col == 'height'" \
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    import sys
    from os import makedirs
    from os.path import abspath, exists, join
    import traceback
    from limix._display import session_block, banner, session_line, indent, print_exc
    from limix.qtl import scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    output_dir = abspath(output_dir)
    if not dry_run:
        if not exists(output_dir):
            makedirs(output_dir, exist_ok=True)

    def _print_data_array(arr, verbose):
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(arr))))

    data = {"y": None, "G": None, "K": None}

    data["y"] = fetch("trait", trait, verbose)
    _print_data_array(data["y"], verbose)

    data["G"] = fetch("genotype", genotype, verbose)
    _print_data_array(data["G"], verbose)

    if covariate is not None:
        data["M"] = fetch("covariate", covariate, verbose)
        _print_data_array(data["M"], verbose)

    if kinship is not None:
        data["K"] = fetch("kinship", kinship, verbose)
        _print_data_array(data["K"], verbose)

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    data = {k: v for k, v in data.items() if v is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids.")

    oparams = _ordered_params(ctx)

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)
        preproc_params = [
            i for i in oparams if i[0] in
            ["impute", "normalize", "where", "drop_missing", "drop_maf"]
        ]

        for p in preproc_params:
            if p[0] == "where":
                pipeline.append(where_func, "where", p[1])
            elif p[0] == "normalize":
                pipeline.append(normalize_func, "normalize", p[1])
            elif p[0] == "impute":
                pipeline.append(impute_func, "impute", p[1])
            elif p[0] == "drop_maf":
                pipeline.append(drop_maf, "drop-maf", p[1])
            elif p[0] == "drop_missing":
                pipeline.append(drop_missing, "drop-missing", p[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    if "K" not in data:
        data["K"] = None
    try:
        res = scan(data["G"],
                   data["y"],
                   lik=lik,
                   K=data["K"],
                   M=data["M"],
                   verbose=verbose)
    except Exception as e:
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        res.to_csv(join(output_dir, "null.csv"), join(output_dir, "alt.csv"))
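
The --filter syntax documented in the command above points to pandas.DataFrame.query for the semantics of the boolean expression. As a rough, standalone illustration only (the DataFrame below is made up and is not how the CLI stores genotype metadata), the genotype filter from the docstring example amounts to:

import pandas as pd

# Hypothetical genotype metadata; the real CLI reads it from the genotype file.
genotype = pd.DataFrame({"chrom": ["1", "3", "3"], "pos": [50, 150, 300]})
subset = genotype.query("(chrom == '3') & (pos > 100) & (pos < 200)")
print(subset)  # keeps only the chromosome-3 locus at position 150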