def test_dataset_different_size():
    """Check `conform_dataset` when `y` and `G` have different lengths.

    Two scenarios are run with the same random stream: a labelled phenotype
    that is longer than the unlabelled genotype matrix, and the reverse. In
    both, the conformed arrays are expected to hold the leading rows shared
    by the two inputs.
    """
    rng = RandomState(0)

    # (number of phenotype rows, number of genotype rows)
    for n_pheno, n_geno in ((5, 3), (3, 5)):
        n_common = min(n_pheno, n_geno)

        # Draw order matters for reproducibility: phenotype first, then G.
        values = rng.randn(n_pheno)
        labels = ["sample{}".format(i) for i in range(len(values))]
        y = DataFrame(data=values, index=labels)
        G = rng.randn(n_geno, 10)

        data = conform_dataset(y, G=G)

        assert_array_equal(data["y"].values, y[:n_common])
        assert_array_equal(data["G"].values, G[:n_common, :])
def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    """
    Estimate variance components.

    The inputs are conformed with `conform_dataset`, checked with
    `assert_finite`, and a model is fitted: `LMM` (restricted) when the
    likelihood name is ``"normal"``, `GLMMExpFam` otherwise.

    Parameters
    ----------
    y_phe
        Phenotype, passed as the first argument of `conform_dataset`.
    lik
        Likelihood specification; normalised by `normalize_likelihood`, whose
        first element is taken as the likelihood name.
    kin
        Kinship matrix, or ``None`` to fit without a decomposition.
    marker_mat
        Covariates, passed to `conform_dataset` as ``M``.
    verbose
        When false, session output is disabled and fitting is run quietly.

    Returns
    -------
    tuple
        ``(v_g, v_v, v_e)`` where ``v_g = scale * (1 - delta)``,
        ``v_v = var(method.mean())`` and ``v_e = scale * delta``, the latter
        increased by ``pi**2 / 3`` for the ``"bernoulli"`` likelihood.
    """
    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):
        with session_line("Normalising input...", disable=quiet):
            data = conform_dataset(y_phe, M=marker_mat, K=kin)

        y_phe, marker_mat, kin = data["y"], data["M"], data["K"]
        assert_finite(y_phe, marker_mat, kin)

        # The normalisation K = K / diag(K).mean() is currently disabled.
        q_s = None if kin is None else economic_qs(kin)

        if lik_name == "normal":
            method = LMM(y_phe.values, marker_mat.values, q_s, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y_phe, lik, marker_mat.values, q_s, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        scale = method.scale
        delta = method.delta

        v_g = scale * (1 - delta)
        v_e = scale * delta
        if lik_name == "bernoulli":
            v_e += pi * pi / 3

        v_v = var(method.mean())

        return v_g, v_v, v_e
def estimate(y, lik, K, M=None, verbose=True):
    """
    Estimate variance components from a phenotype, kinship and covariates.

    The inputs are conformed with `conform_dataset`, checked with
    `assert_finite`, and a model is fitted: `LMM` (restricted) when the
    likelihood name is ``"normal"``, `GLMMExpFam` otherwise.

    Parameters
    ----------
    y
        Phenotype, passed as the first argument of `conform_dataset`.
    lik
        Likelihood specification; normalised by `normalize_likelihood`, whose
        first element is taken as the likelihood name.
    K
        Kinship matrix, or ``None`` to fit without a decomposition.
    M
        Covariates, forwarded to `conform_dataset`.
    verbose
        When false, session output is disabled and fitting is run quietly.

    Returns
    -------
    tuple
        ``(g, v, e)`` where ``g = scale * (1 - delta)``,
        ``v = var(method.mean())`` and ``e = scale * delta``, the latter
        increased by ``pi**2 / 3`` for the ``"bernoulli"`` likelihood.
    """
    from numpy_sugar.linalg import economic_qs
    from numpy import pi, var, diag
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data._assert import assert_likelihood
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line

    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):
        with session_line("Normalising input...", disable=quiet):
            data = conform_dataset(y, M=M, K=K)

        y, M, K = data["y"], data["M"], data["K"]
        assert_finite(y, M, K)

        # The normalisation K = K / diag(K).mean() is currently disabled.
        QS = None if K is None else economic_qs(K)

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        scale = method.scale
        delta = method.delta

        g = scale * (1 - delta)
        e = scale * delta
        if lik_name == "bernoulli":
            e += pi * pi / 3

        v = var(method.mean())

        return g, v, e
def run(self, verbose=True):
    """Apply every registered processing step and return the resulting data.

    The shape of each variable is recorded in ``self._layout`` before the
    first step and again after every step. Each step's output is passed
    through `conform_dataset` before it is recorded.

    Parameters
    ----------
    verbose
        When true, the layout table is printed after all steps succeeded.

    Returns
    -------
    The processed data, as stored in ``self._data``.

    Raises
    ------
    RuntimeError
        If no sample is left after a step; the layout is printed first.
    """
    from limix._data import conform_dataset, CONF

    target = CONF["varname_to_target"]

    initial_shapes = {}
    for varname in self._data:
        initial_shapes[target[varname]] = self._data[varname].shape
    self._layout.append("initial", initial_shapes)

    for step in self._process:
        self._data = step["func"](self._data, *step["args"], **step["kwargs"])
        self._data = conform_dataset(**self._data)

        # Variables that are None after conforming have no shape to report.
        step_shapes = {}
        for varname, value in self._data.items():
            if value is None:
                continue
            step_shapes[target[varname]] = self._data[varname].shape
        self._layout.append(step["name"], step_shapes)

        if self._get_samples().size == 0:
            print(self._layout.to_string())
            raise RuntimeError(
                "Exiting early because there is no sample left.")

    if verbose:
        print(self._layout.to_string())

    return self._data
def _preprocessing(data, filter, filter_missing, filter_maf, impute, verbose):
    """Match samples, then filter and impute a dataset, tracking its layout.

    Parameters
    ----------
    data
        Mapping from target name to array; expanded as keyword arguments of
        `conform_dataset`.
    filter
        Iterable of filter specifications handed to `_process_filter`.
    filter_missing
        Iterable of specifications handed to `_process_filter_missing`.
    filter_maf
        Minimum allele frequency threshold (converted with ``float``), or
        ``None`` to skip that step.
    impute
        Iterable of imputation specifications handed to `_process_impute`.
    verbose
        Not used by this function.

    Returns
    -------
    dict
        The processed data, without the entries that were ``None`` after
        sample matching.

    Raises
    ------
    RuntimeError
        If no sample, or no candidate after MAF filtering, is left. The
        layout table is printed before raising.
    """
    from limix._data import conform_dataset
    from .._display import session_line

    layout = _LayoutChange()

    def _record(stage):
        # Log the current shape of every target under the given stage name.
        for target, value in data.items():
            layout.append(target, stage, value.shape)

    def _abort_if_empty(count, message):
        # Show what happened so far before giving up.
        if count == 0:
            print(layout.to_string())
            raise RuntimeError(message)

    no_sample_msg = "Exiting early because there is no sample left."

    _record("initial")

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    data = {name: value for name, value in data.items() if value is not None}
    _record("sample match")
    _abort_if_empty(
        data["y"].sample.size,
        "Exiting early because there is no sample left."
        + " Please, check your sample ids.",
    )

    for index, spec in enumerate(filter):
        data = _process_filter(spec, data)
        _record("filter {}".format(index))
        _abort_if_empty(data["y"].sample.size, no_sample_msg)

    for spec in filter_missing:
        with session_line("Applying `{}`... ".format(spec)):
            # Return value is discarded; presumably `data` is modified in
            # place -- TODO confirm against `_process_filter_missing`.
            _process_filter_missing(spec, data)
        _abort_if_empty(data["y"].sample.size, no_sample_msg)

    if filter_maf is not None:
        with session_line(
                "Removing candidates with MAF<{}... ".format(filter_maf)):
            data["G"] = _process_filter_maf(float(filter_maf), data["G"])
        _record("maf filter")
        _abort_if_empty(
            data["G"].candidate.size,
            "Exiting early because there is no candidate left.",
        )

    for spec in impute:
        with session_line("Imputting missing values (`{}`)... ".format(spec)):
            data = _process_impute(spec, data)

    print(layout.to_string())

    return data
def test_dataset_conform_dataset():
    """Check `conform_dataset` on fully labelled inputs.

    First, a phenotype, covariates and kinship sharing the same three sample
    labels must leave the phenotype values untouched. Then a phenotype with
    every label duplicated is combined with a genotype that only covers the
    first two samples; shapes, sample labels and the default covariate and
    candidate labels of the result are verified.
    """
    pheno = array([-1.2, 3.4, 0.1])
    labels = ["sample{}".format(i) for i in range(len(pheno))]
    y = DataFrame(data=pheno, index=labels)

    # Draw order is kept (K, then M, then G) so the random stream is unchanged.
    rng = RandomState(0)

    factor = rng.randn(3, 4)
    K = DataFrame(data=factor.dot(factor.T), index=labels, columns=labels)
    M = DataFrame(data=rng.randn(3, 2), index=labels)
    G = DataFrame(data=rng.randn(2, 4), index=labels[:2])

    data = conform_dataset(y, M=M, K=K)
    assert_array_equal(y.values, data["y"].values)

    pheno = array([-1.2, 3.4, 0.1, 0.1, 0.0, -0.2])
    duplicated = DataFrame(data=pheno, index=labels + labels)
    data = conform_dataset(duplicated, M=M, G=G, K=K)

    expected_shapes = (
        ("y", (4, 1)),
        ("M", (4, 2)),
        ("G", (4, 4)),
        ("K", (4, 4)),
    )
    for name, shape in expected_shapes:
        assert_equal(data[name].shape, shape)

    expected_samples = ["sample0", "sample1", "sample0", "sample1"]
    for name in ("y", "M", "G"):
        assert_array_equal(data[name].sample, expected_samples)
    assert_array_equal(data["K"].sample_0, expected_samples)
    assert_array_equal(data["K"].sample_1, expected_samples)

    assert_array_equal(data["M"].covariate, [0, 1])
    assert_array_equal(data["G"].candidate, [0, 1, 2, 3])
def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose,
         dry_run, **_):
    """
    Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Examples
    --------

    ... doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \
            --output-dir=results \
            --filter="phenotype: col == 'height'" \
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    import sys
    from os import makedirs
    from os.path import abspath, exists, join
    import traceback
    from limix._display import session_block, banner, session_line, indent, print_exc
    from limix.qtl import scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    output_dir = abspath(output_dir)
    # A dry run must not touch the file system.
    if not dry_run and not exists(output_dir):
        makedirs(output_dir, exist_ok=True)

    # (data key, fetch kind, user specification, always fetched?)
    sources = (
        ("y", "trait", trait, True),
        ("G", "genotype", genotype, True),
        ("M", "covariate", covariate, False),
        ("K", "kinship", kinship, False),
    )
    data = {"y": None, "G": None, "K": None}
    for key, kind, spec, required in sources:
        if spec is None and not required:
            continue
        data[key] = fetch(kind, spec, verbose)
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(data[key]))))

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    data = {name: value for name, value in data.items() if value is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids.")

    oparams = _ordered_params(ctx)

    # Command-line option name -> (step function, name shown in the layout).
    steps = {
        "impute": (impute_func, "impute"),
        "normalize": (normalize_func, "normalize"),
        "where": (where_func, "where"),
        "drop_missing": (drop_missing, "drop-missing"),
        "drop_maf": (drop_maf, "drop-maf"),
    }

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)

        # Steps are appended in the order the options were given.
        for param in oparams:
            if param[0] not in steps:
                continue
            func, label = steps[param[0]]
            pipeline.append(func, label, param[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    if "K" not in data:
        data["K"] = None

    try:
        res = scan(data["G"], data["y"], lik=lik, K=data["K"], M=data["M"],
                   verbose=verbose)
    except Exception as e:
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        null_path = join(output_dir, "null.csv")
        alt_path = join(output_dir, "alt.csv")
        res.to_csv(null_path, alt_path)
def test_dataset_underline_prefix():
    """Check how `conform_dataset` treats the phenotype's sample dimension.

    The same six-value phenotype is conformed against a 7x2 genotype in
    three variants:

    1. dimension and coordinate named ``_sample``: the result is expected to
       carry the genotype's leading sample labels and keep all six rows;
    2. dimension and coordinate named ``sample`` with labels that do not
       occur in the genotype: no row is expected to survive;
    3. dimension ``sample`` without any coordinate: same expectation as (1).
    """

    def _coord(values, dim):
        # One-dimensional coordinate entry in `DataArray.from_dict` format.
        return {"data": values, "dims": (dim,), "attrs": {}}

    def _phenotype(dim, labelled):
        # Build the phenotype; only the sample dimension/coordinate varies.
        coords = {"trait": {"data": "gene1", "dims": (), "attrs": {}}}
        if labelled:
            coords[dim] = _coord(["0", "1", "2", "3", "4", "5"], dim)
        return DataArray.from_dict(
            {
                "coords": coords,
                "attrs": {},
                "dims": (dim,),
                "data": [
                    -3.7523451473100002,
                    -0.421128991488,
                    -0.536290093143,
                    -0.9076827328799999,
                    -0.251889685747,
                    -0.602998035829,
                ],
                "name": "phenotype",
            }
        )

    def _individuals():
        # Fresh list each time so coordinates never share a list object.
        return [
            "HG00111",
            "HG00112",
            "HG00116",
            "HG00121",
            "HG00133",
            "HG00135",
            "HG00142",
        ]

    def _check_matched(result):
        # Expectations shared by the variants that keep all six rows.
        assert_equal(
            result["y"].coords["sample"][:3].values,
            ["HG00111", "HG00112", "HG00116"],
        )
        assert_equal(result["y"].shape, (6, 1))
        assert_equal(result["y"].dims, ("sample", "trait"))

    G = DataArray.from_dict(
        {
            "coords": {
                "fid": _coord(_individuals(), "sample"),
                "iid": _coord(_individuals(), "sample"),
                "father": _coord(["0"] * 7, "sample"),
                "mother": _coord(["0"] * 7, "sample"),
                "gender": _coord(["0"] * 7, "sample"),
                "trait": _coord(["-9"] * 7, "sample"),
                "i": _coord([0, 1], "candidate"),
                "sample": _coord(_individuals(), "sample"),
                "chrom": _coord(["22", "22"], "candidate"),
                "snp": _coord(["rs146752890", "rs62224610"], "candidate"),
                "cm": _coord([0.0, 0.0], "candidate"),
                "pos": _coord([16050612, 16051347], "candidate"),
                "a0": _coord(["G", "C"], "candidate"),
                "a1": _coord(["C", "G"], "candidate"),
                "candidate": _coord(
                    ["rs146752890", "rs62224610"], "candidate"
                ),
            },
            "attrs": {},
            "dims": ("sample", "candidate"),
            "data": [
                [2.0, 0.0],
                [1.0, 2.0],
                [2.0, 2.0],
                [1.0, 2.0],
                [2.0, 1.0],
                [1.0, 1.0],
                [2.0, 2.0],
            ],
            "name": "genotype",
        }
    )

    # Variant 1: underscore-prefixed sample dimension.
    y = _phenotype("_sample", labelled=True)
    _check_matched(conform_dataset(y, G=G))

    # Variant 2: explicit sample labels that do not occur in the genotype.
    y = _phenotype("sample", labelled=True)
    data = conform_dataset(y, G=G)
    assert_equal(data["y"].shape, (0, 1))
    assert_equal(data["y"].dims, ("sample", "trait"))

    # Variant 3: sample dimension without labels.
    y = _phenotype("sample", labelled=False)
    _check_matched(conform_dataset(y, G=G))