Python conform_dataset Examples, limix._data.conform_dataset Python Examples

Example #1

0

Show file

File: test_dataset.py Project: Joyvalley/limix

def test_dataset_different_size():
    """Check ``conform_dataset`` when ``y`` and ``G`` have different row counts.

    Two scenarios are exercised with the same random stream: a labelled
    phenotype longer than the unlabelled genotype matrix (5 vs 3 rows) and
    the reverse (3 vs 5 rows).  In both, the conformed ``y`` and ``G`` are
    expected to equal the leading ``min(rows)`` rows of their inputs.
    """
    rng = RandomState(0)

    # (phenotype rows, genotype rows); the draw order matters because both
    # scenarios consume the same random stream.
    for n_pheno, n_geno in [(5, 3), (3, 5)]:
        values = rng.randn(n_pheno)
        names = ["sample{}".format(i) for i in range(len(values))]
        y = DataFrame(data=values, index=names)

        G = rng.randn(n_geno, 10)

        data = conform_dataset(y, G=G)

        # Only the leading rows shared by both inputs are expected back.
        n_kept = min(n_pheno, n_geno)
        assert_array_equal(data["y"].values, y[:n_kept])
        assert_array_equal(data["G"].values, G[:n_kept, :])

Example #2

0

Show file

def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    """Estimate variance components.

    The inputs are aligned with ``conform_dataset`` and checked with
    ``assert_finite``.  An ``LMM`` is then fitted for the ``"normal"``
    likelihood and a ``GLMMExpFam`` for any other likelihood.

    Parameters
    ----------
    y_phe
        Outcome, passed to ``conform_dataset`` as the first argument.
    lik
        Likelihood specification; normalised by ``normalize_likelihood``,
        whose first element is used as the likelihood name.
    kin
        Passed to ``conform_dataset`` as ``K``.  When the conformed value is
        ``None`` no decomposition is computed and ``None`` is handed to the
        model instead.
    marker_mat
        Passed to ``conform_dataset`` as ``M``.
    verbose
        When false, the session banner/line output is disabled; also
        forwarded to the model's ``fit``.

    Returns
    -------
    tuple
        ``(scale * (1 - delta), var(mean), scale * delta)`` taken from the
        fitted model.  For the ``"bernoulli"`` likelihood ``pi**2 / 3`` is
        added to the last component.
    """
    lik = normalize_likelihood(lik)
    family = lik[0]

    with session_block("Heritability analysis", disable=not verbose):
        with session_line("Normalising input...", disable=not verbose):
            conformed = conform_dataset(y_phe, M=marker_mat, K=kin)

        y_phe, marker_mat, kin = conformed["y"], conformed["M"], conformed["K"]
        assert_finite(y_phe, marker_mat, kin)

        # Decompose the covariance only when one is available.
        q_s = None if kin is None else economic_qs(kin)

        if family == "normal":
            model = LMM(y_phe.values, marker_mat.values, q_s, restricted=True)
            model.fit(verbose=verbose)
        else:
            model = GLMMExpFam(y_phe, lik, marker_mat.values, q_s, n_int=500)
            model.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        first_component = model.scale * (1 - model.delta)
        residual_component = model.scale * model.delta
        if family == "bernoulli":
            residual_component = residual_component + pi * pi / 3
        mean_variance = var(model.mean())

        return first_component, mean_variance, residual_component

Example #3

0

Show file

def estimate(y, lik, K, M=None, verbose=True):
    """Fit a mixed model and return three variance terms.

    ``y``, ``M`` and ``K`` are first aligned through ``conform_dataset`` and
    validated with ``assert_finite``.  The ``"normal"`` likelihood is fitted
    with ``LMM``; every other likelihood is fitted with ``GLMMExpFam``.

    Parameters
    ----------
    y
        Outcome, given to ``conform_dataset`` as the first argument.
    lik
        Likelihood specification, normalised by ``normalize_likelihood``;
        its first element selects the model.
    K
        Given to ``conform_dataset`` as ``K``; if the conformed value is
        ``None`` the model receives ``None`` instead of a decomposition.
    M
        Given to ``conform_dataset`` as ``M``.
    verbose
        Enables the session output and is forwarded to ``fit``.

    Returns
    -------
    tuple
        ``(g, v, e)`` where ``g = scale * (1 - delta)``, ``v`` is the
        variance of the fitted mean and ``e = scale * delta`` (plus
        ``pi**2 / 3`` for the ``"bernoulli"`` likelihood).
    """
    from numpy_sugar.linalg import economic_qs
    # ``diag`` and ``assert_likelihood`` are imported but not referenced in
    # this function body.
    from numpy import pi, var, diag
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data._assert import assert_likelihood
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line

    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    silent = {"disable": not verbose}

    with session_block("Heritability analysis", **silent):
        with session_line("Normalising input...", **silent):
            aligned = conform_dataset(y, M=M, K=K)

        y = aligned["y"]
        M = aligned["M"]
        K = aligned["K"]
        assert_finite(y, M, K)

        if K is None:
            QS = None
        else:
            QS = economic_qs(K)

        if lik_name != "normal":
            fitted = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            fitted.fit(verbose=verbose, factr=1e6, pgtol=1e-3)
        else:
            fitted = LMM(y.values, M.values, QS, restricted=True)
            fitted.fit(verbose=verbose)

        g = fitted.scale * (1 - fitted.delta)
        e = fitted.scale * fitted.delta
        if lik_name == "bernoulli":
            e = e + pi * pi / 3
        v = var(fitted.mean())

        return g, v, e

Example #4

0

Show file

File: pipeline.py Project: phue/limix

    def run(self, verbose=True):
        """Apply every registered processing step and return the final data.

        The shape of each data entry is recorded in ``self._layout`` before
        the first step ("initial") and after every step.  After each step the
        data is re-aligned with ``conform_dataset``.

        Parameters
        ----------
        verbose
            When true, the accumulated layout is printed once all steps have
            run.

        Returns
        -------
        The processed data held in ``self._data``.

        Raises
        ------
        RuntimeError
            If a step leaves no sample; the layout is printed first.
        """
        from limix._data import conform_dataset, CONF

        target = CONF["varname_to_target"]

        def _shapes(data):
            # Entries set to ``None`` have no shape to report.  The initial
            # snapshot now skips them exactly like the per-step snapshots
            # instead of failing with ``AttributeError``.
            return {
                target[n]: v.shape
                for n, v in data.items() if v is not None
            }

        self._layout.append("initial", _shapes(self._data))

        for p in self._process:
            self._data = p["func"](self._data, *p["args"], **p["kwargs"])
            # Re-align all entries after every step.
            self._data = conform_dataset(**self._data)

            self._layout.append(p["name"], _shapes(self._data))

            if self._get_samples().size == 0:
                print(self._layout.to_string())
                raise RuntimeError(
                    "Exiting early because there is no sample left.")

        if verbose:
            print(self._layout.to_string())

        return self._data

Example #5

0

Show file

def _preprocessing(data, filter, filter_missing, filter_maf, impute, verbose):
    """Match samples, then apply filters, MAF filtering and imputation.

    The shape of every entry of ``data`` is recorded in a ``_LayoutChange``
    after each stage that records one, and the resulting table is printed at
    the end (or just before an early exit).

    Parameters
    ----------
    data
        Mapping of target name to array; expanded into ``conform_dataset``.
        Must contain ``"y"``, and ``"G"`` when ``filter_maf`` is given.
    filter
        Iterable of filter specifications handed to ``_process_filter``.
        (The name shadows the builtin but is part of the signature.)
    filter_missing
        Iterable of specifications handed to ``_process_filter_missing``.
    filter_maf
        Threshold converted with ``float`` and given to
        ``_process_filter_maf``; ``None`` skips the stage.
    impute
        Iterable of specifications handed to ``_process_impute``.
    verbose
        Currently unused by this function.

    Returns
    -------
    dict
        The processed data, without entries that were ``None`` after sample
        matching.

    Raises
    ------
    RuntimeError
        If no sample (or, after MAF filtering, no candidate) is left.
    """
    from limix._data import conform_dataset
    from .._display import session_line

    layout = _LayoutChange()

    def _record(current, step):
        # One layout row per target for the given processing step.
        for target, value in current.items():
            layout.append(target, step, value.shape)

    def _fail_if_no_samples(current, hint=""):
        # Print what happened so far before giving up.
        if current["y"].sample.size == 0:
            print(layout.to_string())
            raise RuntimeError(
                "Exiting early because there is no sample left." + hint)

    _record(data, "initial")

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    data = {k: v for k, v in data.items() if v is not None}

    _record(data, "sample match")
    _fail_if_no_samples(data, " Please, check your sample ids.")

    for i, f in enumerate(filter):
        data = _process_filter(f, data)
        # Record every target first, then check once.  Checking inside the
        # per-target loop printed a layout missing most of this step's rows.
        _record(data, "filter {}".format(i))
        _fail_if_no_samples(data)

    for f in filter_missing:
        with session_line("Applying `{}`... ".format(f)):
            # NOTE(review): the return value is discarded here, unlike
            # `_process_filter` above; presumably this helper modifies
            # `data` in place. Confirm against its definition.
            _process_filter_missing(f, data)
            _fail_if_no_samples(data)

    if filter_maf is not None:
        with session_line(
                "Removing candidates with MAF<{}... ".format(filter_maf)):
            data["G"] = _process_filter_maf(float(filter_maf), data["G"])

        _record(data, "maf filter")

        if data["G"].candidate.size == 0:
            print(layout.to_string())
            raise RuntimeError(
                "Exiting early because there is no candidate left.")

    for imp in impute:
        with session_line("Imputting missing values (`{}`)... ".format(imp)):
            data = _process_impute(imp, data)

    print(layout.to_string())

    return data

Example #6

0

Show file

File: test_dataset.py Project: Joyvalley/limix

def test_dataset_conform_dataset():
    """Check sample matching of ``conform_dataset`` on labelled inputs.

    First, ``y``, ``M`` and ``K`` share the same three sample labels and the
    conformed ``y`` is expected to keep its values.  Then ``y`` carries each
    label twice while ``G`` only covers the first two samples; every
    conformed array is expected to have four samples, ordered
    ``sample0, sample1, sample0, sample1``.
    """
    pheno = array([-1.2, 3.4, 0.1])
    ids = ["sample{}".format(i) for i in range(len(pheno))]
    y = DataFrame(data=pheno, index=ids)

    rng = RandomState(0)

    # The draw order (K, then M, then G) is kept: all three share one stream.
    factor = rng.randn(3, 4)
    K = DataFrame(data=factor.dot(factor.T), index=ids, columns=ids)
    M = DataFrame(data=rng.randn(3, 2), index=ids)
    G = DataFrame(data=rng.randn(2, 4), index=ids[:2])

    data = conform_dataset(y, M=M, K=K)

    assert_array_equal(y.values, data["y"].values)

    # Same labels repeated twice on the phenotype side.
    repeated = DataFrame(data=array([-1.2, 3.4, 0.1, 0.1, 0.0, -0.2]),
                         index=ids + ids)

    data = conform_dataset(repeated, M=M, G=G, K=K)

    expected_shapes = (("y", (4, 1)), ("M", (4, 2)), ("G", (4, 4)),
                       ("K", (4, 4)))
    for name, shape in expected_shapes:
        assert_equal(data[name].shape, shape)

    expected_ids = ["sample0", "sample1", "sample0", "sample1"]
    for name in ("y", "M", "G"):
        assert_array_equal(data[name].sample, expected_ids)
    assert_array_equal(data["K"].sample_0, expected_ids)
    assert_array_equal(data["K"].sample_1, expected_ids)

    assert_array_equal(data["M"].covariate, [0, 1])
    assert_array_equal(data["G"].candidate, [0, 1, 2, 3])

Example #7

0

Show file

def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose,
         dry_run, **_):
    """ Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Examples
    --------

    ... doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \
            --output-dir=results \
            --filter="phenotype: col == 'height'" \
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    # NOTE(review): the docstring above is left untouched because it appears
    # to double as command-line help text -- confirm before editing it.
    import sys
    from os import makedirs
    from os.path import abspath, join
    import traceback
    from limix._display import session_block, banner, session_line, indent, print_exc
    # Aliased so the library routine does not shadow this function's own name.
    from limix.qtl import scan as qtl_scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    output_dir = abspath(output_dir)
    if not dry_run:
        # `exist_ok=True` already tolerates an existing directory, so no
        # separate (and racy) existence check is needed.
        makedirs(output_dir, exist_ok=True)

    def _print_data_array(arr, verbose):
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(arr))))

    # Load every input that was requested; covariates and kinship are optional.
    data = {"y": None, "G": None, "K": None}

    data["y"] = fetch("trait", trait, verbose)
    _print_data_array(data["y"], verbose)

    data["G"] = fetch("genotype", genotype, verbose)
    _print_data_array(data["G"], verbose)

    if covariate is not None:
        data["M"] = fetch("covariate", covariate, verbose)
        _print_data_array(data["M"], verbose)

    if kinship is not None:
        data["K"] = fetch("kinship", kinship, verbose)
        _print_data_array(data["K"], verbose)

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    # Drop the entries that are still unset after matching.
    data = {k: v for k, v in data.items() if v is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids.")

    oparams = _ordered_params(ctx)

    # Option name -> (processing function, label given to the pipeline).
    steps = {
        "impute": (impute_func, "impute"),
        "normalize": (normalize_func, "normalize"),
        "where": (where_func, "where"),
        "drop_missing": (drop_missing, "drop-missing"),
        "drop_maf": (drop_maf, "drop-maf"),
    }

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)
        # Keep the user-specified order of the preprocessing options.
        preproc_params = [i for i in oparams if i[0] in steps]

        for p in preproc_params:
            func, label = steps[p[0]]
            pipeline.append(func, label, p[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    try:
        # Missing optional inputs are passed as `None`.  Previously only `K`
        # was defaulted, so an absent `M` raised a bare `KeyError` outside
        # this handler.
        # NOTE(review): assumes the scan routine accepts `M=None` the same
        # way it accepts `K=None` -- confirm.
        res = qtl_scan(data["G"],
                       data["y"],
                       lik=lik,
                       K=data.get("K"),
                       M=data.get("M"),
                       verbose=verbose)
    except Exception as e:
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        res.to_csv(join(output_dir, "null.csv"), join(output_dir, "alt.csv"))

Example #8

0

Show file

File: test_dataset.py Project: zhzheng92/limix

def test_dataset_underline_prefix():
    """Check ``conform_dataset`` for three ways of labelling phenotype samples.

    The same six phenotype values are conformed against a 7x2 genotype array
    whose samples are labelled ``HG00111`` ... ``HG00142``:

    1. phenotype dimension and coordinate named ``_sample``: six samples are
       expected, the first three carrying the genotype's first three labels;
    2. phenotype dimension ``sample`` with labels ``"0"`` ... ``"5"``: no
       sample is expected to remain;
    3. phenotype dimension ``sample`` without a coordinate: same expectation
       as the first case.
    """
    pheno_values = [
        -3.7523451473100002,
        -0.421128991488,
        -0.536290093143,
        -0.9076827328799999,
        -0.251889685747,
        -0.602998035829,
    ]

    def _phenotype(dim, labels=None):
        # Build the phenotype array on dimension `dim`, optionally labelled.
        coords = {"trait": {"data": "gene1", "dims": (), "attrs": {}}}
        if labels is not None:
            coords[dim] = {"data": list(labels), "dims": (dim,), "attrs": {}}
        return DataArray.from_dict({
            "coords": coords,
            "attrs": {},
            "dims": (dim,),
            "data": list(pheno_values),
            "name": "phenotype",
        })

    def _per_sample(values):
        return {"data": list(values), "dims": ("sample",), "attrs": {}}

    def _per_candidate(values):
        return {"data": list(values), "dims": ("candidate",), "attrs": {}}

    sample_ids = [
        "HG00111",
        "HG00112",
        "HG00116",
        "HG00121",
        "HG00133",
        "HG00135",
        "HG00142",
    ]
    zeros = ["0"] * 7
    snp_ids = ["rs146752890", "rs62224610"]

    G = DataArray.from_dict({
        "coords": {
            "fid": _per_sample(sample_ids),
            "iid": _per_sample(sample_ids),
            "father": _per_sample(zeros),
            "mother": _per_sample(zeros),
            "gender": _per_sample(zeros),
            "trait": _per_sample(["-9"] * 7),
            "i": _per_candidate([0, 1]),
            "sample": _per_sample(sample_ids),
            "chrom": _per_candidate(["22", "22"]),
            "snp": _per_candidate(snp_ids),
            "cm": _per_candidate([0.0, 0.0]),
            "pos": _per_candidate([16050612, 16051347]),
            "a0": _per_candidate(["G", "C"]),
            "a1": _per_candidate(["C", "G"]),
            "candidate": _per_candidate(snp_ids),
        },
        "attrs": {},
        "dims": ("sample", "candidate"),
        "data": [
            [2.0, 0.0],
            [1.0, 2.0],
            [2.0, 2.0],
            [1.0, 2.0],
            [2.0, 1.0],
            [1.0, 1.0],
            [2.0, 2.0],
        ],
        "name": "genotype",
    })

    pheno_labels = ["0", "1", "2", "3", "4", "5"]
    leading_ids = ["HG00111", "HG00112", "HG00116"]

    # Case 1: dimension and coordinate are both named `_sample`.
    y = _phenotype("_sample", pheno_labels)
    data = conform_dataset(y, G=G)
    assert_equal(data["y"].coords["sample"][:3].values, leading_ids)
    assert_equal(data["y"].shape, (6, 1))
    assert_equal(data["y"].dims, ("sample", "trait"))

    # Case 2: labelled `sample` dimension sharing no label with the genotype.
    y = _phenotype("sample", pheno_labels)
    data = conform_dataset(y, G=G)
    assert_equal(data["y"].shape, (0, 1))
    assert_equal(data["y"].dims, ("sample", "trait"))

    # Case 3: `sample` dimension without any coordinate.
    y = _phenotype("sample")
    data = conform_dataset(y, G=G)
    assert_equal(data["y"].coords["sample"][:3].values, leading_ids)
    assert_equal(data["y"].shape, (6, 1))
    assert_equal(data["y"].dims, ("sample", "trait"))