def _fixup(test_snps, G, pheno, covar,count_A1=None):
    test_snps = _snps_fixup(test_snps,count_A1=count_A1)
    G = _snps_fixup(G or test_snps,count_A1=count_A1)
    pheno = _pheno_fixup(pheno,count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
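    # Keep only IIDs whose phenotype is not NaN (x == x is False only where x is NaN).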
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid,count_A1=count_A1)
    G, test_snps, pheno, covar  = pstutil.intersect_apply([G, test_snps, pheno, covar])
    return test_snps, G, pheno, covar
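# Usage sketch (hypothetical file names): _fixup returns the four readers intersected and
# reordered so that they all share the same IIDs in the same order, e.g.
#     test_snps, G, pheno, covar = _fixup("mystudy", None, "mystudy.phe", None)
#     # now np.array_equal(test_snps.iid, pheno.iid) and np.array_equal(G.iid, covar.iid)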
def single_snp_all_plus_select(test_snps, pheno, G=None, covar=None,
                 k_list = None,
                 n_folds=10, #1 is special and means test on train
                 seed = 0, output_file_name = None,
                 GB_goal=None, force_full_rank=False, force_low_rank=False, mixing=None, h2=None, do_plot=False, runner=None):
    """
    Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs. The second kernel is a similarity matrix
    constructed of the top *k* SNPs where the SNPs are ordered via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction.
    All work is done via 'leave_out_one_chrom'; that is, one chromosome is tested and the kernels are constructed from the other chromosomes.
    Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param k_list: Values of *k* (in addition to 0) to test. Defaults to [1,2,4,8,...,8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Defaults to 10.
    :type n_folds: number
    
    :param seed: (optional) Random seed used when assigning IIDs to cross-validation folds.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param mixing: A parameter to LMM learning that tells how to combine the two kernels, optional.
            If not given, the best value will be searched for.
    :type mixing: number

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional.
            If not given, the best value will be searched for.
    :type h2: number

    :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k.
    :type do_plot: boolean


    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_all_plus_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util.runner import LocalMultiProc
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed")[:,::100] #To make example faster, run on only 1/100th of the data
    >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5
    >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5)) #Run multiproc
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_9800 0.0793397 4

    """

    #=================================================
    # Start of definition of inner functions
    #=================================================
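    # Two phases: (1) _best_snps_for_each_chrom uses per-chromosome cross-validation to
    # choose how many top SNPs (k) go into the second kernel; (2) _gwas_2k_via_loo_chrom
    # then runs single_snp with the all-SNP kernel (K0) plus the selected top-k kernel (K1).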
    def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal):
        #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1)))

        max_k = int(max(k_list))
        assert np.array_equal(G.iid,pheno.iid) and np.array_equal(G.iid,covar.iid), "real assert"

        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
    
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
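                # (standardization statistics are estimated from the training rows only and
                # then applied to the whole matrix, so the held-out fold does not leak in)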
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means we can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True
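                # With end_with_all=True, _kfold yields the n_folds train/test splits plus a
                # final "all data" pass (i_fold == n_folds) that is used only to rank SNPs;
                # the real folds below score each candidate k by out-of-sample nLL.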

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2=h2) #iid intersection means we can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find the best number of top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result


            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                    _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                    mapper=mapper_gather_lots,
                    reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL

        chrom_index_to_best_sid = map_reduce(
                chrom_list,
                nested=mapper_find_best_given_chrom,
                input_files=input_files,
                name="best snps for each chrom",
                runner=runner)
        return chrom_index_to_best_sid


    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]
    
            K1 = G_for_chrom[:,G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1, pheno=pheno,
                        covar=covar, leave_out_one_chrom=False, 
                        GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank,mixing=mixing,h2=h2)
            return result
    
        def reducer_closure(frame_sequence): #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
    
            return frame
    
    
        frame = map_reduce(
            chrom_list,
            mapper=mapper_single_snp_2K_given_chrom,
            reducer=reducer_closure,
            input_files=input_files,
            name="single_snp with two K's for all chroms",
            runner=runner
            )
        return frame

    #=================================================
    # End of definition of inner functions
    #=================================================

    #!!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")
    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)
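        # i.e. the default k_list is [1., 2., 4., ..., 8192.] (powers of 2 up to 2**13)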

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    G = _snps_fixup(G or test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
    G, test_snps, pheno, covar  = pstutil.intersect_apply([G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]

    chrom_index_to_best_sid = _best_snps_for_each_chrom(chrom_list, common_input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal)

    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal)

    return frame
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Defaults to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean
    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional.
            If not given, the best value will be searched for.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
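            (For example, log_delta=0 corresponds to h2=0.5.)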
    :type log_delta: number

    :param cache_file: Name of file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'.
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with.
            Allows for interaction testing (interact_with_snp x snp will be tested).
            Default: None
    :type interact_with_snp: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit())
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit())
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
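        # IntRangeSet equality with the tuple (0, sid_count) checks that every sid_index
        # from 0 through sid_count-1 is present, i.e. no SNP rows were dropped.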
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data

        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)
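            # _K_per_chrom builds each kernel from every chromosome except `chrom`,
            # the leave-out-one-chrom step that guards against proximal contamination.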

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=None,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
def single_snp_linreg(test_snps, pheno, covar=None, max_output_len=None, output_file_name=None, GB_goal=None, runner=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string


    :param max_output_len: Maximum number of Pvalues to return. Defaults to None, which means 'return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_linreg(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    test_snps, pheno, covar  = pstutil.intersect_apply([test_snps, pheno, covar])
    logging.debug("# of iids now {0}".format(test_snps.iid_count))

    _, _, block_size = _set_block_size(test_snps, None, 0, GB_goal, force_full_rank=False, force_low_rank=False)

    #!!!what about missing data in covar, in test_snps, in y
    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory
    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values

    def mapper(start):
        snp_index = np.arange(start,min(start+block_size,test_snps.sid_count))
        x = test_snps[:,start:start+block_size].read().standardize().val
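        # per-SNP p-values from covariate-adjusted linear regression on this block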
        _,pval_in = lin_reg.f_regression_cov_alt(x,y,covar)
        pval_in = pval_in.reshape(-1)

        if max_output_len is None:
            return pval_in,snp_index
        else: #We only need to return the top max_output_len results
            sort_index = np.argsort(pval_in)[:max_output_len]
            return pval_in[sort_index],snp_index[sort_index]

    def reducer(pval_and_snp_index_sequence):
        pval_list = []
        snp_index_list = []
        for pval, snp_index in pval_and_snp_index_sequence:
            pval_list.append(pval)
            snp_index_list.append(snp_index)
        pval = np.concatenate(pval_list)
        snp_index = np.concatenate(snp_index_list)
        sort_index = np.argsort(pval)
        if max_output_len is not None:
            sort_index = sort_index[:max_output_len]
        index = snp_index[sort_index]

        dataframe = pd.DataFrame(
            index=np.arange(len(index)),
            columns=('sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue')
            )
        #!!Is this the only way to set types in a dataframe?
        dataframe['sid_index'] = dataframe['sid_index'].astype(np.float)
        dataframe['Chr'] = dataframe['Chr'].astype(np.float)
        dataframe['GenDist'] = dataframe['GenDist'].astype(np.float)
        dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float)
        dataframe['PValue'] = dataframe['PValue'].astype(np.float)

        dataframe['sid_index'] = index
        dataframe['SNP'] = test_snps.sid[index]
        dataframe['Chr'] = test_snps.pos[index,0]
        dataframe['GenDist'] = test_snps.pos[index,1]
        dataframe['ChrPos'] = test_snps.pos[index,2]
        dataframe['PValue'] = pval[sort_index]

        if output_file_name is not None:
            dataframe.to_csv(output_file_name, sep="\t", index=False)

        return dataframe

    dataframe = map_reduce(xrange(0,test_snps.sid_count,block_size),
                           mapper=mapper,
                           reducer=reducer,
                           input_files=[test_snps,pheno,covar],
                           output_files=[output_file_name],
                           name = "single_snp_linreg",
                           runner=runner)
    return dataframe
def single_snp(test_snps,
               pheno,
               K0=None,
               K1=None,
               mixing=None,
               covar=None,
               covar_by_chrom=None,
               leave_out_one_chrom=True,
               output_file_name=None,
               h2=None,
               log_delta=None,
               cache_file=None,
               GB_goal=None,
               interact_with_snp=None,
               force_full_rank=False,
               force_low_rank=False,
               G0=None,
               G1=None,
               runner=None,
               count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formatted file name.)
    :type K1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Defaults to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean
    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional.
            If not given, the best value will be searched for.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'.
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with.
            Allows for interaction testing (interact_with_snp x snp will be tested).
            Default: None
    :type interact_with_snp: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps,
                           iid_if_none=test_snps.iid,
                           standardizer=Unit(),
                           count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1,
                           iid_if_none=test_snps.iid,
                           standardizer=Unit(),
                           count_A1=count_A1)
        K0, K1, test_snps, pheno, covar = pstutil.intersect_apply(
            [K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal,
                                             force_full_rank, force_low_rank)

        frame = _internal_single(K0=K0,
                                 test_snps=test_snps,
                                 pheno=pheno,
                                 covar=covar,
                                 K1=K1,
                                 mixing=mixing,
                                 h2=h2,
                                 log_delta=log_delta,
                                 cache_file=cache_file,
                                 force_full_rank=force_full_rank,
                                 force_low_rank=force_low_rank,
                                 output_file_name=output_file_name,
                                 block_size=block_size,
                                 interact_with_snp=interact_with_snp,
                                 runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (
            0,
            test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(
            set(test_snps.pos[:, 0])
        )  # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(
            chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + (
            [] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(
                chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom,
                                    test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom = pstutil.intersect_apply(
                [K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(
                K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank,
                force_low_rank)

            distributable = _internal_single(
                K0=K0_chrom,
                test_snps=test_snps_chrom,
                pheno=pheno_chrom,
                covar=covar_chrom,
                K1=K1_chrom,
                mixing=mixing,
                h2=h2,
                log_delta=log_delta,
                cache_file=cache_file_chrom,
                force_full_rank=force_full_rank,
                force_low_rank=force_low_rank,
                output_file_name=None,
                block_size=block_size,
                interact_with_snp=interact_with_snp,
                runner=Local())

            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time() - t0))

            return frame

        frame = map_reduce(
            chrom_list,
            mapper=nested_closure,
            reducer=reducer_closure,
            input_files=input_files,
            output_files=[output_file_name],
            name="single_snp (leave_out_one_chrom), out='{0}'".format(
                output_file_name),
            runner=runner)

    return frame
# The statement that produced results_df is truncated here; only its final
# argument (the output path) and the closing parenthesis survive:
#     '../Outputs/Fast-Lmm-Outputs/Tergite-5-Online-Adjusted-All-Variants-All-Lines-with-Leave-Out-False'
# )

#results_df = single_snp(variants_maf5_head,  phenotypes_online, cache_file='I-Want-Cache',
#                             leave_out_one_chrom=False, output_file_name='Test-Fast-Lmm')

import pylab
import fastlmm.util.util as flutil
flutil.manhattan_plot(results_df.as_matrix(["Chr", "ChrPos", "PValue"]),
                      pvalue_line=1e-5,
                      xaxis_unit_bp=False)
pylab.show()

from fastlmm.util.stats import plotp
plotp.qqplot(results_df["PValue"].values, xlim=[0, 5], ylim=[0, 5])

# print head of results data frame
import pandas as pd
pd.set_option('display.width', 1000)
results_df.head(n=10)

results_df.to_csv(
    '../Outputs/Fast-Lmm-Outputs/Quality-Control-Tergite-Online-Adjusted-Phenotype-Leave-False.txt',
    sep='\t')

from pysnptools.standardizer import Unit
from fastlmm.inference.fastlmm_predictor import _snps_fixup, _pheno_fixup, _kernel_fixup, _SnpTrainTest
test_snps = _snps_fixup(test_snps)
K_causal = test_snps.read_kernel(Unit()).standardize()
pylab.imshow(K_causal.val, cmap=pylab.gray(), vmin=0, vmax=1)
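pylab.show()  # display the heatmap of the causal kernel (an IID x IID similarity matrix)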
def single_snp_all_plus_select(test_snps, pheno, G=None, covar=None,
                 k_list = None,
                 n_folds=10, #1 is special and means test on train
                 seed = 0, output_file_name = None,
                 GB_goal=None, force_full_rank=False, force_low_rank=False, mixing=None, h2=None, do_plot=False, runner=None, count_A1=None):
    """
    Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs. The second kernel is a similarity matrix
    constructed of the top *k* SNPs where the SNPs are ordered via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction.
    All work is done via 'leave_out_one_chrom'; that is, one chromosome is tested and the kernels are constructed from the other chromosomes.
    Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param k_list: Values of *k* (in addition to 0) to test. Defaults to [1,2,4,8,...,8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Defaults to 10.
    :type n_folds: number
    
    :param seed: (optional) Random seed used when assigning IIDs to cross-validation folds.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param mixing: A parameter to LMM learning that tells how to combine the two kernels, optional.
            If not given, the best value will be searched for.
    :type mixing: number

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional.
            If not given, the best value will be searched for.
    :type h2: number

    :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k.
    :type do_plot: boolean


    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_all_plus_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util.runner import LocalMultiProc
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed",count_A1=False)[:,::100] #To make example faster, run on only 1/100th of the data
    >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5
    >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5), count_A1=False) #Run multiproc
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_9800 0.0793385 4

    """

    #=================================================
    # Start of definition of inner functions
    #=================================================
    def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal):
        #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1)))

        max_k = int(max(k_list))
        assert np.array_equal(G.iid,pheno.iid) and np.array_equal(G.iid,covar.iid), "real assert"

        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
    
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means we can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2, count_A1=count_A1)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2raw=h2) #iid intersection means we can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find the best number of top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result


            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                    _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                    mapper=mapper_gather_lots,
                    reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL

        chrom_index_to_best_sid = map_reduce(
                chrom_list,
                nested=mapper_find_best_given_chrom,
                input_files=input_files,
                name="best snps for each chrom",
                runner=runner)
        return chrom_index_to_best_sid


    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]
    
            K1 = G_for_chrom[:,G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1, pheno=pheno,
                        covar=covar, leave_out_one_chrom=False, 
                        GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank,mixing=mixing,h2=h2,count_A1=count_A1)
            return result
    
        def reducer_closure(frame_sequence): #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
    
            return frame
    
    
        frame = map_reduce(
            chrom_list,
            mapper=mapper_single_snp_2K_given_chrom,
            reducer=reducer_closure,
            input_files=input_files,
            name="single_snp with two K's for all chroms",
            runner=runner
            )
        return frame

    #=================================================
    # End of definition of inner functions
    #=================================================

    #!!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")
    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)
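        #np.logspace(0, 13, num=14, base=2) evaluates to [1, 2, 4, ..., 8192],
        #the powers of two documented as the default k_list.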

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps,count_A1=count_A1)
    G = _snps_fixup(G or test_snps,count_A1=count_A1)
    pheno = _pheno_fixup(pheno,count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid,count_A1=count_A1)
    chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
    G, test_snps, pheno, covar  = pstutil.intersect_apply([G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]
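
    #Stage 1: for each chromosome, select the best set of top SNPs (cross-validated over k_list).
    #Stage 2: rerun the single-SNP test per chromosome with two kernels (all SNPs + the selected top SNPs).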

    chrom_index_to_best_sid = _best_snps_for_each_chrom(chrom_list, common_input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal)

    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal)

    return frame
def single_snp_linreg(test_snps,
                      pheno,
                      covar=None,
                      max_output_len=None,
                      output_file_name=None,
                      GB_goal=None,
                      runner=None,
                      count_A1=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, all of test_snps is read in a single block,
        which is fast but may not fit in memory for large datasets.
    :type GB_goal: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_linreg(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    test_snps, pheno, covar = pstutil.intersect_apply(
        [test_snps, pheno, covar])
    logging.debug("# of iids now {0}".format(test_snps.iid_count))

    if GB_goal is not None:
        bytes_per_sid = test_snps.iid_count * 8
        sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
        block_size = max(1, int(sid_per_GB_goal + .5))
        block_count = test_snps.sid_count / block_size
    else:
        block_count = 1
        block_size = test_snps.sid_count
    logging.debug("block_count={0}, block_size={1}".format(
        block_count, block_size))
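    #Illustrative arithmetic (hypothetical numbers, not from the source): with 10,000 iids and GB_goal=1,
    #bytes_per_sid = 80,000, so block_size = int(1024**3 / 80000 + 0.5) = 13,422 SNPs read per block.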

    #!!!what about missing data in covar, in test_snps, in y
    covar = np.c_[covar.read(view_ok=True, order='A').val,
                  np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory; the column of ones adds an intercept term
    y = pheno.read(
        view_ok=True, order='A'
    ).val  #view_ok because this code already did a fresh read to look for any missing values

    def mapper(start):
        logging.info(
            "single_snp_linreg reading start={0},block_size={1}".format(
                start, block_size))
        snp_index = np.arange(start,
                              min(start + block_size, test_snps.sid_count))
        x = test_snps[:, start:start + block_size].read().standardize().val
        logging.info("single_snp_linereg linreg")
        _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar)
        logging.info("single_snp_linereg done")
        pval_in = pval_in.reshape(-1)

        if max_output_len is None:
            return pval_in, snp_index
        else:  #We only need to return the top max_output_len results
            sort_index = np.argsort(pval_in)[:max_output_len]
            return pval_in[sort_index], snp_index[sort_index]

    def reducer(pval_and_snp_index_sequence):
        pval_list = []
        snp_index_list = []
        for pval, snp_index in pval_and_snp_index_sequence:
            pval_list.append(pval)
            snp_index_list.append(snp_index)
        pval = np.concatenate(pval_list)
        snp_index = np.concatenate(snp_index_list)
        sort_index = np.argsort(pval)
        if max_output_len is not None:
            sort_index = sort_index[:max_output_len]
        index = snp_index[sort_index]

        dataframe = pd.DataFrame(index=np.arange(len(index)),
                                 columns=('sid_index', 'SNP', 'Chr', 'GenDist',
                                          'ChrPos', 'PValue'))
        #!!Is this the only way to set types in a dataframe?
        dataframe['sid_index'] = dataframe['sid_index'].astype(np.float)
        dataframe['Chr'] = dataframe['Chr'].astype(np.float)
        dataframe['GenDist'] = dataframe['GenDist'].astype(np.float)
        dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float)
        dataframe['PValue'] = dataframe['PValue'].astype(np.float)

        dataframe['sid_index'] = index
        dataframe['SNP'] = test_snps.sid[index]
        dataframe['Chr'] = test_snps.pos[index, 0]
        dataframe['GenDist'] = test_snps.pos[index, 1]
        dataframe['ChrPos'] = test_snps.pos[index, 2]
        dataframe['PValue'] = pval[sort_index]

        if output_file_name is not None:
            dataframe.to_csv(output_file_name, sep="\t", index=False)

        return dataframe

    dataframe = map_reduce(xrange(0, test_snps.sid_count, block_size),
                           mapper=mapper,
                           reducer=reducer,
                           input_files=[test_snps, pheno, covar],
                           output_files=[output_file_name],
                           name="single_snp_linreg",
                           runner=runner)
    return dataframe