コード例 #1
0
    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G,
                               chrom_index_to_best_sid, pheno, covar,
                               force_full_rank, force_low_rank, mixing, h2,
                               output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(
            len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]

            K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom,
                                K0=G_for_chrom,
                                K1=K1,
                                pheno=pheno,
                                covar=covar,
                                leave_out_one_chrom=False,
                                GB_goal=GB_goal,
                                force_full_rank=force_full_rank,
                                force_low_rank=force_low_rank,
                                mixing=mixing,
                                h2=h2)
            return result

        def reducer_closure(
                frame_sequence):  #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))

            return frame

        frame = map_reduce(chrom_list,
                           mapper=mapper_single_snp_2K_given_chrom,
                           reducer=reducer_closure,
                           input_files=input_files,
                           name="single_snp with two K's for all chroms",
                           runner=runner)
        return frame
コード例 #2
0
def prime_search1(start,stop,runner):

    def mapper(i):
        if is_prime(i):
            #assert i != 5, "I just don't like fives"
            return i
        else:
            return None

    def reducer(sequence):
        result = []
        for i in sequence:
            if i is not None:
                result.append(i)
        return result

    return map_reduce(xrange(start,stop),
                        mapper=mapper,
                        reducer=reducer, #lambda sequence: [i for i in sequence if i is not None], #Filter out the None's
                        runner=runner)
コード例 #3
0
ファイル: examples.py プロジェクト: quanrd/GWAS_Pipeline
def prime_search1(start, stop, runner):
    def mapper(i):
        if is_prime(i):
            #assert i != 5, "I just don't like fives"
            return i
        else:
            return None

    def reducer(sequence):
        result = []
        for i in sequence:
            if i is not None:
                result.append(i)
        return result

    return map_reduce(
        xrange(start, stop),
        mapper=mapper,
        reducer=
        reducer,  #lambda sequence: [i for i in sequence if i is not None], #Filter out the None's
        runner=runner)
コード例 #4
0
    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]
    
            K1 = G_for_chrom[:,G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1, pheno=pheno,
                        covar=covar, leave_out_one_chrom=False, 
                        GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank,mixing=mixing,h2=h2)
            return result
    
        def reducer_closure(frame_sequence): #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
    
            return frame
    
    
        frame = map_reduce(
            chrom_list,
            mapper=mapper_single_snp_2K_given_chrom,
            reducer=reducer_closure,
            input_files=input_files,
            name="single_snp with two K's for all chroms",
            runner=runner
            )
        return frame
コード例 #5
0
        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
    
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means when can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2=h2) #iid intersection means when can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means when can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find best # top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result


            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                    _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                    mapper=mapper_gather_lots,
                    reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL
コード例 #6
0
ファイル: single_snp.py プロジェクト: 42binwang/FaST-LMM
def _internal_single(K0, test_snps, pheno, covar, K1,
                 mixing, h2, log_delta,
                 cache_file, force_full_rank, force_low_rank,
                 output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocation new memory

    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values 

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -=interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size) #Find the work count based on batch size (rounding up)

    # We define three closures, that is, functions define inside function so that the inner function has access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1: logging.info("single_snp: Working on part {0} of {1}".format(work_index,work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        
        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats,1,lmm.U.shape[0]-3)[:,0]#note that G.shape is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start,end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2] 
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(xrange(work_count),
                       mapper=mapper_closure,reducer=reducer_closure,
                       input_files=[test_snps],output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
コード例 #7
0
ファイル: single_snp.py プロジェクト: 42binwang/FaST-LMM
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formated file name.)
    :type K0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formated file name.)
    :type K1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean
    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with. 
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit())
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit())
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data

        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=None,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
コード例 #8
0
def single_snp_linreg(test_snps, pheno, covar=None, max_output_len=None, output_file_name=None, GB_goal=None, runner=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_linreg(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    test_snps, pheno, covar  = pstutil.intersect_apply([test_snps, pheno, covar])
    logging.debug("# of iids now {0}".format(test_snps.iid_count))

    _, _, block_size = _set_block_size(test_snps, None, 0, GB_goal, force_full_rank=False, force_low_rank=False)

    #!!!what about missing data in covar, in test_snps, in y
    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocation new memory
    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values

    def mapper(start):
        snp_index = np.arange(start,min(start+block_size,test_snps.sid_count))
        x = test_snps[:,start:start+block_size].read().standardize().val
        _,pval_in = lin_reg.f_regression_cov_alt(x,y,covar)
        pval_in = pval_in.reshape(-1)

        if max_output_len is None:
            return pval_in,snp_index
        else: #We only need to return the top max_output_len results
            sort_index = np.argsort(pval_in)[:max_output_len]
            return pval_in[sort_index],snp_index[sort_index]

    def reducer(pval_and_snp_index_sequence):
        pval_list = []
        snp_index_list = []
        for pval, snp_index in pval_and_snp_index_sequence:
            pval_list.append(pval)
            snp_index_list.append(snp_index)
        pval = np.concatenate(pval_list)
        snp_index = np.concatenate(snp_index_list)
        sort_index = np.argsort(pval)
        if max_output_len is not None:
            sort_index = sort_index[:max_output_len]
        index = snp_index[sort_index]

        dataframe = pd.DataFrame(
            index=np.arange(len(index)),
            columns=('sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue')
            )
        #!!Is this the only way to set types in a dataframe?
        dataframe['sid_index'] = dataframe['sid_index'].astype(np.float)
        dataframe['Chr'] = dataframe['Chr'].astype(np.float)
        dataframe['GenDist'] = dataframe['GenDist'].astype(np.float)
        dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float)
        dataframe['PValue'] = dataframe['PValue'].astype(np.float)

        dataframe['sid_index'] = index
        dataframe['SNP'] = test_snps.sid[index]
        dataframe['Chr'] = test_snps.pos[index,0]
        dataframe['GenDist'] = test_snps.pos[index,1]
        dataframe['ChrPos'] = test_snps.pos[index,2]
        dataframe['PValue'] = pval[sort_index]

        if output_file_name is not None:
            dataframe.to_csv(output_file_name, sep="\t", index=False)

        return dataframe

    dataframe = map_reduce(xrange(0,test_snps.sid_count,block_size),
                           mapper=mapper,
                           reducer=reducer,
                           input_files=[test_snps,pheno,covar],
                           output_files=[output_file_name],
                           name = "single_snp_linreg",
                           runner=runner)
    return dataframe
コード例 #9
0
def single_snp_select(test_snps, pheno, G=None, covar=None,
                 k_list = None,
                 n_folds=10, #1 is special and means test on train
                 just_return_selected_snps=False,
                 seed = 0, output_file_name = None,
                 GB_goal=None, force_full_rank=False, force_low_rank=False, h2=None, runner=None, count_A1=None):
    """
    Function performing single SNP GWAS based on covariates (often PCs) and a similarity matrix constructed of the top *k* SNPs where
    SNPs are ordered via the PValue from :meth:`.single_snp_linreg` and *k* is determined via out-of-sample prediction. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param k_list: Values of *k* (in addition to 0) to test. Default to [1,2,4,8,...8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Default to 10.
    :type n_folds: number
    
    :param just_return_selected_snps: Instead of returning the results of GWAS, return the top *k* SNPs selected.
    :type just_return_selected_snps: list of strings

    :param seed: (optional) Random seed used to generate permutations for lrt G0 fitting.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
            If not given will search for best value.
    :type h2: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import compute_auto_pcs
    >>> bed_fn = "../../tests/datasets/synth/all.bed"
    >>> phen_fn = "../../tests/datasets/synth/pheno_10_causals.txt"
    >>> covar = compute_auto_pcs(bed_fn,count_A1=False)
    >>> results_dataframe = single_snp_select(test_snps=bed_fn, G=bed_fn, pheno=phen_fn, covar=covar, GB_goal=2, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    snp495_m0_.01m1_.04 0.0 5000

    """

    #!!!code similar to single_snp and feature_selection
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"

    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)

    test_snps, G, pheno, covar = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
    common_input_files = [test_snps, G, pheno, covar]

    k_list_in = [0] + [int(k) for k in k_list if 0 < k and k <= G.sid_count]

    def top_snps_for_each_fold_nested(kfold_item):
        fold_index, (train_idx, test_idx) = kfold_item
        _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar,count_A1=count_A1)
        nested = single_snp_linreg(G_in[train_idx,:],pheno_in[train_idx,:],covar_in[train_idx,:],GB_goal=GB_goal,max_output_len=max(k_list_in),count_A1=count_A1)
        return nested
    def top_snps_for_each_fold_reducer(dataframe_list):
        result = [list(dataframe['SNP']) for dataframe in dataframe_list]
        return result


    #Find top snps for each fold        
    fold_index_to_top_snps = map_reduce(_kfold(G.iid_count, n_folds, seed, end_with_all=True,iid_to_index=G.iid_to_index),
                                         nested=top_snps_for_each_fold_nested,
                                         reducer=top_snps_for_each_fold_reducer,
                                         name = "top_snps_for_each_fold",
                                         input_files = common_input_files,
                                         runner=runner)

    #=================================================
    # Start of definition of inner functions
    #=================================================
    def k_index_to_nLL_mapper(k):
        _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar,count_A1=count_A1)
        nll_sum=0
        mse_sum = 0
        n_folds_in = 0
        for fold_index, (train_idx, test_idx) in _kfold(G.iid_count, n_folds, seed, end_with_all=False,iid_to_index=G.iid_to_index):
            assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
            top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
            sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
            G_train = G_in[train_idx,sid_idx_in_fold] if k > 0 else None
            fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
            fastlmm.fit(K0_train=G_train, X=covar_in[train_idx,:], y=pheno_in[train_idx,:], h2raw=h2) #iid intersection means when can give the whole covariate and pheno
            G_test = G_in[test_idx,sid_idx_in_fold] if k > 0 else KernelIdentity(G_in.iid,G_in.iid[test_idx]) #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
            nll,mse = fastlmm.score(K0_whole_test=G_test,X=covar_in[test_idx,:],y=pheno_in[test_idx,:],return_mse_too=True) #iid intersection means when can give the whole covariate and pheno
            nll_sum += nll
            mse_sum += mse
            n_folds_in += 1
        logging.info("k={0},nLL={1},average mse={2}".format(k,nll_sum,mse_sum / n_folds_in))
        return nll_sum
    #=================================================
    # End of definition of inner functions
    #=================================================


    #find best # of top SNPs
    k_index_to_nLL = map_reduce(k_list_in,
                                mapper = k_index_to_nLL_mapper,
                                input_files = common_input_files,
                                name = "k_index_to_nLL",
                                runner=runner)
    best_k = k_list_in[np.argmin(k_index_to_nLL)]

    top_snps = fold_index_to_top_snps[-1][:best_k]
    if just_return_selected_snps:
        return top_snps

    sid_idx = G.sid_to_index(top_snps)
    G_top = G[:,sid_idx]

    # Run GWAS with leave-one-chrom out
    single_snp_result = single_snp(test_snps=test_snps, K0=G_top, pheno=pheno,
                                covar=covar, leave_out_one_chrom=True,
                                GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, h2=h2,
                                output_file_name=output_file_name,runner=runner,count_A1=count_A1)

    return single_snp_result
コード例 #10
0
def _internal_single(K0, test_snps, pheno, covar, K1, mixing, h2, log_delta,
                     cache_file, force_full_rank, force_low_rank,
                     output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.c_[covar.read(view_ok=True, order='A').val,
                  np.ones(
                      (test_snps.iid_count,
                       1))]  #view_ok because np.c_ will allocation new memory

    y = pheno.read(
        view_ok=True, order='A'
    ).val  #view_ok because this code already did a fresh read to look for any missing values

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(
            K0,
            K1,
            covar,
            y,
            mixing,
            h2,
            force_full_rank=force_full_rank,
            force_low_rank=force_low_rank,
            kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar,
                          Y=y,
                          K=None,
                          G=K.snpreader.val,
                          inplace=True)
        else:
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(
                cache_file, lmm.U, lmm.S, np.array([h2, mixing])
            )  #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[
            1] - 1, "interact_with_snp is out of range"
        interact = covar[:, interact_with_snp].copy()
        interact -= interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size
                   )  #Find the work count based on batch size (rounding up)

    # We define three closures, that is, functions define inside function so that the inner function has access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1:
            logging.info("single_snp: Working on snp block {0} of {1}".format(
                work_index, work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index + 1)

        snps_read = test_snps[:, start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:, np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2,
                          dof=None,
                          scale=1.0,
                          penalty=0.0,
                          snps=variables_to_test)

        beta = res['beta']

        chi2stats = beta * beta / res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(
            chi2stats, 1, lmm.U.shape[0] -
            (lmm.linreg.D +
             1))[:, 0]  #note that G.shape is the number of individuals#

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start, end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:, 0]
        dataframe['GenDist'] = snps_read.pos[:, 1]
        dataframe['ChrPos'] = snps_read.pos[:, 2]
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:, 0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:, 0])
        dataframe['SnpFractVarExpl'] = np.sqrt(
            res['fraction_variance_explained_beta'][:, 0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time() - do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(
        xrange(work_count),
        mapper=mapper_closure,
        reducer=reducer_closure,
        input_files=[test_snps],
        output_files=[output_file_name],
        name="single_snp(output_file={0})".format(output_file_name),
        runner=runner)
    return frame
コード例 #11
0
def single_snp(test_snps,
               pheno,
               K0=None,
               K1=None,
               mixing=None,
               covar=None,
               covar_by_chrom=None,
               leave_out_one_chrom=True,
               output_file_name=None,
               h2=None,
               log_delta=None,
               cache_file=None,
               GB_goal=None,
               interact_with_snp=None,
               force_full_rank=False,
               force_low_rank=False,
               G0=None,
               G1=None,
               runner=None,
               count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formated file name.)
    :type K0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a :class:`.KernelReader` or a :class:`.KernelNpz`-formated file name.)
    :type K1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean
    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-deleted text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with. 
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: :class:`.SnpReader` or a string (or :class:`.KernelReader`)

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps,
                           iid_if_none=test_snps.iid,
                           standardizer=Unit(),
                           count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1,
                           iid_if_none=test_snps.iid,
                           standardizer=Unit(),
                           count_A1=count_A1)
        K0, K1, test_snps, pheno, covar = pstutil.intersect_apply(
            [K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal,
                                             force_full_rank, force_low_rank)

        frame = _internal_single(K0=K0,
                                 test_snps=test_snps,
                                 pheno=pheno,
                                 covar=covar,
                                 K1=K1,
                                 mixing=mixing,
                                 h2=h2,
                                 log_delta=log_delta,
                                 cache_file=cache_file,
                                 force_full_rank=force_full_rank,
                                 force_low_rank=force_low_rank,
                                 output_file_name=output_file_name,
                                 block_size=block_size,
                                 interact_with_snp=interact_with_snp,
                                 runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (
            0,
            test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(
            set(test_snps.pos[:, 0])
        )  # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(
            chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + (
            [] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(
                chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom,
                                    test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom = pstutil.intersect_apply(
                [K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(
                K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank,
                force_low_rank)

            distributable = _internal_single(
                K0=K0_chrom,
                test_snps=test_snps_chrom,
                pheno=pheno_chrom,
                covar=covar_chrom,
                K1=K1_chrom,
                mixing=mixing,
                h2=h2,
                log_delta=log_delta,
                cache_file=cache_file_chrom,
                force_full_rank=force_full_rank,
                force_low_rank=force_low_rank,
                output_file_name=None,
                block_size=block_size,
                interact_with_snp=interact_with_snp,
                runner=Local())

            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time() - t0))

            return frame

        frame = map_reduce(
            chrom_list,
            mapper=nested_closure,
            reducer=reducer_closure,
            input_files=input_files,
            output_files=[output_file_name],
            name="single_snp (leave_out_one_chrom), out='{0}'".format(
                output_file_name),
            runner=runner)

    return frame
コード例 #12
0
        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
    
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx,:]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert"
                K_train = K_whole_unittrain[train_idx]
                    
                single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means when can give the whole covariate and pheno
                             covar=covar, leave_out_one_chrom=False,
                             GB_goal=GB_goal,  force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in =  [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k))

                        top_k_train = top_k[train_idx,:] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,mixing=mixing,h2raw=h2) #iid intersection means when can give the whole covariate and pheno
    
                        top_k_test = top_k[test_idx,:] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:,test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means when can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None
    
                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find best # top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result


            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                    _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                    mapper=mapper_gather_lots,
                    reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL
コード例 #13
0
def single_snp_linreg(test_snps,
                      pheno,
                      covar=None,
                      max_output_len=None,
                      output_file_name=None,
                      GB_goal=None,
                      runner=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a :class:`.SnpReader` or a string

    :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a :class:`.SnpReader` or a string

    :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a :class:`.SnpReader` or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-deleted text.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: a runner.

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp_linreg(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
    test_snps, pheno, covar = pstutil.intersect_apply(
        [test_snps, pheno, covar])
    logging.debug("# of iids now {0}".format(test_snps.iid_count))

    if GB_goal is not None:
        bytes_per_sid = test_snps.iid_count * 8
        sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
        block_size = max(1, int(sid_per_GB_goal + .5))
        block_count = test_snps.sid_count / block_size
    else:
        block_count = 1
        block_size = test_snps.sid_count
    logging.debug("block_count={0}, block_size={1}".format(
        block_count, block_size))

    #!!!what about missing data in covar, in test_snps, in y
    covar = np.c_[covar.read(view_ok=True, order='A').val,
                  np.ones(
                      (test_snps.iid_count,
                       1))]  #view_ok because np.c_ will allocation new memory
    y = pheno.read(
        view_ok=True, order='A'
    ).val  #view_ok because this code already did a fresh read to look for any missing values

    def mapper(start):
        logging.info(
            "single_snp_linereg reading start={0},block_size={1}".format(
                start, block_size))
        snp_index = np.arange(start,
                              min(start + block_size, test_snps.sid_count))
        x = test_snps[:, start:start + block_size].read().standardize().val
        logging.info("single_snp_linereg linreg")
        _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar)
        logging.info("single_snp_linereg done")
        pval_in = pval_in.reshape(-1)

        if max_output_len is None:
            return pval_in, snp_index
        else:  #We only need to return the top max_output_len results
            sort_index = np.argsort(pval_in)[:max_output_len]
            return pval_in[sort_index], snp_index[sort_index]

    def reducer(pval_and_snp_index_sequence):
        pval_list = []
        snp_index_list = []
        for pval, snp_index in pval_and_snp_index_sequence:
            pval_list.append(pval)
            snp_index_list.append(snp_index)
        pval = np.concatenate(pval_list)
        snp_index = np.concatenate(snp_index_list)
        sort_index = np.argsort(pval)
        if max_output_len is not None:
            sort_index = sort_index[:max_output_len]
        index = snp_index[sort_index]

        dataframe = pd.DataFrame(index=np.arange(len(index)),
                                 columns=('sid_index', 'SNP', 'Chr', 'GenDist',
                                          'ChrPos', 'PValue'))
        #!!Is this the only way to set types in a dataframe?
        dataframe['sid_index'] = dataframe['sid_index'].astype(np.float)
        dataframe['Chr'] = dataframe['Chr'].astype(np.float)
        dataframe['GenDist'] = dataframe['GenDist'].astype(np.float)
        dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float)
        dataframe['PValue'] = dataframe['PValue'].astype(np.float)

        dataframe['sid_index'] = index
        dataframe['SNP'] = test_snps.sid[index]
        dataframe['Chr'] = test_snps.pos[index, 0]
        dataframe['GenDist'] = test_snps.pos[index, 1]
        dataframe['ChrPos'] = test_snps.pos[index, 2]
        dataframe['PValue'] = pval[sort_index]

        if output_file_name is not None:
            dataframe.to_csv(output_file_name, sep="\t", index=False)

        return dataframe

    dataframe = map_reduce(xrange(0, test_snps.sid_count, block_size),
                           mapper=mapper,
                           reducer=reducer,
                           input_files=[test_snps, pheno, covar],
                           output_files=[output_file_name],
                           name="single_snp_linreg",
                           runner=runner)
    return dataframe