Code Example #1
File: test.py Project: hyacz/PySnpTools
    def test_some_std(self):
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,
                           sid=self.snpdata.sid,
                           pos=self.snpdata.pos,
                           val=np.array(self.snpdata.val))
        s = str(snpdata2)
        snpdata2.standardize()
        s = str(snpdata2)

        snpreader = Bed(self.currentFolder + "/examples/toydata",
                        count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        for dtype in [sp.float64, sp.float32]:
            for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
                x2 = x[:, ::2]
                x2b = np.array(x2)
                #LATER what's this about? It doesn't do non-contiguous?
                #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #a,b = std.standardize(x2b),std.standardize(x2)
                #np.testing.assert_array_almost_equal(a,b)
        logging.info("done")
Code Example #2
    def test_respect_read_inputs(self):
        from pysnptools.kernelreader import KernelHdf5, Identity, KernelNpz, SnpKernel
        from pysnptools.standardizer import Unit
        from pysnptools.standardizer import Identity as StdIdentity
        from pysnptools.snpreader import Bed

        previous_wd = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        iidref = KernelNpz('../examples/toydata.kernel.npz').iid

        for kernelreader in [
                SnpKernel(Bed('../examples/toydata.5chrom.bed', count_A1=True),
                          StdIdentity())[::2, ::2],
                Bed('../examples/toydata.5chrom.bed',
                    count_A1=True)[::2, ::2].read_kernel(StdIdentity()),
                KernelHdf5('../examples/toydata.kernel.hdf5'),
                Identity(iidref, test=[('0', 'x'), ('0', 'y')]),
                Identity(iidref),
                KernelNpz('../examples/toydata.kernel.npz'),
                KernelNpz('../examples/toydata.kernel.npz').read(),
                KernelNpz('../examples/toydata.kernel.npz')[::2, ::2],
                Bed('../examples/toydata.5chrom.bed',
                    count_A1=True).read_kernel(Unit()),
                SnpKernel(Bed('../examples/toydata.5chrom.bed', count_A1=True),
                          Unit())
        ]:
            logging.info(str(kernelreader))
            for order in ['F', 'C', 'A']:
                for dtype in [np.float32, np.float64]:
                    for force_python_only in [True, False]:
                        for view_ok in [True, False]:
                            val = kernelreader.read(
                                order=order,
                                dtype=dtype,
                                force_python_only=force_python_only,
                                view_ok=view_ok).val
                            has_right_order = order == "A" or (
                                order == "C" and val.flags["C_CONTIGUOUS"]
                            ) or (order == "F" and val.flags["F_CONTIGUOUS"])
                            if hasattr(kernelreader, 'val') and not view_ok:
                                assert kernelreader.val is not val
                            if (hasattr(kernelreader, 'val') and view_ok
                                    and kernelreader.val is not val and
                                (order == 'A' or
                                 (order == 'F' and
                                  kernelreader.val.flags['F_CONTIGUOUS']) or
                                 (order == 'C'
                                  and kernelreader.val.flags['C_CONTIGUOUS']))
                                    and (dtype is None
                                         or kernelreader.val.dtype == dtype)):
                                logging.info(
                                    "{0} could have read a view, but didn't".
                                    format(kernelreader))
                            assert val.dtype == dtype and has_right_order
        os.chdir(previous_wd)
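A minimal sketch of the order/dtype contract this test exercises, reusing the same example kernel file (the relative path assumes the working directory is the tests folder, as in the test above):

import numpy as np
from pysnptools.kernelreader import KernelNpz

kernel = KernelNpz('../examples/toydata.kernel.npz')
val = kernel.read(order='F', dtype=np.float32).val  # request Fortran order and float32
assert val.dtype == np.float32 and val.flags['F_CONTIGUOUS']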
Code Example #3
File: test.py Project: hyacz/PySnpTools
    def standardize(self, snpreader):
        """
        make sure blocked standardize yields the same result as regular standardize
        """

        for dtype in [sp.float64, sp.float32]:

            snps = snpreader.read(order='F',
                                  force_python_only=True,
                                  dtype=dtype).val
            self.assertEqual(dtype, snps.dtype)

            snp_s1 = Unit().standardize(snps.copy(), force_python_only=True)
            snp_s2 = Unit().standardize(snps.copy(),
                                        block_size=100,
                                        force_python_only=True)
            snps_F = np.array(snps, dtype=dtype, order="F")
            snp_s3 = Unit().standardize(snps_F)
            snps_C = np.array(snps, dtype=dtype, order="C")
            snp_s4 = Unit().standardize(snps_C)

            snp_beta1 = Beta(1, 25).standardize(snps.copy(),
                                                force_python_only=True)
            snps_F = np.array(snps, dtype=dtype, order="F")
            snp_beta2 = Beta(1, 25).standardize(snps_F)
            snps_C = np.array(snps, dtype=dtype, order="C")
            snp_beta3 = Beta(1, 25).standardize(snps_C)

            self.assertEqual(snp_s1.shape[0], snp_s2.shape[0])
            self.assertEqual(snp_s1.shape[1], snp_s2.shape[1])

            self.assertEqual(snp_s1.shape[0], snp_s3.shape[0])
            self.assertEqual(snp_s1.shape[1], snp_s3.shape[1])

            self.assertEqual(snp_s1.shape[0], snp_s4.shape[0])
            self.assertEqual(snp_s1.shape[1], snp_s4.shape[1])

            self.assertTrue(np.allclose(snp_s1, snp_s2, rtol=1e-05,
                                        atol=1e-05))
            self.assertTrue(np.allclose(snp_s1, snp_s3, rtol=1e-05,
                                        atol=1e-05))
            self.assertTrue(np.allclose(snp_s1, snp_s4, rtol=1e-05,
                                        atol=1e-05))

            self.assertEqual(snp_beta1.shape[0], snp_beta2.shape[0])
            self.assertEqual(snp_beta1.shape[1], snp_beta2.shape[1])
            self.assertEqual(snp_beta1.shape[0], snp_beta3.shape[0])
            self.assertEqual(snp_beta1.shape[1], snp_beta3.shape[1])

            self.assertTrue(
                np.allclose(snp_beta1, snp_beta2, rtol=1e-05, atol=1e-05))
            self.assertTrue(
                np.allclose(snp_beta1, snp_beta3, rtol=1e-05, atol=1e-05))
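A standalone sketch of the property this test checks: blocked and unblocked Unit standardization should agree to numerical precision (toy random matrix; only numpy and pysnptools are assumed):

import numpy as np
from pysnptools.standardizer import Unit

np.random.seed(0)
x = np.array(np.random.randint(3, size=[20, 50]), dtype=np.float64)
a = Unit().standardize(x.copy(), force_python_only=True)
b = Unit().standardize(x.copy(), block_size=10, force_python_only=True)
np.testing.assert_array_almost_equal(a, b, decimal=10)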
Code Example #4
File: fastlmm_predictor.py Project: fastlmm/FaST-LMM
 def __init__(self,
              GB_goal=None,
              force_full_rank=False,
              force_low_rank=False,
              snp_standardizer=Unit(),
              covariate_standardizer=Unit(),
              kernel_standardizer=DiagKtoN()):
     self.GB_goal = GB_goal
     self.force_full_rank = force_full_rank
     self.force_low_rank = force_low_rank
     self.snp_standardizer = snp_standardizer
     self.covariate_standardizer = covariate_standardizer
     self.kernel_standardizer = kernel_standardizer
     self.is_fitted = False
Code Example #5
File: snpdata.py Project: fastlmm/PySnpTools
    def standardize(self,
                    standardizer=Unit(),
                    block_size=None,
                    return_trained=False,
                    force_python_only=False,
                    num_threads=None):
        """Does in-place standardization of the in-memory
        SNP data. By default, it applies 'Unit' standardization, that is: the values for each SNP will have mean zero and standard deviation 1.0.
        NaN values are then filled with zero, the mean (consequently, if there are NaN values, the final standard deviation will not be exactly 1.0).
        Note that, for efficiency, this method works in-place, actually changing values in the ndarray. Although it works in place, for convenience
        it also returns the SnpData.

        :param standardizer: optional -- Specify standardization to be applied. 
             Any :class:`.Standardizer` may be used. Some choices include :class:`.Unit` (default, makes values for each SNP have mean zero and
             standard deviation 1.0) and :class:`.Beta`.
        :type standardizer: :class:`.Standardizer`

        :param block_size: optional -- Deprecated.
        :type block_size: None

        :param return_trained: If true, returns a second value containing a constant :class:`.Standardizer` trained on this data.
        :type return_trained: bool

        :param force_python_only: optional -- If true, will use pure Python instead of faster C++ libraries.
        :type force_python_only: bool

        :param num_threads: optional -- The number of threads with which to standardize data. Defaults to all available
            processors. Can also be set with these environment variables (listed in priority order):
            'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.
        :type num_threads: None or int

        :rtype: :class:`.SnpData` (standardizes in place, but for convenience, returns 'self')

        >>> from pysnptools.snpreader import Bed
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> bed_file = example_file("tests/datasets/all_chr.maf0.001.N300.*","*.bed")
        >>> snp_on_disk = Bed(bed_file,count_A1=False) # Specify some data on disk in Bed format
        >>> snpdata1 = snp_on_disk.read() # read all SNP values into memory
        >>> print(snpdata1) # Prints the specification for this SnpData
        SnpData(Bed(...tests/datasets/all_chr.maf0.001.N300.bed',count_A1=False))
        >>> print(snpdata1.val[0,0])
        2.0
        >>> snpdata1.standardize() # standardize changes the values in snpdata1.val and changes the specification.
        SnpData(Bed(...tests/datasets/all_chr.maf0.001.N300.bed',count_A1=False),Unit())
        >>> print('{0:.6f}'.format(snpdata1.val[0,0]))
        0.229416
        >>> snpdata2 = snp_on_disk.read().standardize() # Read and standardize in one expression with only one ndarray allocated.
        >>> print('{0:.6f}'.format(snpdata2.val[0,0]))
        0.229416
        """
        self._std_string_list.append(str(standardizer))
        _, trained = standardizer.standardize(
            self,
            return_trained=True,
            force_python_only=force_python_only,
            num_threads=num_threads)
        if return_trained:
            return self, trained
        else:
            return self
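The docstring above describes return_trained, but the doctest does not exercise it. A hedged sketch of the intended pattern, reusing the same example Bed file and assuming the trained standardizer can be applied to other rows over the same SNPs (as the deprecated train_standardizer in a later example suggests):

from pysnptools.snpreader import Bed
from pysnptools.standardizer import Unit
from pysnptools.util import example_file

bed_file = example_file("tests/datasets/all_chr.maf0.001.N300.*", "*.bed")
snp_on_disk = Bed(bed_file, count_A1=False)
train = snp_on_disk[:250, :].read()                              # first 250 individuals
test = snp_on_disk[250:, :].read()                               # remaining individuals
train, trained = train.standardize(Unit(), return_trained=True)  # learn per-SNP statistics
test = test.standardize(trained)                                 # assumed: reuse them on held-out rows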
Code Example #6
    def test_leave_one_out_with_prekernels(self):
        logging.info(
            "TestSingleSnpLeaveOutOneChrom test_leave_one_out_with_prekernels")
        from pysnptools.kernelstandardizer import DiagKtoN
        test_snps = Bed(self.bedbase, count_A1=False)
        pheno = self.phen_fn
        covar = self.cov_fn

        chrom_to_kernel = {}
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
            for chrom in np.unique(test_snps.pos[:, 0]):
                other_snps = test_snps[:, test_snps.pos[:, 0] != chrom]
                kernel = other_snps.read_kernel(
                    standardizer=Unit(), block_size=500
                )  #Create a kernel from the SNPs not used in testing
                chrom_to_kernel[chrom] = kernel.standardize(
                    DiagKtoN()
                )  #improves the kernel numerically by making its diagonal sum to iid_count

        output_file = self.file_name("one_looc_prekernel")
        frame = single_snp(test_snps,
                           pheno,
                           covar=covar,
                           K0=chrom_to_kernel,
                           output_file_name=output_file,
                           count_A1=False)

        self.compare_files(frame, "one_looc")
Code Example #7
    def _build_G0(self):
        """Low rank case: constructs :math:`G_0` from provided bed file (PLINK 1).

        :return: normalized genotypes :math:`G_0` and number of SNVs that where loaded
        :rtype: numpy.ndarray, int
        """

        temp_genotypes = self.bed[:,
                                  self.variants_to_include].read().standardize(
                                      Unit()).val

        # Replaced the code below with PySnpTools internal standardizer
        #filter_invariant = ~(temp_genotypes == temp_genotypes[0, :]).all(0)
        #filter_invariant = ~filter_invariant.all(0)
        #filter_all_nan = ~np.all(np.isnan(temp_genotypes), axis=0)
        #total_filter = filter_invariant & filter_all_nan
        #temp_genotypes = temp_genotypes[:, total_filter]
        #temp_genotypes = VariantLoader.standardize(temp_genotypes)
        #nb_SNVs_filtered = temp_genotypes.shape[1]
        # Normalize
        #return temp_genotypes / np.sqrt(nb_SNVs_filtered), nb_SNVs_filtered

        # TODO: is invariant-filtering really necessary here?
        invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)

        n_filtered = (~invariant).sum()
        temp_genotypes /= np.sqrt(n_filtered)

        return temp_genotypes[:, ~invariant], n_filtered
Code Example #8
def load_snp_data(snpreader,
                  pheno_fn,
                  cov_fn=None,
                  offset=True,
                  mpheno=0,
                  standardizer=Unit()):
    """Load plink files
    ----------

    snpreader : snpreader object
        object to read in binary SNP file

    pheno_fn : str
        File name of phenotype file

    cov_fn : str
        File name of covariates file

    offset : bool, default=True
        Adds an offset (bias) column to the covariates specified in cov_fn, if necessary


    Returns
    -------
    G : array, shape = [n_samples, n_features]
        SNP matrix

    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)

    y : array, shape = [n_samples]
        Phenotype (target) vector

    """

    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # sanity check
    #assert np.testing.assert_array_equal(ind_iid, pheno['iid'][indarr[:,0]])

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    (y, yiid), G, (X, xiid) = pstutil.intersect_apply(
        [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])],
        sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present
    if offset and sp.all(X.std(0) != 0):
        offset = sp.ones((X.shape[0], 1))  # one bias entry per sample
        X = sp.hstack((X, offset))

    return G, X, y
Code Example #9
    def test_mixingKs(self):
        logging.info("TestSingleSnp test_mixingKs")
        test_snps = Bed(self.bedbase)
        pheno = self.phen_fn
        covar = self.cov_fn

        output_file_name = self.file_name("mixingKs")
        frame = single_snp(test_snps=test_snps[:, :10],
                           pheno=pheno,
                           K0=SnpKernel(test_snps[:, 10:100], Unit()),
                           leave_out_one_chrom=False,
                           covar=covar,
                           K1=SnpKernel(test_snps[:, 100:200], Unit()),
                           mixing=None,
                           output_file_name=output_file_name)

        self.compare_files(frame, "mixing")
Code Example #10
def core_run(snpreader, pheno_fn, k, delta):
    """
    extracted core functionality: avoids shuffling the data and does not correct delta
    """

    G, X, y = load_snp_data(snpreader, pheno_fn, standardizer=Unit())
    kf = KFold(n_splits=10, shuffle=False).split(list(range(len(y))))

    ll = np.zeros(10)

    fold_idx = 0
    fold_data = {}
    for split_idx, (train_idx, test_idx) in enumerate(kf):
        fold_idx += 1

        fold_data["train_idx"] = train_idx
        fold_data["test_idx"] = test_idx

        # set up data
        ##############################
        fold_data["G_train"] = G[train_idx,:].read()
        fold_data["G_test"] = G[test_idx,:]

        fold_data["X_train"] = X[train_idx]
        fold_data["X_test"] = X[test_idx]

        fold_data["y_train"] = y[train_idx]
        fold_data["y_test"] = y[test_idx]


        # feature selection
        ##############################
        _F, _pval = lin_reg.f_regression_block(
            lin_reg.f_regression_cov_alt,
            fold_data["G_train"].val,
            fold_data["y_train"],
            blocksize=1E4,
            C=fold_data["X_train"])
        feat_idx = np.argsort(_pval)
        fold_data["feat_idx"] = feat_idx
        
        # re-order SNPs (and cut to max num)
        ##############################
        fold_data["G_train"] = fold_data["G_train"][:,feat_idx[0:k]].read()
        fold_data["G_test"] = fold_data["G_test"][:,feat_idx[0:k]].read()

        model = getLMM()
        model.setG(fold_data["G_train"].val)
        model.sety(fold_data["y_train"])
        model.setX(fold_data["X_train"])

        REML = False
        
        # predict on test set
        res = model.nLLeval(delta=delta, REML=REML)
        model.setTestData(Xstar=fold_data["X_test"], G0star=fold_data["G_test"].val)
        model.predictMean(beta=res["beta"], delta=delta)
        #mse_cv1[k_idx, delta_idx] = mean_squared_error(fold_data["y_test"],
        #out)
        ll[split_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"], sigma2=res["sigma2"], delta=delta)


    return ll
Code Example #11
def _K_per_chrom(K, chrom, iid, count_A1=None):
    if K is None:
        return KernelIdentity(iid)
    else:
        K_all = _kernel_fixup(K, iid_if_none=iid, standardizer=Unit(), count_A1=count_A1)
        if isinstance(K_all, SnpKernel):
            return SnpKernel(K_all.snpreader[:, K_all.pos[:, 0] != chrom],
                             K_all.standardizer)
        else:
            raise Exception("Don't know how to make '{0}' work per chrom".format(K_all))
Code Example #12
    def _build_K0_blocked(self):
        """Full rank case: Builds background kernel :math:`K_0` by loading blocks of SNPs from provided bed file (PLINK 1).

        :return: normalized background kernel :math:`K_0` and number of SNVs that where used to built the kernel
        :rtype: numpy.ndarray, int
        """

        # TODO: make use of PySnpTools KernelReader functionality

        K0 = np.zeros([self.nb_ind, self.nb_ind], dtype=np.float32)
        nb_SNVs_filtered = 0
        stop = self.nb_SNVs_unf

        for start in range(0, stop, self.blocksize):

            if start + self.blocksize >= stop:
                snp_slice = self.variants_to_include[start:]
            else:
                snp_slice = self.variants_to_include[start:start + self.blocksize]
            temp_genotypes = self.bed[:, snp_slice].read().standardize(Unit()).val

            # Replaced the code below with the PySnpTools internal standardizer
            # temp_genotypes = VariantLoader.mean_imputation(temp_genotypes)
            # filter_invariant = temp_genotypes == temp_genotypes[0, :]
            # filter_invariant = ~filter_invariant.all(0)
            # filter_all_nan = ~np.all(np.isnan(temp_genotypes), axis=0)
            # total_filter = filter_invariant & filter_all_nan
            # temp_genotypes = temp_genotypes[:, total_filter]
            # temp_genotypes = VariantLoader.standardize(temp_genotypes)
            # temp_n_SNVS = temp_genotypes.shape[1]
            # nb_SNVs_filtered += temp_n_SNVS

            # TODO: is invariant-filtering really necessary here?
            invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)

            K0 += np.matmul(temp_genotypes[:, ~invariant],
                            temp_genotypes[:, ~invariant].T)
            nb_SNVs_filtered += (~invariant).sum()

        return K0 / nb_SNVs_filtered, nb_SNVs_filtered
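The loop above accumulates the kernel block by block. A numpy-only sketch (made-up values) of why summing per-block products matches forming the full product at once:

import numpy as np

np.random.seed(0)
g = np.random.randn(10, 7)           # 10 individuals x 7 SNVs, toy values
full = g.dot(g.T)                    # kernel from all SNVs at once
blocked = np.zeros((10, 10))
for start in range(0, 7, 3):         # accumulate per-block contributions
    blk = g[:, start:start + 3]
    blocked += blk.dot(blk.T)
np.testing.assert_array_almost_equal(full, blocked)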
Code Example #13
 def train_standardizer(self, apply_in_place, standardizer=Unit(), force_python_only=False):
     """
     .. deprecated:: 0.2.23
        Use :meth:`standardize` with return_trained=True instead.
     """
     warnings.warn("train_standardizer is deprecated. Use standardize(...,return_trained=True,...) instead", DeprecationWarning)
     assert apply_in_place, "code assumes apply_in_place"
     self._std_string_list.append(str(standardizer))
     _, trained_standardizer = standardizer.standardize(self, return_trained=True, force_python_only=force_python_only)
     return trained_standardizer
Code Example #14
def sim_zsc(bfile,
            nsample,
            start_chrom,
            end_chrom,
            pheno,
            legend,
            standardize,
            freq,
            nblock=40):

    zsc_maf_thres = 0.01

    nindv = nsample

    nsnp_all = legend.shape[0]
    zsc = np.zeros(nsnp_all, dtype=np.float32)

    for i in range(start_chrom, end_chrom + 1):

        snpdata = Bed('{}{}.bed'.format(bfile, i), count_A1=False)
        nsnp = snpdata.sid_count
        blocks = create_block(0, nsnp - 1, nblock)

        snp_idx = np.where(legend['CHR'] == i)[0]
        zsc_chrom = np.zeros(snp_idx.shape[0])

        freq_chrom = freq[snp_idx]
        mask_chrom = np.zeros(nsnp, dtype=bool)
        mask_chrom[freq_chrom > zsc_maf_thres] = True

        for blk in blocks:

            mask_chrom_blk = mask_chrom[blk]
            use_idx = blk[mask_chrom_blk == True]

            snpdata_blk = snpdata[0:nindv, use_idx]
            if standardize == False:
                snpdata_blk = snpdata_blk.read(dtype=np.float32).val
            else:
                snpdata_blk = snpdata_blk.read(dtype=np.float32)\
                    .standardize(Unit()).val
            if standardize == False:
                snpdata_blk -= snpdata_blk.mean(axis=0)
            if standardize == True:
                zsc_chrom[use_idx] = np.dot(snpdata_blk.T,
                                            pheno) / np.sqrt(nindv)
            else:
                sigmasq = snpdata_blk.var(axis=0)
                zsc_chrom[use_idx] = np.dot(snpdata_blk.T, pheno)
                zsc_chrom[use_idx] /= np.sqrt(nindv * sigmasq)

        zsc[snp_idx] = zsc_chrom

    return zsc[freq > zsc_maf_thres]
Code Example #15
File: test.py Project: hyacz/PySnpTools
    def factory_iterator():

        snp_reader_factory_bed = lambda: Bed("examples/toydata",
                                             count_A1=False)
        snp_reader_factory_snpmajor_hdf5 = lambda: SnpHdf5(
            "examples/toydata.snpmajor.snp.hdf5")
        snp_reader_factory_iidmajor_hdf5 = lambda: SnpHdf5(
            "examples/toydata.iidmajor.snp.hdf5")
        snp_reader_factory_dat = lambda: Dat("examples/toydata.dat")

        previous_wd = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))

        snpreader0 = snp_reader_factory_bed()
        S_original = snpreader0.sid_count
        N_original = snpreader0.iid_count

        snps_to_read_count = min(S_original, 100)

        for iid_index_list in [
                list(range(N_original)),
                list(range(N_original // 2)),
                list(range(N_original - 1, 0, -2))
        ]:
            for snp_index_list in [
                    list(range(snps_to_read_count)),
                    list(range(snps_to_read_count // 2)),
                    list(range(snps_to_read_count - 1, 0, -2))
            ]:
                for standardizer in [Unit(), Beta(1, 25)]:
                    reference_snps, reference_dtype = NaNCNCTestCases(
                        iid_index_list, snp_index_list, standardizer,
                        snp_reader_factory_bed(), sp.float64, "C", "False",
                        None, None).read_and_standardize()
                    for snpreader_factory in [
                            snp_reader_factory_bed,
                            snp_reader_factory_snpmajor_hdf5,
                            snp_reader_factory_iidmajor_hdf5,
                            snp_reader_factory_dat
                    ]:
                        for dtype in [sp.float64, sp.float32]:
                            for order in ["C", "F"]:
                                for force_python_only in [False, True]:
                                    snpreader = snpreader_factory()
                                    test_case = NaNCNCTestCases(
                                        iid_index_list, snp_index_list,
                                        standardizer, snpreader, dtype, order,
                                        force_python_only, reference_snps,
                                        reference_dtype)
                                    yield test_case
        os.chdir(previous_wd)
Code Example #16
File: snpdata.py Project: hyacz/PySnpTools
    def standardize(self,
                    standardizer=Unit(),
                    block_size=None,
                    return_trained=False,
                    force_python_only=False):
        """Does in-place standardization of the in-memory
        SNP data. By default, it applies 'Unit' standardization, that is: the values for each SNP will have mean zero and standard deviation 1.0.
        NaN values are then filled with zero, the mean (consequently, if there are NaN values, the final standard deviation will not be exactly 1.0).
        Note that, for efficiency, this method works in-place, actually changing values in the ndarray. Although it works in place, for convenience
        it also returns the SnpData.

        :param standardizer: optional -- Specify standardization to be applied. 
             Any :class:`.Standardizer` may be used. Some choices include :class:`.Unit` (default, makes values for each SNP have mean zero and
             standard deviation 1.0) and :class:`.Beta`.
        :type standardizer: :class:`.Standardizer`

        :param block_size: optional -- Deprecated.
        :type block_size: None

        :param return_trained: If true, returns a second value containing a constant :class:`.Standardizer` trained on this data.
        :type return_trained: boolean

        :param force_python_only: optional -- If true, will use pure Python instead of faster C++ libraries.
        :type force_python_only: bool

        :rtype: :class:`.SnpData` (standardizes in place, but for convenience, returns 'self')

        >>> from pysnptools.snpreader import Bed
        >>> snp_on_disk = Bed('../../tests/datasets/all_chr.maf0.001.N300',count_A1=False) # Specify some data on disk in Bed format
        >>> snpdata1 = snp_on_disk.read() # read all SNP values into memory
        >>> print snpdata1 # Prints the specification for this SnpData
        SnpData(Bed('../../tests/datasets/all_chr.maf0.001.N300',count_A1=False))
        >>> print snpdata1.val[0,0]
        2.0
        >>> snpdata1.standardize() # standardize changes the values in snpdata1.val and changes the specification.
        SnpData(Bed('../../tests/datasets/all_chr.maf0.001.N300',count_A1=False),Unit())
        >>> print snpdata1.val[0,0]
        0.229415733871
        >>> snpdata2 = snp_on_disk.read().standardize() # Read and standardize in one expression with only one ndarray allocated.
        >>> print snpdata2.val[0,0]
        0.229415733871
        """
        self._std_string_list.append(str(standardizer))
        _, trained = standardizer.standardize(
            self, return_trained=True, force_python_only=force_python_only)
        if return_trained:
            return self, trained
        else:
            return self
Code Example #17
def sim_pheno(bfile,
              start_chrom,
              end_chrom,
              cau_idx,
              beta,
              legend,
              standardize,
              nblock=40):

    mask = np.zeros(beta.shape[0], dtype=bool)
    mask[cau_idx] = True

    fam = '{}{}.fam'.format(bfile, start_chrom)
    nindv = pd.read_table(fam, header=None).shape[0]
    pheno = np.zeros(nindv, dtype=np.float32)

    for i in range(start_chrom, end_chrom + 1):

        snpdata = Bed('{}{}.bed'.format(bfile, i), count_A1=False)
        nindv = snpdata.iid_count
        nsnp = snpdata.sid_count
        blocks = create_block(0, nsnp - 1, nblock)

        snp_idx = np.where(legend['CHR'] == i)[0]
        beta_chrom = beta[snp_idx]
        mask_chrom = mask[snp_idx]

        for blk in blocks:
            mask_chrom_blk = mask_chrom[blk]
            use_idx = blk[mask_chrom_blk == True]

            snpdata_blk = snpdata[:, use_idx]
            if standardize == False:
                snpdata_blk = snpdata_blk.read(dtype=np.float32).val
            else:
                snpdata_blk = snpdata_blk.read(dtype=np.float32)\
                    .standardize(Unit()).val
            if standardize == False:
                snpdata_blk -= snpdata_blk.mean(axis=0)

            pheno += np.dot(snpdata_blk, beta_chrom[use_idx])

    sigma_e = np.sqrt(1.0 - np.var(pheno))

    eps = np.random.normal(scale=sigma_e, size=nindv).astype(np.float32)
    pheno += eps

    return pheno
Code Example #18
File: pairs.py Project: fastlmm/PySnpTools
def epi_reml(pair_snps,
             pheno,
             covar=None,
             kernel_snps=None,
             output_dir='results',
             part_count=33,
             runner=None,
             override=False):
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.standardizer import Unit
    import datetime
    from fastlmm.association import single_snp

    part_list = list(split_on_sids(pair_snps, part_count))
    part_pair_count = (part_count * part_count + part_count) // 2
    part_pair_index = -1
    print("part_pair_count={0:,}".format(part_pair_count))
    K0 = SnpKernel(kernel_snps or pair_snps,
                   standardizer=Unit()).read()  #Precompute the similarity
    if not os.path.exists(output_dir): os.makedirs(output_dir)
    start_time = datetime.datetime.now()
    for i in range(part_count):
        part_i = part_list[i]
        for j in range(i, part_count):
            part_pair_index += 1
            pairs = _Pairs2(part_i) if i == j else _Pairs2(
                part_i, part_list[j])
            print("Looking at pair {0},{1} which is {2} of {3}".format(
                i, j, part_pair_index, part_pair_count))
            output_file = '{0}/result.{1}.{2}.tsv'.format(
                output_dir, part_pair_index, part_pair_count)
            if override or not os.path.exists(output_file):
                result_df_ij = single_snp(pairs,
                                          K0=K0,
                                          pheno=pheno,
                                          covar=covar,
                                          leave_out_one_chrom=False,
                                          count_A1=True,
                                          runner=runner)
                result_df_ij.to_csv(output_file, sep="\t", index=False)
                print(result_df_ij[:1])
                time_so_far = datetime.datetime.now() - start_time
                total_time_estimate = time_so_far * part_pair_count / (
                    part_pair_index + 1)
                print(total_time_estimate)
Code Example #19
def factory(s):
    s = s.capitalize()
    if s == "Unit" or s == "Unit()":
        return Unit()

    if s == "Identity" or s == "Identity()":
        return Identity()

    if s == "BySqrtSidCount" or s == "BySqrtSidCount()":
        return BySqrtSidCount()

    if s == "BySidCount" or s == "BySidCount()":
        return BySidCount()

    if s == "Beta":
        return Beta()

    if s.startswith("Beta("):
        standardizer = eval(s)
        return standardizer
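A hedged usage sketch for the factory above; only the spellings shown are assumed to round-trip (the Beta(...) branch relies on eval, so Beta must be importable in the module where factory is defined):

from pysnptools.standardizer import Unit, Identity, Beta

assert isinstance(factory("unit"), Unit)
assert isinstance(factory("Identity()"), Identity)
assert isinstance(factory("beta(1,25)"), Beta)  # capitalize() turns this into "Beta(1,25)" before eval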
Code Example #20
File: test.py Project: hyacz/PySnpTools
    def load_and_standardize(self, snpreader2, snpreader3):
        """
        test c-version of load and standardize
        """

        S = snpreader2.sid_count
        N_original = snpreader2.iid_count

        iid_index_list = list(range(N_original - 1, 0, -2))
        snpreader3 = snpreader3[iid_index_list, :]

        for dtype in [sp.float64, sp.float32]:

            G2 = snpreader2.read(order='F', force_python_only=True).val
            G2 = Unit().standardize(G2,
                                    block_size=10000,
                                    force_python_only=True)

            SNPs_floatF = snpreader2.read(order="F",
                                          dtype=dtype,
                                          force_python_only=False).val
            GF = Unit().standardize(SNPs_floatF)

            SNPs_floatC = snpreader2.read(order="C",
                                          dtype=dtype,
                                          force_python_only=False).val
            GC = Unit().standardize(SNPs_floatC)

            self.assertTrue(np.allclose(GF, G2, rtol=1e-05, atol=1e-05))
            self.assertTrue(np.allclose(GF, GC, rtol=1e-05, atol=1e-05))

            #testing selecting a subset of snps and iids
            snp_index_list = list(range(S - 1, 0, -2))

            G2x = snpreader2.read(order='F', force_python_only=True).val
            G2x = G2x[iid_index_list, :][:, snp_index_list]
            G2x = Unit().standardize(G2x,
                                     block_size=10000,
                                     force_python_only=True)

            SNPs_floatFx = snpreader3[:, snp_index_list].read(
                order="F", dtype=dtype, force_python_only=False).val
            GFx = Unit().standardize(SNPs_floatFx)
            self.assertTrue(np.allclose(GFx, G2x, rtol=1e-05, atol=1e-05))

            SNPs_floatCx = snpreader3[:, snp_index_list].read(
                order="C", dtype=dtype, force_python_only=False).val
            GCx = Unit().standardize(SNPs_floatCx)
            self.assertTrue(np.allclose(GCx, G2x, rtol=1e-05, atol=1e-05))
Code Example #21
    def __init__(self,
                 snpreader,
                 pheno_fn,
                 num_folds,
                 test_size=0.1,
                 cov_fn=None,
                 num_snps_in_memory=100000,
                 random_state=None,
                 log=None,
                 offset=True,
                 num_pcs=0,
                 interpolate_delta=False,
                 mpheno=0,
                 standardizer=Unit()):
        """Set up Feature selection strategy
        ----------

        snpreader : str or snpreader
            File name of binary SNP file or a snpreader.

        pheno_fn : str
            File name of phenotype file

        num_folds : int
            Number of folds in k-fold cross-validation

        test_size : float, default=0.1
            Fraction of samples to use as test set (train_size = 1-test_size)

        cov_fn : str, optional, default=None
            File name of covariates file

        num_snps_in_memory: int, optional, default=100000
            Number of SNPs to keep in memory at a time. Setting this higher than the largest k
            will dramatically increase speed at the cost of higher memory use.

        random_state : int, default=None
            Seed to use for random number generation (e.g. random splits)

        log : Level of log messages, defaults=None (don't change)
            e.g. logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO

        offset : bool, default=True
            Adds offset to the covariates specified in cov_fn, if necessary

        num_pcs : int, default=0
            Number of principal components to be included as fixed effects.
            If num_pcs>0, a PCA will be computed as preprocessing.

        interpolate_delta : bool, default=False
            Interpolate delta around optimum with parabola (for best k).

        mpheno : int, default=0
            Column id of phenotype

        standardizer: a standardizer-like object such as Unit() or Beta(1,25), default=Unit()

        """
        self._ran_once = False

        # data file names
        self.snpreader = snpreader
        if isinstance(self.snpreader, str):
            self.snpreader = Bed(self.snpreader)
        #!!test speed of new vs old
        #!!make all readers take optional file extension

        self.pheno_fn = pheno_fn
        self.cov_fn = cov_fn

        # data fields
        self.G = None
        self.y = None
        self.X = None

        # flags
        self.num_folds = num_folds
        self.test_size = test_size
        self.random_state = random_state
        self.offset = offset
        self.num_pcs = num_pcs
        self.pcs = None
        self.interpolate_delta = interpolate_delta
        self.mpheno = mpheno
        self.standardizer = standardizer

        # efficiency
        self.num_snps_in_memory = num_snps_in_memory
        self.blocksize = 1000
        self.biggest_k = None

        if log is not None:
            logger.setLevel(log)
Code Example #22
    def generate_and_analyze(seed,
                             N,
                             do_shuffle,
                             just_testing=True,
                             map_function=None,
                             cache_folder=None):

        #Generate SNPs
        snpdata = snp_gen(fst=.1,
                          dfr=0,
                          iid_count=N,
                          sid_count=1000,
                          chr_count=10,
                          label_with_pop=True,
                          seed=seed)
        K_causal = snpdata.read_kernel(Unit()).standardize()

        #Generate geo-spatial locations and K_loc
        distance_between_centers = 2500000
        x0 = distance_between_centers * 0.5
        x1 = distance_between_centers * 1.5
        y0 = distance_between_centers
        y1 = distance_between_centers
        sd = distance_between_centers / 4.

        spatial_iid = snpdata.iid
        center_dict = {"0": (x0, y0), "1": (x1, y1)}
        centers = np.array(
            [center_dict[iid_item[0]] for iid_item in spatial_iid])
        np.random.seed(seed)
        logging.info("Generating positions for seed {0}".format(seed))
        spatial_coor = SnpData(
            iid=snpdata.iid,
            sid=["x", "y"],
            val=centers + np.random.multivariate_normal(
                [0, 0], [[1, 0], [0, 1]], size=len(centers)) * sd,
            parent_string="'spatial_coor_gen_original'")
        alpha = distance_between_centers
        spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
        K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

        #Generate phenotype
        iid = K_causal.iid
        iid_count = K_causal.iid_count
        np.random.seed(seed)
        pheno_causal = SnpData(iid=iid,
                               sid=["causal"],
                               val=np.random.multivariate_normal(
                                   np.zeros(iid_count),
                                   K_causal.val).reshape(-1, 1),
                               parent_string="causal")
        np.random.seed(seed ^ 998372)
        pheno_noise = SnpData(iid=iid,
                              sid=["noise"],
                              val=np.random.normal(size=iid_count).reshape(
                                  -1, 1),
                              parent_string="noise")
        np.random.seed(seed ^ 12230302)
        pheno_loc_original = SnpData(iid=iid,
                                     sid=["loc_original"],
                                     val=np.random.multivariate_normal(
                                         np.zeros(iid_count),
                                         K_loc.val).reshape(-1, 1),
                                     parent_string="loc_original")

        if do_shuffle:
            idx = np.arange(iid_count)
            np.random.seed(seed)
            np.random.shuffle(idx)
            pheno_loc = pheno_loc_original.read(
                view_ok=True
            )  #don't need to copy, because the next line will be fresh memory
            pheno_loc.val = pheno_loc.val[idx, :]
        else:
            pheno_loc = pheno_loc_original

        pheno = SnpData(iid=iid,
                        sid=["pheno_all"],
                        val=pheno_causal.val + pheno_noise.val + pheno_loc.val)

        #Analyze data
        alpha_list = [
            int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)
        ]
        dataframe = heritability_spatial_correction(
            snpdata,
            spatial_coor.val,
            spatial_iid,
            alpha_list=[alpha] if just_testing else alpha_list,
            pheno=pheno,
            alpha_power=2,
            jackknife_count=0,
            permute_plus_count=0,
            permute_times_count=0,
            just_testing=just_testing,
            map_function=map_function,
            cache_folder=cache_folder)

        logging.info(dataframe)
        return dataframe
Code Example #23
def heritability_spatial_correction(G_kernel,
                                    spatial_coor,
                                    spatial_iid,
                                    alpha_list,
                                    alpha_power,
                                    pheno,
                                    map_function=map,
                                    cache_folder=None,
                                    jackknife_count=500,
                                    permute_plus_count=10000,
                                    permute_times_count=10000,
                                    seed=0,
                                    just_testing=False,
                                    always_remote=False,
                                    allow_gxe2=True,
                                    count_A1=None):
    """
    Function measuring heritability with correction for spatial location.

    :param G_kernel: A kernel that tells the genetic similarity between all pairs of individuals. The kernel can be given 
      explicitly, for example with a :class:`.KernelData`. The kernel can also be given implicitly by providing a set of
      SNPs or the name of a BED file.
    :type G_kernel: a `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__, `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param spatial_coor: The position of each individual given by two coordinates. Any units are allowed, but the two values
       must be compatible so that distance can be determined via Pythagoras' theorem. (So, longitude and latitude should
       not be used unless the locations are near the Equator.) 
    :type spatial_coor: an iid_count x 2 array

    :param spatial_iid: A ndarray of the iids. Each iid is a ndarray of two strings (a family ID and a case ID) that identifies an individual.
    :type spatial_iid: array of strings with shape [iid_count,2]

    :param alpha_list: a list of numbers to search to find the best alpha, which is the similarity scale. The similarity of two individuals
      is here defined as exp(-(distance_between/alpha)**alpha_power). If the closest individuals are 100 units apart and the farthest
      individuals are 4e6 units apart, a reasonable alpha_list might be: [int(v) for v in np.logspace(np.log10(100),np.log10(1e10), 100)]
      The function reports on the alphas chosen. If an extreme alpha is picked, change alpha_list to cover a wider range.
    :type alpha_list: list of numbers

    :param alpha_power: 2 (a good choice) means that similarity goes with area. 1 means with distance.
    :type alpha_power: number

    :param pheno: The target values(s) to predict. It can be a file name readable via `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__.
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or string

    :param cache_folder: (default 'None') The name of a directory in which to save intermediate results. If 'None', then no intermediate results are saved.
    :type cache_folder: a string

    :param map_function: (default 'map') A function with the same inputs and functionality as Python's 'map' function.
       Can be used to run 'heritability_spatial_correction' on a cluster.
    :type map_function: a function

    :param jackknife_count: (default 500) The number of jackknife groups to use when calculating standard errors (SE). Changing to a small number, 2, 
       speeds up calculation at the cost of unusable SEs.
    :type jackknife_count: number

    :param permute_plus_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, 
       speeds up calculation at the cost of unusable P values.
    :type permute_plus_count: number

    :param permute_times_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, 
       speeds up calculation at the cost of unusable P values.
    :type permute_times_count: number

    :param seed: (default 0) The random seed used by jackknifing and permutation.
    :type seed: number

    :param just_testing: (default False) If true, skips actual LMM-related search and calculation.
    :type just_testing: bool

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per phenotype. Columns include "h2uncorr", "h2corr", etc.

    """

    ######################
    # Prepare the inputs
    ######################

    from fastlmm.inference.fastlmm_predictor import _kernel_fixup, _pheno_fixup
    G_kernel = _kernel_fixup(
        G_kernel, iid_if_none=None, standardizer=Unit(), count_A1=count_A1
    )  # Create a kernel from an in-memory kernel, some snps, or a text file.
    pheno = _pheno_fixup(
        pheno, iid_if_none=G_kernel.iid, missing='NA', count_A1=count_A1
    )  # Create phenotype data from in-memory data or a text file.

    if cache_folder is not None:
        pstutil.create_directory_if_necessary(cache_folder, isfile=False)

    jackknife_seed = seed or 1954692566
    permute_plus_seed = seed or 2372373100
    permute_times_seed = seed or 2574440128

    ######################
    # Find 'alpha', the scale for distance
    ######################

    # create the alpha table (unless it is already there)
    alpha_table_fn = "{0}/alpha_table.{1}.txt".format(
        cache_folder,
        pheno.sid_count)  # create a name for the alpha_table cache file
    phen_target_array = np.array(pheno.sid, dtype='str')
    if cache_folder is not None and os.path.exists(alpha_table_fn):
        alpha_table = pd.read_csv(alpha_table_fn,
                                  delimiter='\t',
                                  index_col=False,
                                  comment=None)
    else:
        # create the list of arguments to run
        arg_list = []
        for phen_target in phen_target_array:
            pheno_one = pheno[:, pheno.col_to_index(
                [phen_target])]  # Look at only this pheno_target
            for alpha in alpha_list:
                #pheno, G_kernel, spatial_coor, spatial_iid, alpha,     alpha_power,  (jackknife_index, jackknife_count, jackknife_seed),
                arg_tuple = (
                    pheno_one,
                    G_kernel,
                    spatial_coor,
                    spatial_iid,
                    alpha,
                    alpha_power,
                    (-1, 0, None),
                    # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2,               a2
                    (-1, 0, None),
                    (-1, 0, None),
                    just_testing,
                    False,
                    True and allow_gxe2,
                    None)
                arg_list.append(arg_tuple)

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(
            work_item,
            arg_list) if len(arg_list) > 1 or always_remote else list(
                map(work_item, arg_list))
        return_list = [line for line in return_list
                       if line is not None]  #Remove 'None' results
        alpha_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(alpha_table, False, alpha_table_fn)

    # read the alpha table and find the best values
    grouped = alpha_table.groupby("phen")
    alpha_dict = {}
    for phen, phen_table in grouped:
        best_index_corr = phen_table['nLLcorr'].idxmin(
        )  # with Pandas, this returns the index in the parent table, not the group table
        best_index_gxe2 = phen_table['nLL_gxe2'].idxmin() if allow_gxe2 else 0
        alpha_corr = alpha_table.iloc[best_index_corr]['alpha']
        alpha_gxe2 = alpha_table.iloc[best_index_gxe2]['alpha']
        alpha_dict[phen] = alpha_corr, alpha_gxe2
    logging.info(alpha_dict)

    ######################
    # Use jackknifing to compute h2uncorr, SE, h2corr, SE, e2, SE, gxe2, SE
    ######################

    jackknife_count_actual = min(jackknife_count, G_kernel.iid_count)

    # Set up the run and do it (unless it has already been run)
    jackknife_table_fn = "{0}/jackknife.{1}.count{2}.txt".format(
        cache_folder, pheno.sid_count, jackknife_count_actual)
    if cache_folder is not None and os.path.exists(jackknife_table_fn):
        jackknife_table = pd.read_csv(jackknife_table_fn,
                                      delimiter='\t',
                                      index_col=False,
                                      comment=None)
    else:
        arg_list = []
        for phen_target in phen_target_array:
            pheno_one = pheno[:, pheno.col_to_index(
                [phen_target])]  # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
            alpha_set = set([
                alpha_corr, alpha_gxe2
            ])  #If these are the same, then only need to do half the work
            for alpha in alpha_set:
                logging.debug(alpha)
                do_uncorr = (alpha == alpha_corr)
                do_gxe2 = (alpha == alpha_gxe2) and allow_gxe2
                for jackknife in range(-1, jackknife_count_actual):
                    # pheno, G_kernel, spatial_coor, spatial_iid, alpha,     alpha_power, (jackknife_index, jackknife_count,         jackknife_seed),
                    arg_tuple = (
                        pheno_one,
                        G_kernel,
                        spatial_coor,
                        spatial_iid,
                        alpha,
                        alpha_power,
                        (jackknife, jackknife_count_actual, jackknife_seed),
                        # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                        (-1, 0, None),
                        (-1, 0, None),
                        just_testing,
                        do_uncorr,
                        do_gxe2,
                        None)
                    arg_list.append(arg_tuple)

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(
            work_item,
            arg_list) if len(arg_list) > 1 or always_remote else list(
                map(work_item, arg_list))
        return_list = [line for line in return_list
                       if line is not None]  #Remove 'None' results
        jackknife_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(jackknife_table, False, jackknife_table_fn)

    # get the real (that is, unjackknifed) values
    jackknife_table[
        "diff"] = jackknife_table.h2uncorr - jackknife_table.h2corr  # Compute the diff = h2uncorr-h2corr column
    results_both = jackknife_table[
        jackknife_table.jackknife_index ==
        -1]  # Create a table of the real (non-jackknifed) results for both alphas (which may be the same)
    del results_both["jackknife_index"]
    results_corr = results_both[results_both.alpha == [
        alpha_dict[phen][0] for phen in results_both.phen
    ]]  #Create version for g+e's alpha
    results_gxe2 = results_both[results_both.alpha == [
        alpha_dict[phen][1] for phen in results_both.phen
    ]]  #Create version for gxe's alpha
    #remove unwanted columns
    for delcol in [
            "a2_gxe2", "gxe2", "nLL_gxe2", "permute_plus_count",
            "permute_plus_index", "permute_plus_seed", "permute_times_count",
            "permute_times_index", "permute_times_seed", "jackknife_count",
            "jackknife_seed"
    ]:
        del results_corr[delcol]
    for delcol in [
            "a2", "e2", "h2corr", "h2uncorr", "nLLcorr", "nLLuncorr", "diff",
            "permute_plus_count", "permute_plus_index", "permute_plus_seed",
            "permute_times_count", "permute_times_index", "permute_times_seed",
            "jackknife_count", "jackknife_seed"
    ]:
        del results_gxe2[delcol]

    if jackknife_count_actual > 0:
        #Use a pivottable to compute the jackknifed SE's
        corr_rows = np.logical_and(
            jackknife_table.jackknife_index != -1, jackknife_table.alpha == [
                alpha_dict[phen][0] for phen in jackknife_table.phen
            ])
        jk_table_corr = pd.pivot_table(
            jackknife_table[corr_rows],
            values=['h2uncorr', 'h2corr', 'diff', 'e2'],
            index=['phen'],
            columns=[],
            aggfunc=np.std)
        jk_table_corr["h2uncorr SE"] = jk_table_corr["h2uncorr"] * np.sqrt(
            jackknife_count_actual - 1)
        jk_table_corr["h2corr SE"] = jk_table_corr["h2corr"] * np.sqrt(
            jackknife_count_actual - 1)
        jk_table_corr["diff SE"] = jk_table_corr["diff"] * np.sqrt(
            jackknife_count_actual - 1)
        jk_table_corr["e2 SE"] = jk_table_corr["e2"] * np.sqrt(
            jackknife_count_actual - 1)
        del jk_table_corr["h2uncorr"]
        del jk_table_corr["h2corr"]
        del jk_table_corr["diff"]
        del jk_table_corr["e2"]
        gxe2_rows = np.logical_and(
            jackknife_table.jackknife_index != -1, jackknife_table.alpha == [
                alpha_dict[phen][1] for phen in jackknife_table.phen
            ])
        jk_table_gxe2 = pd.pivot_table(jackknife_table[gxe2_rows],
                                       values=['gxe2'],
                                       index=['phen'],
                                       columns=[],
                                       aggfunc=np.std)
        jk_table_gxe2["gxe2 SE"] = jk_table_gxe2["gxe2"] * np.sqrt(
            jackknife_count_actual - 1)
        del jk_table_gxe2["gxe2"]

        #Join the SE's to the main results table
        results_corr = results_corr.join(jk_table_corr, on='phen')
        results_gxe2 = results_gxe2.join(jk_table_gxe2, on='phen')
    else:
        for col in ['h2uncorr SE', 'h2corr SE', 'diff SE', 'e2 SE']:
            results_corr[col] = np.NaN
        results_gxe2['gxe2 SE'] = np.NaN

    #compute pValue columns
    results_corr["P (diff=0)"] = stats.t.sf(
        results_corr["diff"] / results_corr["diff SE"],
        df=jackknife_count_actual - 1) * 2  #two sided
    results_corr["from SE, one-sided, P (e2=0)"] = stats.t.sf(
        results_corr["e2"] / results_corr["e2 SE"],
        df=jackknife_count_actual - 1)
    results_gxe2["from SE, one-sided, P (gxe2=0)"] = stats.t.sf(
        results_gxe2["gxe2"] / results_gxe2["gxe2 SE"],
        df=jackknife_count_actual - 1)  #one sided
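
    # Note on the computations above: with n = jackknife_count_actual leave-one-out
    # estimates, the jackknifed standard error is the std of those estimates scaled
    # by sqrt(n - 1), and the p-values use a Student t distribution with n - 1
    # degrees of freedom (two-sided for "diff", one-sided for e2 and gxe2).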

    if cache_folder is not None:
        _write_csv(
            results_corr, False,
            "{0}/jackknife_corr_summary.{1}.jackknife{2}.txt".format(
                cache_folder, pheno.sid_count, jackknife_count_actual))
        _write_csv(
            results_gxe2, False,
            "{0}/jackknife_gxe2_summary.{1}.jackknife{2}.txt".format(
                cache_folder, pheno.sid_count, jackknife_count_actual))

    ######################
    # compute p(e2=0) via permutation
    ######################

    permplus_table_fn = "{0}/permutation.GPlusE.{1}.count{2}.txt".format(
        cache_folder, pheno.sid_count, permute_plus_count)
    if cache_folder is not None and os.path.exists(permplus_table_fn):
        permplus_table = pd.read_csv(permplus_table_fn,
                                     delimiter='\t',
                                     index_col=False,
                                     comment=None)
    else:
        arg_list = []
        for phen_target in phen_target_array:
            pheno_one = pheno[:, pheno.col_to_index(
                [phen_target])]  # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
            for jackknife_index in range(-1, permute_plus_count):
                # pheno, G_kernel, spatial_coor, spatial_iid, alpha,          alpha_power,    (jackknife_index, jackknife_count, jackknife_seed),
                arg_tuple = (
                    pheno_one,
                    G_kernel,
                    spatial_coor,
                    spatial_iid,
                    alpha_corr,
                    alpha_power,
                    (-1, 0, None),
                    # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                    (jackknife_index, permute_plus_count, permute_plus_seed),
                    (-1, 0, None),
                    just_testing,
                    False,
                    False,
                    None)
                arg_list.append(arg_tuple)

        # Run "run_line" on each set of arguments and save to file
        return_list = map_function(
            work_item,
            arg_list) if len(arg_list) > 1 or always_remote else list(
                map(work_item, arg_list))
        return_list = [line for line in return_list
                       if line is not None]  #Remove 'None' results
        permplus_table = pd.DataFrame(return_list)
        if cache_folder is not None:
            _write_csv(permplus_table, False, permplus_table_fn)

    #Create a table of the real nLL for each pheno
    real_result_permplus = permplus_table[permplus_table.permute_plus_index ==
                                          -1][['phen', 'nLLcorr']]
    real_result_permplus.rename(columns={'nLLcorr': 'nLLcorr_real'},
                                inplace=True)
    real_result_permplus.set_index(['phen'], inplace=True)

    # Create a table of the permutation runs and add the real nLL to each row
    perm_table = permplus_table[permplus_table.permute_plus_index != -1]
    result = perm_table.join(real_result_permplus, on='phen')
    result['P(e2)'] = [
        1.0 if b else 0.0 for b in result.nLLcorr <= result.nLLcorr_real
    ]  # create a column flagging where the permutation is better than (or as good as) the real fit
    # Use a pivot table to find the fraction of times the permutation is better
    pivot_table_plus = pd.pivot_table(result,
                                      values=['P(e2)'],
                                      index=['phen'],
                                      columns=[],
                                      aggfunc=np.mean)
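    # P(e2) above is an empirical permutation p-value: for each phenotype it is the
    # fraction of permutations whose corrected nLL is at least as good as (i.e. no
    # larger than) the nLL of the unpermuted data.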
    if cache_folder is not None:
        summary_permplus_table_fn = "{0}/summary.permutation.GPlusE.{1}.count{2}.txt".format(
            cache_folder, pheno.sid_count, permute_plus_count)
        _write_csv(pivot_table_plus, True, summary_permplus_table_fn)

    ################################################
    # compute p(gxe2=0) via permutation
    ################################################

    #Only process phenos for which gxe2 is not 0
    nonzero = set(results_gxe2[results_gxe2.gxe2 != 0].phen)
    permtimes_phenotypes = set(phen_target_array) & nonzero  #intersection
    permtimes_table_list = []
    for phen_target in permtimes_phenotypes:
        permtimes_table_fn = "{0}/permutation.GxE/{1}.count{2}.txt".format(
            cache_folder, phen_target, permute_times_count)

        if cache_folder is not None and os.path.exists(permtimes_table_fn):
            permtime_results = pd.read_csv(permtimes_table_fn,
                                           delimiter='\t',
                                           index_col=False,
                                           comment=None)
        else:
            arg_list = []
            pheno_one = pheno[:, pheno.col_to_index(
                [phen_target])]  # Look at only this pheno_target
            alpha_corr, alpha_gxe2 = alpha_dict[phen_target]
            a2 = float(permplus_table[(permplus_table.phen == phen_target) & (
                permplus_table.permute_plus_index == -1)]['a2'])
            for permute_index in range(-1, permute_times_count):
                # pheno, G_kernel, spatial_coor, spatial_iid, alpha,          alpha_power, (permute_index, permute_count, permute_seed),
                arg_tuple = (
                    pheno_one,
                    G_kernel,
                    spatial_coor,
                    spatial_iid,
                    alpha_gxe2,
                    alpha_power,
                    (-1, 0, None),
                    # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2
                    (-1, 0, None),
                    (permute_index, permute_times_count, permute_times_seed),
                    just_testing,
                    False,
                    allow_gxe2,
                    a2)
                arg_list.append(arg_tuple)

            # Run "run_line" on each set of arguments and save to file
            return_list = map_function(
                work_item,
                arg_list) if len(arg_list) > 1 or always_remote else list(
                    map(work_item, arg_list))
            return_list = [line for line in return_list
                           if line is not None]  #Remove 'None' results
            permtime_results = pd.DataFrame(return_list)
            if cache_folder is not None:
                pstutil.create_directory_if_necessary(permtimes_table_fn)
                _write_csv(permtime_results, False, permtimes_table_fn)
        permtimes_table_list.append(permtime_results)

    if permtimes_table_list:  #not empty
        permtimes_table = pd.concat(permtimes_table_list)
        logging.info(permtimes_table.head())

        #Create a table of the real nLL for each pheno
        real_result_permtimes = permtimes_table[
            permtimes_table.permute_times_index == -1][['phen', 'nLL_gxe2']]
        real_result_permtimes.rename(columns={'nLL_gxe2': 'nLL_gxe2_real'},
                                     inplace=True)
        real_result_permtimes.set_index(['phen'], inplace=True)

        # Create a table of the permutation runs and add the real nLL to each row
        summary_permtimes_table_fn = "{0}/summary.permutation.GxE.{1}.count{2}.txt".format(
            cache_folder, len(permtimes_phenotypes), permute_times_count)

        perm_table = permtimes_table[permtimes_table.permute_times_index != -1]
        resultx = perm_table.join(real_result_permtimes, on='phen')
        resultx['P(gxe2)'] = [
            1.0 if b else 0.0
            for b in resultx.nLL_gxe2 <= resultx.nLL_gxe2_real
        ]  # create a column flagging where the permutation is better than (or as good as) the real fit
        # Use a pivot table to find the fraction of times the permutation is better
        pivot_table_times = pd.pivot_table(resultx,
                                           values=['P(gxe2)'],
                                           index=['phen'],
                                           columns=[],
                                           aggfunc=np.mean)
        if cache_folder is not None:
            _write_csv(pivot_table_times, True, summary_permtimes_table_fn)

    #######################
    # Create final table of results by combining the summary tables
    #######################

    #Rename some columns
    results_corr.rename(columns={
        "h2uncorr SE": "SE (h2uncorr)",
        "h2corr SE": "SE (h2corr)",
        "e2 SE": "SE (e2)"
    },
                        inplace=True)

    #Rename some columns and join results
    results_gxe2.rename(columns={
        "alpha": "alpha_gxe2",
        "gxe2 SE": "SE (gxe2)",
        "h2corr_raw": "h2corr_raw_gxe2"
    },
                        inplace=True)
    del results_gxe2['alpha_power']
    results_gxe2.set_index(["phen"], inplace=True)
    final0 = results_corr.join(results_gxe2, on='phen')

    #Rename some columns and join results
    pivot_table_plus.rename(columns={"P(e2)": "P(e2=0)"}, inplace=True)
    if len(pivot_table_plus) > 0:
        final1 = final0.join(pivot_table_plus, on='phen')
    else:
        final1 = final0.copy()
        final1['P(e2=0)'] = np.nan

    #Rename some columns and join results
    if permtimes_table_list and len(pivot_table_times) > 0:  #not empty
        pivot_table_times.rename(columns={"P(gxe2)": "P(gxe2=0)"},
                                 inplace=True)
        final2 = final1.join(pivot_table_times, on='phen')
    else:
        final2 = final1.copy()
        final2["P(gxe2=0)"] = np.nan

    #Rename 'phen' and select final columns
    final2.rename(columns={"phen": "phenotype"}, inplace=True)
    final3 = final2[[
        "phenotype", "h2uncorr", "SE (h2uncorr)", "h2corr", "SE (h2corr)",
        "P (diff=0)", "e2", "SE (e2)", "P(e2=0)", "alpha", "alpha_gxe2",
        "gxe2", "SE (gxe2)", "P(gxe2=0)"
    ]].copy()

    #Sort the phenotypes case-insensitively
    final3['lower'] = [pheno_one.lower() for pheno_one in final3.phenotype]
    final3.sort_values(['lower'], inplace=True)
    del final3['lower']

    if cache_folder is not None:
        summary_final_table_fn = "{0}/summary.final.{1}.{2}.{3}.{4}.txt".format(
            cache_folder, pheno.sid_count, jackknife_count_actual,
            permute_plus_count, permute_times_count)
        _write_csv(final3, False, summary_final_table_fn)

    return final3
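
A hedged sketch of how the returned summary table might be consumed downstream; the column names are the ones selected above, and the 0.05 threshold is only illustrative:

significant_e2 = final3[final3["P(e2=0)"] < 0.05]
print(significant_e2[["phenotype", "e2", "SE (e2)", "P(e2=0)", "gxe2", "P(gxe2=0)"]])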
Code example #24
0
    do_plot = False
    from pysnptools.util import snp_gen
    from pysnptools.standardizer import Unit

    seed = 0
    N = 5000

    #Generate SNPs
    snpdata = snp_gen(fst=.1,
                      dfr=0,
                      iid_count=N,
                      sid_count=1000,
                      chr_count=10,
                      label_with_pop=True,
                      seed=seed)
    K_causal = snpdata.read_kernel(Unit()).standardize()

    if do_plot:
        import pylab  # matplotlib's pylab interface; only needed when plotting
        pylab.suptitle("$K_{causal}$")
        pylab.imshow(K_causal.val, cmap=pylab.gray(), vmin=0, vmax=1)
        pylab.show()

    import numpy as np
    from pysnptools.snpreader import SnpData

    distance_between_centers = 2500000
    x0 = distance_between_centers * 0.5
    x1 = distance_between_centers * 1.5
    y0 = distance_between_centers
    y1 = distance_between_centers
    sd = distance_between_centers / 4.
Code example #25
0
from pysnptools.snpreader import Bed

# In one line:
snpdata = Bed("all.bed").read().standardize()

# Beta standardization
from pysnptools.standardizer import Beta

snpdataB = Bed("all.bed").read().standardize(Beta(1, 25))
print(snpdataB.val)
#[[  7.40112054e-01   7.15532756e-01  -5.02003205e-04 ...,   4.40649336e-03   -1.13331663e-06   1.87525732e-01]
# [  7.40112054e-01   7.15532756e-01  -5.02003205e-04 ...,   4.40649336e-03   -1.34519756e-05   1.87525732e-01]
# ...

# To create a kernel (the relatedness of each iid pair as the dot product of their standardized SNP values)
from pysnptools.standardizer import Unit

kerneldata = Bed("all.bed").read_kernel(standardizer=Unit())
print(kerneldata.val)
#array([[ 5081.6121922 ,   253.32922313,   165.9842232 , ...,  -130.76998392,  -298.66392286,  -287.66887036],
#       [  253.32922313,  5061.87849635,   384.04149913, ...,  -334.33599388,  -127.02308706,  -291.41483161]
#
#...

# Low memory:
kerneldata = Bed("all.bed").read_kernel(standardizer=Unit(), block_size=500)

# Summary
#  Standardization
#     default Unit - mean 0, stdev 1, THEN fill with 0
#     In place, and returns self
#     Other standardizers: Beta, Unit, DiagKtoN
#  Kernels
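
# A small hedged sketch of the "in place, and returns self" behavior summarized above
# (it reuses the illustrative "all.bed" file name from earlier in this example):
snpdata = Bed("all.bed").read()
same_object = snpdata.standardize()   # Unit standardization, applied in place
assert same_object is snpdata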
Code example #26
0
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None,
                 count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. 
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example,
           `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_. 
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Defaults to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'.
                If given and the file does exist, then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with.
            Allows for interaction testing (interact_with_snp x snp will be tested).
            Default: None.

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: `SnpReader <http://fastlmm.github.io.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param runner: a `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP, round(results_dataframe.iloc[0].PValue, 7), len(results_dataframe))
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else: 
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else list(covar_by_chrom.values()))

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=cache_file_chrom,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
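
A minimal usage sketch of the API documented above. It reuses the toydata file names from the doctest; the explicit K0 and the output file name are only illustrative:

import logging
from pysnptools.snpreader import Bed
from fastlmm.association import single_snp

logging.basicConfig(level=logging.INFO)
test_snps = Bed("../feature_selection/examples/toydata.5chrom", count_A1=False)
results = single_snp(test_snps=test_snps,
                     K0=test_snps,      # similarity matrix built from the test SNPs themselves
                     pheno="../feature_selection/examples/toydata.phe",
                     leave_out_one_chrom=True,  # recommended cross validation over chromosomes
                     output_file_name="single_snp.results.txt",
                     count_A1=False)
print(results[["SNP", "PValue"]].head())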
Code example #27
0
File: snpmemmap.py Project: fastlmm/PySnpTools
        if Path(memmap_file).exists():
            Path(memmap_file).unlink()

        #######
        # Merge the input files
        ######
        merge = _MergeSIDs([
            Bed(bed_file,
                fam_filename=fam_file,
                bim_filename=bim_file,
                count_A1=True,
                skip_format_check=True) for bed_file, fam_file, bim_file in
            zip(bed_file_list, fam_file_list, bim_file_list)
        ])

        # memmap = _bed_to_memmap2(merge,memmap_file=memmap_file,dtype='float32',step=10)
        from pysnptools.standardizer import Unit
        memmap = SnpMemMap.write(memmap_file,
                                 merge,
                                 standardizer=Unit(),
                                 dtype='float32')
        memmap
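
        # (Hedged sketch) The merged file written above can later be re-opened lazily
        # with pysnptools' SnpMemMap reader, e.g.:
        #     from pysnptools.snpreader import SnpMemMap
        #     snpdata = SnpMemMap(memmap_file).read()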

    suites = getTestSuite()
    r = unittest.TextTestRunner(failfast=True)
    ret = r.run(suites)
    assert ret.wasSuccessful()

    result = doctest.testmod(optionflags=doctest.ELLIPSIS)
    assert result.failed == 0, "failed doc test: " + __file__
Code example #28
0
File: pairs.py Project: eric-czech/PySnpTools
                snpdata = pairs.read()#
                #print(snpdata.val)

    import datetime
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.standardizer import Unit
    from pysnptools.util.mapreduce1.runner import LocalMultiProc
    from pysnptools.util.mapreduce1 import map_reduce
    #runner=None
    runner = LocalMultiProc(1,just_one_process=False)

    part_pair_count = (part_count*part_count+part_count)//2
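    # i.e., the number of unordered pairs of parts, including self-pairs: n*(n+1)//2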
    part_pair_index = -1
    print("part_pair_count={0:,}".format(part_pair_count))

    K0 = SnpKernel(synbed,standardizer=Unit()).read() #Precompute the similarity

    start_time = datetime.datetime.now()
    for i,part_i in enumerate(part_list):
        def mapper1(j):
            #from fastlmm.association import single_snp
            #from pysnptools.snpreader import Pairs
            #print('Z')
            #part_j = part_list[j]
            #print('A')
            print("Looking at pair {0},{1} which is {2} of {3}".format(i,j,part_pair_index+j+1,part_pair_count))
            #pairs = Pairs(part_i) if i==j else Pairs(part_i,part_j)
            #result_df_ij = single_snp(pairs, K0=K0, pheno=pheno_fn, covar=cov_fn, leave_out_one_chrom=False, count_A1=True)
            #print(result_df_ij[:1])
            #return result_df_ij
Code example #29
0
    def blocking(self,
                 snpreader,
                 cov_fn=None,
                 num_pcs=0,
                 output_prefix=None,
                 strategy="lmm_full_cv"):
        """
        compare three different cases:

        To control memory use, we've introduced a parameter called "num_snps_in_memory", which defaults to 100000. 
        Here are the interesting cases to consider (and choose num_snps_in_memory accordingly):

        1) num_snps_in_memory > total_num_snps

           In this case, the same code as before should be 
           executed (except the kernel matrix on all SNPs is now cached). 


        2) num_snps_in_memory < total_num_snps
            num_snps_in_memory > k (excluding all_snps)

            Here, the linear regression will be blocked, 
            while the data for cross-validation is cached, 
            saving time for loading and re-indexing.


        3) num_snps_in_memory < total_num_snps
            num_snps_in_memory < k (excluding all_snps)

            Finally, both operations - linear regression 
            and building the kernel will be blocked.

        4,5,6) Same as #1,2,3, but with a pheno that has extra iids and whose iids are shuffled.


        """

        # set up grid
        ##############################
        num_steps_delta = 5
        num_folds = 2

        # a rough log-scale grid of k values, plus all SNPs
        k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
        delta_values = np.logspace(-3,
                                   3,
                                   endpoint=True,
                                   num=num_steps_delta,
                                   base=np.exp(1))

        random_state = 42

        # case 1
        fss_1 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=20000)
        best_k_1, best_delta_1, best_obj_1, best_snps_1 = fss_1.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        #some misc testing
        import PerformSelectionDistributable as psd
        perform_selection_distributable = psd.PerformSelectionDistributable(
            fss_1,
            k_values,
            delta_values,
            strategy,
            output_prefix,
            select_by_ll=True,
            penalty=0.0)
        self.assertEqual(perform_selection_distributable.work_count, 3)
        s = perform_selection_distributable.tempdirectory
        s = str(perform_selection_distributable)
        s = "%r" % perform_selection_distributable
        from fastlmm.feature_selection.feature_selection_cv import GClass
        s = "%r" % GClass.factory(snpreader, 1000000, Unit(), 50)
        s = s
        #!! TODO: make a test for each break point.

        # case 2
        fss_2 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=5000)
        best_k_2, best_delta_2, best_obj_2, best_snps_2 = fss_2.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 3
        fss_3 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=600)
        best_k_3, best_delta_3, best_obj_3, best_snps_3 = fss_3.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 4
        fss_4 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=20000)
        best_k_4, best_delta_4, best_obj_4, best_snps_4 = fss_4.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 5
        fss_5 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=5000)
        best_k_5, best_delta_5, best_obj_5, best_snps_5 = fss_5.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        # case 6
        fss_6 = FeatureSelectionStrategy(snpreader,
                                         self.pheno_shuffleplus_fn,
                                         num_folds,
                                         cov_fn=cov_fn,
                                         random_state=random_state,
                                         num_pcs=num_pcs,
                                         interpolate_delta=True,
                                         num_snps_in_memory=600)
        best_k_6, best_delta_6, best_obj_6, best_snps_6 = fss_6.perform_selection(
            k_values,
            delta_values,
            output_prefix=output_prefix,
            select_by_ll=True,
            strategy=strategy)

        self.assertEqual(int(best_k_1), int(best_k_2))
        self.assertEqual(int(best_k_1), int(best_k_3))
        #self.assertEqual(int(best_k_1), int(best_k_4))
        #self.assertEqual(int(best_k_1), int(best_k_5))
        #self.assertEqual(int(best_k_1), int(best_k_6))
        self.assertAlmostEqual(best_obj_1, best_obj_2)
        self.assertAlmostEqual(best_obj_1, best_obj_3)
        #self.assertAlmostEqual(best_obj_1, best_obj_4)
        self.assertAlmostEqual(best_obj_4, best_obj_5)
        self.assertAlmostEqual(best_obj_4, best_obj_6)

        if strategy != "insample_cv":
            self.assertAlmostEqual(best_delta_1, best_delta_2)
            self.assertAlmostEqual(best_delta_1, best_delta_3)
            #self.assertAlmostEqual(best_delta_1, best_delta_4)
            self.assertAlmostEqual(best_delta_4, best_delta_5)
            self.assertAlmostEqual(best_delta_4, best_delta_6)
Code example #30
0
            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info(
                    "Working on GWAS_1K and k search, chrom={0}, i_fold={1}".
                    format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx, :]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(
                    G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal,
                                                      G_for_chrom.iid_count,
                                                      min_count)
                K_whole_unittrain = _SnpWholeWithTrain(
                    whole=G_for_chrom,
                    train_idx=train_idx,
                    standardizer=Unit(),
                    block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,
                                      G_for_chrom.iid), "real assert"
                K_train = K_whole_unittrain[train_idx]

                single_snp_result = single_snp(
                    test_snps=G_train,
                    K0=K_train,
                    pheno=pheno,  # iid intersection means we can give the whole covariate and pheno
                    covar=covar,
                    leave_out_one_chrom=False,
                    GB_goal=GB_goal,
                    force_full_rank=force_full_rank,
                    force_low_rank=force_low_rank,
                    mixing=mixing,
                    h2=h2,
                    count_A1=count_A1)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in = [0] + [
                    int(k)
                    for k in k_list if 0 < k and k < len(single_snp_result)
                ]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,
                                            G_for_chrom.sid_to_index(
                                                single_snp_result.SNP[:k])]
                        logging.info(
                            "Working on chr={0}, i_fold={1}, and K_{2}".format(
                                test_chr, i_fold, k))

                        top_k_train = top_k[train_idx, :] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank,
                                          force_low_rank=force_low_rank,
                                          GB_goal=GB_goal)
                        fastlmm.fit(
                            K0_train=K_train,
                            K1_train=top_k_train,
                            X=covar,
                            y=pheno,
                            mixing=mixing,
                            h2raw=h2
                        )  # iid intersection means we can give the whole covariate and pheno

                        top_k_test = top_k[test_idx, :] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:, test_idx]
                        nLL = fastlmm.score(
                            K0_whole_test=K0_whole_test,
                            K1_whole_test=top_k_test,
                            X=covar,
                            y=pheno
                        )  # iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None

                return k_list_in, top_snps, k_index_to_nLL