def _fixup(test_snps, G, pheno, covar, count_A1=None):
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    G = _snps_fixup(G or test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]  # keep only iids with a non-missing phenotype (NaN != NaN)
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)
    G, test_snps, pheno, covar = pstutil.intersect_apply([G, test_snps, pheno, covar])
    return test_snps, G, pheno, covar
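# A hedged usage sketch (not part of the library) of the iid alignment that _fixup relies on:
# pysnptools' intersect_apply reorders and intersects readers on their iids. The toy SnpData
# values below are made up for illustration.
def _sketch_intersect_apply():
    import numpy as np
    import pysnptools.util as pstutil
    from pysnptools.snpreader import SnpData

    snps = SnpData(iid=[["f1", "c1"], ["f2", "c2"], ["f3", "c3"]],
                   sid=["snp1"], val=np.array([[0.0], [1.0], [2.0]]))
    pheno = SnpData(iid=[["f3", "c3"], ["f1", "c1"]],
                    sid=["pheno1"], val=np.array([[2.5], [0.5]]))
    snps_aligned, pheno_aligned = pstutil.intersect_apply([snps, pheno])
    assert np.array_equal(snps_aligned.iid, pheno_aligned.iid)  # same iids, same order
    return snps_aligned, pheno_aligned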
def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None,
          iid_if_none=None, return_mse_too=False, count_A1=None):
    """
    Method for calculating the negative log likelihood of testing examples.
    If the examples in X, y, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__
        (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
        `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param y: testing phenotype:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__
        (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
        `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_whole_test: Must be None. Represents the identity similarity matrix.
    :type K0_whole_test: None

    :param K1_whole_test: Must be None. Represents the identity similarity matrix.
    :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or
        `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :param return_mse_too: If true, will also return the mean squared error.
    :type return_mse_too: bool

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the
         default will change to True.
    :type count_A1: bool

    :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        mean0, covar0 = self.predict(K0_whole_test=K0_whole_test, K1_whole_test=K1_whole_test,
                                     X=X, iid_if_none=iid_if_none, count_A1=count_A1)
        y = _pheno_fixup(y, iid_if_none=covar0.iid, count_A1=count_A1)
        mean, covar, y = intersect_apply([mean0, covar0, y])
        var = multivariate_normal(mean=mean.read(order='A', view_ok=True).val.reshape(-1),
                                  cov=covar.read(order='A', view_ok=True).val)
        y_actual = y.read().val
        nll = -np.log(var.pdf(y_actual.reshape(-1)))
        if not return_mse_too:
            return nll
        else:
            # read the predicted means before subtracting; the reader itself does not support arithmetic
            mse = ((y_actual - mean.read(order='A', view_ok=True).val) ** 2).sum()
            return nll, mse
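# The score above is just the negative log density of the observed phenotypes under a Gaussian
# with the predicted mean and covariance. A minimal numpy/scipy sketch of that computation,
# with toy numbers that are not from the library:
def _sketch_negative_log_likelihood():
    import numpy as np
    from scipy.stats import multivariate_normal

    mean = np.array([0.1, -0.2, 0.3])      # predicted phenotype means
    cov = np.eye(3) * 0.5                  # predicted covariance (here, iid noise)
    y_actual = np.array([0.0, -0.1, 0.4])  # observed phenotypes
    nll = -np.log(multivariate_normal(mean=mean, cov=cov).pdf(y_actual))
    mse = ((y_actual - mean) ** 2).sum()
    return nll, mse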
def _create_covar_chrom(covar, covar_by_chrom, chrom, count_A1=None):
    if covar_by_chrom is not None:
        covar_by_chrom_chrom = covar_by_chrom[chrom]
        covar_by_chrom_chrom = _pheno_fixup(covar_by_chrom_chrom, iid_if_none=covar, count_A1=count_A1)
        covar_after, covar_by_chrom_chrom = pstutil.intersect_apply([covar, covar_by_chrom_chrom])
        ret = SnpData(iid=covar_after.iid,
                      sid=np.r_[covar_after.sid, covar_by_chrom_chrom.sid],
                      val=np.c_[covar_after.read(order='A', view_ok=True).val,
                                covar_by_chrom_chrom.read(order='A', view_ok=True).val])  # view_ok because np.c_ will allocate new memory.
        return ret
    else:
        return covar
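# covar_by_chrom maps each chromosome to extra covariates that _create_covar_chrom appends
# column-wise to the shared covariates. A toy sketch of that concatenation (made-up arrays,
# illustration only):
def _sketch_covar_concat():
    import numpy as np

    shared = np.array([[1.0], [2.0], [3.0]])     # shared covariate column(s)
    per_chrom = np.array([[0.1], [0.2], [0.3]])  # chromosome-specific column(s)
    combined = np.c_[shared, per_chrom]          # iid_count x (shared + per-chrom) matrix
    assert combined.shape == (3, 2)
    return combined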
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Method for predicting from a fitted :class:`FastLMM` predictor.
    If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None. Represents the identity similarity matrix.
    :type K0_whole_test: None

    :param K1_whole_test: Must be None. Represents the identity similarity matrix.
    :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the
         default will change to True.
    :type count_A1: bool

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    assert K0_whole_test is None or isinstance(K0_whole_test, KernelIdentity)  # could also accept no snps
    assert K1_whole_test is None or isinstance(K1_whole_test, KernelIdentity)  # could also accept no snps

    X = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
    X = X.read().standardize(self.covar_unit_trained)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    X = SnpData(iid=X.iid,
                sid=FastLMM._new_snp_name(X),
                val=np.c_[X.read().val, np.ones((X.iid_count, 1))])
    assert np.array_equal(X.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    pheno_predicted = X.val.dot(self.beta).reshape(-1, 1)
    ret0 = SnpData(iid=X.iid, sid=self.pheno_sid, val=pheno_predicted,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")  # !!!replace 'parent_string' with 'name'

    from pysnptools.kernelreader import KernelData
    ret1 = KernelData(iid=X.iid, val=np.eye(X.iid_count) * self.ssres / self.iid_count)
    return ret0, ret1
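# The returned mean is the fitted linear model applied to the standardized test covariates
# (plus the bias column), and the returned covariance is a scaled identity. A toy numpy sketch
# of those two formulas (illustrative values only):
def _sketch_prediction_formulas():
    import numpy as np

    beta = np.array([0.5, -1.0, 2.0])                       # fitted weights; last entry is the offset
    X_test = np.c_[np.random.randn(4, 2), np.ones((4, 1))]  # covariates plus a column of 1's
    mean = X_test.dot(beta).reshape(-1, 1)                  # predicted phenotype means
    ssres, iid_count = 1.2, 100                             # training residual SS and sample size
    cov = np.eye(4) * ssres / iid_count                     # identity scaled by the noise estimate
    return mean, cov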
def single_snp(test_snps, pheno, K0=None, K1=None, mixing=None, covar=None, covar_by_chrom=None,
               leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
               cache_file=None, GB_goal=None, interact_with_snp=None,
               force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None, count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML.
    Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_,
           for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or
           `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a
           `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_ or a
           `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K0: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a
           `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_ or a
           `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K1: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
           If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_,
           for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or
           `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Defaults to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
           The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional. If not given will search for best value.
           If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of file to read or write cached precomputation values to, optional.
           If not given, no cache file will be used.
           If given and file does not exist, will write precomputation values to file.
           If given and file does exist, will read precomputation values from file.
           The file contains the U and S matrix from the decomposition of the training matrix.
           It is in Python's np.savez (\*.npz) format.
           Calls using the same cache file should have the same 'K0' and 'K1'.
           If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks
           the same size as the kernel, which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with.
           Allows for interaction testing (interact_with_snp x snp will be tested). Default: None.

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit
           iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank
           iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional:
           Tells how to run locally, multi-processor, or on a cluster. If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the
         default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"

    :Example:

    >>> import logging
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP, round(results_dataframe.iloc[0].PValue, 7), len(results_dataframe))
    null_576 1e-07 10000

    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit(), count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit(), count_A1=count_A1)
        K0, K1, test_snps, pheno, covar = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame = _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                 covar=covar, K1=K1,
                                 mixing=mixing, h2=h2, log_delta=log_delta,
                                 cache_file=cache_file,
                                 force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                 output_file_name=output_file_name, block_size=block_size,
                                 interact_with_snp=interact_with_snp,
                                 runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0, test_snps.sid_count), "Some SNP rows are missing from the output"
    else:
        chrom_list = list(set(test_snps.pos[:, 0]))  # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom = pstutil.intersect_apply(
                [K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal,
                                                             force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                             covar=covar_chrom, K1=K1_chrom,
                                             mixing=mixing, h2=h2, log_delta=log_delta, cache_file=cache_file_chrom,
                                             force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                             output_file_name=None, block_size=block_size,
                                             interact_with_snp=interact_with_snp,
                                             runner=Local())
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time() - t0))
            return frame

        frame = map_reduce(chrom_list,
                           mapper=nested_closure,
                           reducer=reducer_closure,
                           input_files=input_files,
                           output_files=[output_file_name],
                           name="single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                           runner=runner)

    return frame
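# Inside nested_closure, the per-chromosome split is plain boolean masking on the first column
# of pos (the chromosome number): the masked SNPs are tested while the rest build the kernel.
# A toy sketch of that selection pattern (made-up positions, illustration only):
def _sketch_leave_out_one_chrom():
    import numpy as np

    pos = np.array([[1, 0, 100], [1, 0, 200], [2, 0, 50]])  # chrom, genetic dist, bp position
    splits = []
    for chrom in sorted(set(pos[:, 0])):
        test_mask = pos[:, 0] == chrom  # SNPs tested on this chromosome
        kernel_mask = ~test_mask        # SNPs left for the similarity kernel
        splits.append((chrom, int(test_mask.sum()), int(kernel_mask.sum())))
    return splits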
def single_snp_all_plus_select(test_snps, pheno, G=None, covar=None,
                               k_list=None,
                               n_folds=10,  # 1 is special and means test on train
                               seed=0, output_file_name=None, GB_goal=None,
                               force_full_rank=False, force_low_rank=False,
                               mixing=None, h2=None, do_plot=False, runner=None, count_A1=None):
    """
    Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs.
    The second kernel is a similarity matrix constructed of the top *k* SNPs where the SNPs are ordered
    via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction.
    All work is done via 'leave_out_one_chrom': one chromosome is tested while the kernels are constructed
    from the other chromosomes. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_,
           for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or
           `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_,
           for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or
           `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param k_list: Values of *k* (in addition to 0) to test. Defaults to [1,2,4,8,...8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*.
           Defaults to 10.
    :type n_folds: number

    :param seed: (optional) Random seed used to generate permutations for lrt G0 fitting.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks
           the same size as the kernel, which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit
           iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank
           iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param mixing: A parameter to LMM learning telling how to combine the two kernels, optional.
           If not given will search for best value.
    :type mixing: number

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional.
           If not given will search for best value.
    :type h2: number

    :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k.
    :type do_plot: boolean

    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional:
           Tells how to run locally, multi-processor, or on a cluster. If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the
         default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"

    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_all_plus_select
    >>> from pysnptools.snpreader import Bed
    >>> from pysnptools.util.mapreduce1.runner import LocalMultiProc
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed",count_A1=False)[:,::100] #To make example faster, run on only 1/100th of the data
    >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5
    >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5), count_A1=False) #Run multiproc
    >>> print(results_dataframe.iloc[0].SNP, round(results_dataframe.iloc[0].PValue, 7), len(results_dataframe))
    null_9800 0.0793385 4

    """
    #=================================================
    # Start of definition of inner functions
    #=================================================
    def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds, seed, pheno, covar,
                                  force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal):
        #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1)))
        max_k = int(max(k_list))
        assert np.array_equal(G.iid, pheno.iid) and np.array_equal(G.iid, covar.iid), "real assert"

        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader

            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx, :]

                # Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
                K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom, train_idx=train_idx,
                                                       standardizer=Unit(), block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
                K_train = K_whole_unittrain[train_idx]

                single_snp_result = single_snp(test_snps=G_train, K0=K_train,
                                               pheno=pheno,  # iid intersection means we can give the whole covariate and pheno
                                               covar=covar, leave_out_one_chrom=False,
                                               GB_goal=GB_goal, force_full_rank=force_full_rank,
                                               force_low_rank=force_low_rank, mixing=mixing, h2=h2,
                                               count_A1=count_A1)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in = [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:, G_for_chrom.sid_to_index(single_snp_result.SNP[:k])]
                        logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

                        top_k_train = top_k[train_idx, :] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                          GB_goal=GB_goal)
                        fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar, y=pheno,
                                    mixing=mixing, h2raw=h2)  # iid intersection means we can give the whole covariate and pheno

                        top_k_test = top_k[test_idx, :] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:, test_idx]
                        nLL = fastlmm.score(K0_whole_test=K0_whole_test, K1_whole_test=top_k_test,
                                            X=covar, y=pheno)  # iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None

                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                # Starts fold_index+all -> k_index -> nll
                # Need: k_index -> sum(fold_index -> nll)
                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                # find the best number of top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(test_chr, best_k))
                if do_plot:
                    _nll_plot(k_list_in_all, k_index_to_sum_nll)

                # Return the top snps from all
                result = top_snps_all[:best_k]
                return result

            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True),
                mapper=mapper_gather_lots,
                reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL

        chrom_index_to_best_sid = map_reduce(chrom_list,
                                             nested=mapper_find_best_given_chrom,
                                             input_files=input_files,
                                             name="best snps for each chrom",
                                             runner=runner)
        return chrom_index_to_best_sid

    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid,
                               pheno, covar, force_full_rank, force_low_rank, mixing, h2,
                               output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]

            K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1,
                                pheno=pheno, covar=covar, leave_out_one_chrom=False,
                                GB_goal=GB_goal, force_full_rank=force_full_rank,
                                force_low_rank=force_low_rank, mixing=mixing, h2=h2,
                                count_A1=count_A1)
            return result

        def reducer_closure(frame_sequence):  # !!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
            return frame

        frame = map_reduce(chrom_list,
                           mapper=mapper_single_snp_2K_given_chrom,
                           reducer=reducer_closure,
                           input_files=input_files,
                           name="single_snp with two K's for all chroms",
                           runner=runner)
        return frame
    #=================================================
    # End of definition of inner functions
    #=================================================

    # !!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    G = _snps_fixup(G or test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)
    chrom_list = list(set(test_snps.pos[:, 0]))  # find the set of all chroms mentioned in test_snps, the main testing data
    G, test_snps, pheno, covar = pstutil.intersect_apply([G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]

    chrom_index_to_best_sid = _best_snps_for_each_chrom(chrom_list, common_input_files, runner, G,
                                                        n_folds, seed, pheno, covar,
                                                        force_full_rank, force_low_rank,
                                                        mixing, h2, k_list, GB_goal)

    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files, runner, G,
                                   chrom_index_to_best_sid, pheno, covar,
                                   force_full_rank, force_low_rank, mixing, h2,
                                   output_file_name, GB_goal)

    return frame
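# The k search above reduces to summing out-of-sample negative log likelihoods across folds
# for each candidate k and keeping the argmin. A toy numpy sketch of that reduction
# (made-up nLL values, illustration only):
def _sketch_best_k():
    import numpy as np

    k_list_in = [0, 1, 2, 4]                          # candidate numbers of top SNPs
    fold_nlls = np.array([[10.0, 9.5, 9.8, 10.2],     # fold 0: nLL per k
                          [11.0, 10.1, 10.6, 11.3]])  # fold 1: nLL per k
    k_index_to_sum_nll = fold_nlls.sum(axis=0)
    best_k = k_list_in[int(np.argmin(k_index_to_sum_nll))]
    return best_k  # 1 here: the k with the lowest summed nLL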
def single_snp_linreg(test_snps, pheno, covar=None, max_output_len=None, output_file_name=None, GB_goal=None, runner=None, count_A1=None): """ Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed. :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it should be the base name of a set of PLINK Bed-formatted files. (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header') :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__. If you give a string, it should be the file name of a PLINK phenotype-formatted file. Any IIDs with missing values will be removed. (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header') :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__. If you give a string, it should be the file name of a PLINK phenotype-formatted file. (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header') :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'. :type max_output_len: number :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text. :type output_file_name: file name :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count, which is memory efficient with little overhead on computation time. :type GB_goal: number :param runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster. If not given, the function is run locally. :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__ :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True. :type count_A1: bool :rtype: Pandas dataframe with one row per test SNP. 
Columns include "PValue" :Example: >>> import logging >>> import numpy as np >>> from fastlmm.association import single_snp_linreg >>> from pysnptools.snpreader import Bed >>> from fastlmm.util import example_file # Download and return local file name >>> logging.basicConfig(level=logging.INFO) >>> pheno_fn = example_file("fastlmm/feature_selection/examples/toydata.phe") >>> test_snps = example_file("fastlmm/feature_selection/examples/toydata.5chrom.*","*.bed") >>> results_dataframe = single_snp_linreg(test_snps=test_snps, pheno=pheno_fn, count_A1=False) >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)) null_576 1e-07 10000 """ with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _: assert test_snps is not None, "test_snps must be given as input" test_snps = _snps_fixup(test_snps, count_A1=count_A1) pheno = _pheno_fixup(pheno, count_A1=count_A1).read() assert pheno.sid_count == 1, "Expect pheno to be just one variable" pheno = pheno[(pheno.val == pheno.val)[:, 0], :] covar = _pheno_fixup(covar, iid_if_none=pheno.iid) test_snps, pheno, covar = pstutil.intersect_apply( [test_snps, pheno, covar]) logging.debug("# of iids now {0}".format(test_snps.iid_count)) if GB_goal is not None: bytes_per_sid = test_snps.iid_count * 8 sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid block_size = max(1, int(sid_per_GB_goal + .5)) block_count = test_snps.sid_count / block_size else: block_count = 1 block_size = test_snps.sid_count logging.debug("block_count={0}, block_size={1}".format( block_count, block_size)) #!!!what about missing data in covar, in test_snps, in y covar = np.c_[ covar.read(view_ok=True, order='A').val, np.ones((test_snps.iid_count, 1))] #view_ok because np.c_ will allocation new memory y = pheno.read( view_ok=True, order='A' ).val #view_ok because this code already did a fresh read to look for any missing values def mapper(start): logging.info( "single_snp_linereg reading start={0},block_size={1}".format( start, block_size)) snp_index = np.arange(start, min(start + block_size, test_snps.sid_count)) x = test_snps[:, start:start + block_size].read().standardize().val logging.info("single_snp_linereg linreg") _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar) logging.info("single_snp_linereg done") pval_in = pval_in.reshape(-1) if max_output_len is None: return pval_in, snp_index else: #We only need to return the top max_output_len results sort_index = np.argsort(pval_in)[:max_output_len] return pval_in[sort_index], snp_index[sort_index] def reducer(pval_and_snp_index_sequence): pval_list = [] snp_index_list = [] for pval, snp_index in pval_and_snp_index_sequence: pval_list.append(pval) snp_index_list.append(snp_index) pval = np.concatenate(pval_list) snp_index = np.concatenate(snp_index_list) sort_index = np.argsort(pval) if max_output_len is not None: sort_index = sort_index[:max_output_len] index = snp_index[sort_index] dataframe = pd.DataFrame(index=np.arange(len(index)), columns=('sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue')) #!!Is this the only way to set types in a dataframe? 
dataframe['sid_index'] = dataframe['sid_index'].astype(np.float64) #np.float was removed in NumPy 1.24+ dataframe['Chr'] = dataframe['Chr'].astype(np.float64) dataframe['GenDist'] = dataframe['GenDist'].astype(np.float64) dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float64) dataframe['PValue'] = dataframe['PValue'].astype(np.float64) dataframe['sid_index'] = index dataframe['SNP'] = np.array( test_snps.sid[index], dtype='str' ) #This will be ascii on Python2 and unicode on Python3 dataframe['Chr'] = test_snps.pos[index, 0] dataframe['GenDist'] = test_snps.pos[index, 1] dataframe['ChrPos'] = test_snps.pos[index, 2] dataframe['PValue'] = pval[sort_index] if output_file_name is not None: dataframe.to_csv(output_file_name, sep="\t", index=False) return dataframe dataframe = map_reduce(range(0, test_snps.sid_count, block_size), mapper=mapper, reducer=reducer, input_files=[test_snps, pheno, covar], output_files=[output_file_name], name="single_snp_linreg", runner=runner) return dataframe
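# --- Example (added) ---------------------------------------------------
# A minimal sketch of the GB_goal block-size arithmetic used above in
# single_snp_linreg. The helper name is hypothetical; the 8 bytes per
# value reflect float64 genotype storage.
def _example_block_size_for_goal(iid_count, GB_goal):
    """How many SNP columns fit in a GB_goal-gigabyte block.

    >>> _example_block_size_for_goal(500000, 2)
    537
    """
    bytes_per_sid = iid_count * 8  # one float64 per individual per SNP
    sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
    return max(1, int(sid_per_GB_goal + .5))  # round to nearest, at least 1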
def heritability_spatial_correction(G_kernel, spatial_coor, spatial_iid, alpha_list, alpha_power, pheno, map_function=map, cache_folder=None, jackknife_count=500, permute_plus_count=10000, permute_times_count=10000, seed=0, just_testing=False, always_remote=False, allow_gxe2=True, count_A1=None): """ Function measuring heritability with correction for spatial location. :param G_kernel: A kernel that tells the genetic similarity between all pairs of individuals. The kernel can be given explicitly, for example with a :class:`.KernelData`. The kernel can also be given implicitly by providing a set of SNPs or the name of a BED file. :type G_kernel: a `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__, `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string :param spatial_coor: The position of each individual given by two coordinates. Any units are allowed, but the two values must be compatible so that distance can be determined via Pythagoras' theorem. (So, longitude and latitude should not be used unless the locations are near the Equator.) :type spatial_coor: an iid_count x 2 array :param spatial_iid: A ndarray of the iids. Each iid is a ndarray of two strings (a family ID and a case ID) that identifies an individual. :type spatial_iid: array of strings with shape [iid_count,2] :param alpha_list: a list of numbers to search to find the best alpha, which is the similarity scale. The similarity of two individuals is here defined as exp(-(distance_between/alpha)**alpha_power). If the closest individuals are 100 units apart and the farthest individuals are 4e6 units apart, a reasonable alpha_list might be: [int(v) for v in np.logspace(np.log10(100),np.log10(1e10), 100)] The function reports on the alphas chosen. If an extreme alpha is picked, change alpha_list to cover more range. :type alpha_list: list of numbers :param alpha_power: 2 (a good choice) means that similarity goes with area. 1 means with distance. :type alpha_power: number :param pheno: The target value(s) to predict. It can be a file name readable via `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or string :param cache_folder: (default 'None') The name of a directory in which to save intermediate results. If 'None', then no intermediate results are saved. :type cache_folder: a string :param map_function: (default 'map') A function with the same inputs and functionality as Python's 'map' function. Can be used to run 'heritability_spatial_correction' on a cluster. :type map_function: a function :param jackknife_count: (default 500) The number of jackknife groups to use when calculating standard errors (SE). Changing to a small number, 2, speeds up calculation at the cost of unusable SEs. :type jackknife_count: number :param permute_plus_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, speeds up calculation at the cost of unusable P values. :type permute_plus_count: number :param permute_times_count: (default 10000) The number of permutations used when calculating P values. Changing to a small number, 1, speeds up calculation at the cost of unusable P values. :type permute_times_count: number :param seed: (default 0) The random seed used by jackknifing and permutation.
:type seed: number :param just_testing: (default False) If true, skips actual LMM-related search and calculation. :type just_testing: bool :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True. :type count_A1: bool :rtype: Pandas dataframe with one row per phenotype. Columns include "h2uncorr", "h2corr", etc. """ ###################### # Prepare the inputs ###################### from fastlmm.inference.fastlmm_predictor import _kernel_fixup, _pheno_fixup G_kernel = _kernel_fixup( G_kernel, iid_if_none=None, standardizer=Unit(), count_A1=count_A1 ) # Create a kernel from an in-memory kernel, some snps, or a text file. pheno = _pheno_fixup( pheno, iid_if_none=G_kernel.iid, missing='NA', count_A1=count_A1 ) # Create phenotype data from in-memory data or a text file. if cache_folder is not None: pstutil.create_directory_if_necessary(cache_folder, isfile=False) jackknife_seed = seed or 1954692566 permute_plus_seed = seed or 2372373100 permute_times_seed = seed or 2574440128 ###################### # Find 'alpha', the scale for distance ###################### # create the alpha table (unless it is already there) alpha_table_fn = "{0}/alpha_table.{1}.txt".format( cache_folder, pheno.sid_count) # create a name for the alpha_table cache file phen_target_array = np.array(pheno.sid, dtype='str') if cache_folder is not None and os.path.exists(alpha_table_fn): alpha_table = pd.read_csv(alpha_table_fn, delimiter='\t', index_col=False, comment=None) else: # create the list of arguments to run arg_list = [] for phen_target in phen_target_array: pheno_one = pheno[:, pheno.col_to_index( [phen_target])] # Look at only this pheno_target for alpha in alpha_list: #pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = ( pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (-1, 0, None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1, 0, None), (-1, 0, None), just_testing, False, True and allow_gxe2, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function( work_item, arg_list) if len(arg_list) > 1 or always_remote else list( map(work_item, arg_list)) return_list = [line for line in return_list if line is not None] #Remove 'None' results alpha_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(alpha_table, False, alpha_table_fn) # read the alpha table and find the best values grouped = alpha_table.groupby("phen") alpha_dict = {} for phen, phen_table in grouped: best_index_corr = phen_table['nLLcorr'].idxmin( ) # with Pandas, this returns the index in the parent table, not the group table best_index_gxe2 = phen_table['nLL_gxe2'].idxmin() if allow_gxe2 else 0 alpha_corr = alpha_table.iloc[best_index_corr]['alpha'] alpha_gxe2 = alpha_table.iloc[best_index_gxe2]['alpha'] alpha_dict[phen] = alpha_corr, alpha_gxe2 logging.info(alpha_dict) ###################### # Use jackknifing to compute h2uncorr, SE, h2corr, SE, e2, SE, gxe2, SE ###################### jackknife_count_actual = min(jackknife_count, G_kernel.iid_count) # Set up the run and do it (unless it has already been run) jackknife_table_fn = 
"{0}/jackknife.{1}.count{2}.txt".format( cache_folder, pheno.sid_count, jackknife_count_actual) if cache_folder is not None and os.path.exists(jackknife_table_fn): jackknife_table = pd.read_csv(jackknife_table_fn, delimiter='\t', index_col=False, comment=None) else: arg_list = [] for phen_target in phen_target_array: pheno_one = pheno[:, pheno.col_to_index( [phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] alpha_set = set([ alpha_corr, alpha_gxe2 ]) #If these are the same, then only need to do half the work for alpha in alpha_set: logging.debug(alpha) do_uncorr = (alpha == alpha_corr) do_gxe2 = (alpha == alpha_gxe2) and allow_gxe2 for jackknife in range(-1, jackknife_count_actual): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = ( pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife, jackknife_count_actual, jackknife_seed), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1, 0, None), (-1, 0, None), just_testing, do_uncorr, do_gxe2, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function( work_item, arg_list) if len(arg_list) > 1 or always_remote else list( map(work_item, arg_list)) return_list = [line for line in return_list if line is not None] #Remove 'None' results jackknife_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(jackknife_table, False, jackknife_table_fn) # get the real (that is, unjackknifed) values jackknife_table[ "diff"] = jackknife_table.h2uncorr - jackknife_table.h2corr # Compute the diff = h2uncorr-h2corr column results_both = jackknife_table[ jackknife_table.jackknife_index == -1] # Create a table of the real (non-jackknifed) results for both alphas (which may be the same) del results_both["jackknife_index"] results_corr = results_both[results_both.alpha == [ alpha_dict[phen][0] for phen in results_both.phen ]] #Create version for g+e's alpha results_gxe2 = results_both[results_both.alpha == [ alpha_dict[phen][1] for phen in results_both.phen ]] #Create version for gxe's alpha #remove unwanted columns for delcol in [ "a2_gxe2", "gxe2", "nLL_gxe2", "permute_plus_count", "permute_plus_index", "permute_plus_seed", "permute_times_count", "permute_times_index", "permute_times_seed", "jackknife_count", "jackknife_seed" ]: del results_corr[delcol] for delcol in [ "a2", "e2", "h2corr", "h2uncorr", "nLLcorr", "nLLuncorr", "diff", "permute_plus_count", "permute_plus_index", "permute_plus_seed", "permute_times_count", "permute_times_index", "permute_times_seed", "jackknife_count", "jackknife_seed" ]: del results_gxe2[delcol] if jackknife_count_actual > 0: #Use a pivottable to compute the jackknifed SE's corr_rows = np.logical_and( jackknife_table.jackknife_index != -1, jackknife_table.alpha == [ alpha_dict[phen][0] for phen in jackknife_table.phen ]) jk_table_corr = pd.pivot_table( jackknife_table[corr_rows], values=['h2uncorr', 'h2corr', 'diff', 'e2'], index=['phen'], columns=[], aggfunc=np.std) jk_table_corr["h2uncorr SE"] = jk_table_corr["h2uncorr"] * np.sqrt( jackknife_count_actual - 1) jk_table_corr["h2corr SE"] = jk_table_corr["h2corr"] * np.sqrt( jackknife_count_actual - 1) jk_table_corr["diff SE"] = jk_table_corr["diff"] * np.sqrt( jackknife_count_actual - 1) jk_table_corr["e2 SE"] = jk_table_corr["e2"] * np.sqrt( 
jackknife_count_actual - 1) del jk_table_corr["h2uncorr"] del jk_table_corr["h2corr"] del jk_table_corr["diff"] del jk_table_corr["e2"] gxe2_rows = np.logical_and( jackknife_table.jackknife_index != -1, jackknife_table.alpha == [ alpha_dict[phen][1] for phen in jackknife_table.phen ]) jk_table_gxe2 = pd.pivot_table(jackknife_table[gxe2_rows], values=['gxe2'], index=['phen'], columns=[], aggfunc=np.std) jk_table_gxe2["gxe2 SE"] = jk_table_gxe2["gxe2"] * np.sqrt( jackknife_count_actual - 1) del jk_table_gxe2["gxe2"] #Join the SE's to the main results table results_corr = results_corr.join(jk_table_corr, on='phen') results_gxe2 = results_gxe2.join(jk_table_gxe2, on='phen') else: for col in ['h2uncorr SE', 'h2corr SE', 'diff SE', 'e2 SE']: results_corr[col] = np.nan results_gxe2['gxe2 SE'] = np.nan #compute pValue columns results_corr["P (diff=0)"] = stats.t.sf( results_corr["diff"] / results_corr["diff SE"], df=jackknife_count_actual - 1) * 2 #two sided results_corr["from SE, one-sided, P (e2=0)"] = stats.t.sf( results_corr["e2"] / results_corr["e2 SE"], df=jackknife_count_actual - 1) results_gxe2["from SE, one-sided, P (gxe2=0)"] = stats.t.sf( results_gxe2["gxe2"] / results_gxe2["gxe2 SE"], df=jackknife_count_actual - 1) #one sided if cache_folder is not None: _write_csv( results_corr, False, "{0}/jackknife_corr_summary.{1}.jackknife{2}.txt".format( cache_folder, pheno.sid_count, jackknife_count_actual)) _write_csv( results_gxe2, False, "{0}/jackknife_gxe2_summary.{1}.jackknife{2}.txt".format( cache_folder, pheno.sid_count, jackknife_count_actual)) ###################### # compute p(e2=0) via permutation ###################### permplus_table_fn = "{0}/permutation.GPlusE.{1}.count{2}.txt".format( cache_folder, pheno.sid_count, permute_plus_count) if cache_folder is not None and os.path.exists(permplus_table_fn): permplus_table = pd.read_csv(permplus_table_fn, delimiter='\t', index_col=False, comment=None) else: arg_list = [] for phen_target in phen_target_array: pheno_one = pheno[:, pheno.col_to_index( [phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] for jackknife_index in range(-1, permute_plus_count): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = ( pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_corr, alpha_power, (-1, 0, None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (jackknife_index, permute_plus_count, permute_plus_seed), (-1, 0, None), just_testing, False, False, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function( work_item, arg_list) if len(arg_list) > 1 or always_remote else list( map(work_item, arg_list)) return_list = [line for line in return_list if line is not None] #Remove 'None' results permplus_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(permplus_table, False, permplus_table_fn) #Create a table of the real nLL for each pheno real_result_permplus = permplus_table[permplus_table.permute_plus_index == -1][['phen', 'nLLcorr']] real_result_permplus.rename(columns={'nLLcorr': 'nLLcorr_real'}, inplace=True) real_result_permplus.set_index(['phen'], inplace=True) # Create a table of the permutation runs and add the real nLL to each row perm_table = permplus_table[permplus_table.permute_plus_index != -1] result =
perm_table.join(real_result_permplus, on='phen') result['P(e2)'] = [ 1.0 if b else 0.0 for b in result.nLLcorr <= result.nLLcorr_real ] # create a column showing where the perm is better (or as good) as the real # Use pivottable to find the fraction of times when permutation is better pivot_table_plus = pd.pivot_table(result, values=['P(e2)'], index=['phen'], columns=[], aggfunc=np.mean) if cache_folder is not None: summary_permplus_table_fn = "{0}/summary.permutation.GPlusE.{1}.count{2}.txt".format( cache_folder, pheno.sid_count, permute_plus_count) _write_csv(pivot_table_plus, True, summary_permplus_table_fn) ################################################ # compute p(gxe2=0) via permutation ################################################ #Only process phenos for which gxe2 is not 0 nonzero = set(results_gxe2[results_gxe2.gxe2 != 0].phen) permtimes_phenotypes = set(phen_target_array) & nonzero #intersection permtimes_table_list = [] for phen_target in permtimes_phenotypes: permtimes_table_fn = "{0}/permutation.GxE/{1}.count{2}.txt".format( cache_folder, phen_target, permute_times_count) if cache_folder is not None and os.path.exists(permtimes_table_fn): permtime_results = pd.read_csv(permtimes_table_fn, delimiter='\t', index_col=False, comment=None) else: arg_list = [] pheno_one = pheno[:, pheno.col_to_index( [phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] a2 = float(permplus_table[(permplus_table.phen == phen_target) & (permplus_table.permute_plus_index == -1)]['a2'].iloc[0]) for permute_index in range(-1, permute_times_count): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (permute_index, permute_count, permute_seed), arg_tuple = ( pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_gxe2, alpha_power, (-1, 0, None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1, 0, None), (permute_index, permute_times_count, permute_times_seed), just_testing, False, allow_gxe2, a2) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function( work_item, arg_list) if len(arg_list) > 1 or always_remote else list( map(work_item, arg_list)) return_list = [line for line in return_list if line is not None] #Remove 'None' results permtime_results = pd.DataFrame(return_list) if cache_folder is not None: pstutil.create_directory_if_necessary(permtimes_table_fn) _write_csv(permtime_results, False, permtimes_table_fn) permtimes_table_list.append(permtime_results) if permtimes_table_list: #not empty permtimes_table = pd.concat(permtimes_table_list) logging.info(permtimes_table.head()) #Create a table of the real nLL for each pheno real_result_permtimes = permtimes_table[ permtimes_table.permute_times_index == -1][['phen', 'nLL_gxe2']] real_result_permtimes.rename(columns={'nLL_gxe2': 'nLL_gxe2_real'}, inplace=True) real_result_permtimes.set_index(['phen'], inplace=True) # Create a table of the permutation runs and add the real nLL to each row summary_permtimes_table_fn = "{0}/summary.permutation.GxE.{1}.count{2}.txt".format( cache_folder, len(permtimes_phenotypes), permute_times_count) perm_table = permtimes_table[permtimes_table.permute_times_index != -1] resultx = perm_table.join(real_result_permtimes, on='phen') resultx['P(gxe2)'] = [ 1.0 if b else 0.0 for b in resultx.nLL_gxe2 <= resultx.nLL_gxe2_real ] # create a column showing where the perm is better (or as good) as the
real # Use pivottable to find the fraction of times when permutation is better pivot_table_times = pd.pivot_table(resultx, values=['P(gxe2)'], index=['phen'], columns=[], aggfunc=np.mean) if cache_folder is not None: _write_csv(pivot_table_times, True, summary_permtimes_table_fn) ####################### # Create final table of results by combining the summary tables ####################### #Rename some columns results_corr.rename(columns={ "h2uncorr SE": "SE (h2uncorr)", "h2corr SE": "SE (h2corr)", "e2 SE": "SE (e2)" }, inplace=True) #Rename some columns and join results results_gxe2.rename(columns={ "alpha": "alpha_gxe2", "gxe2 SE": "SE (gxe2)", "h2corr_raw": "h2corr_raw_gxe2" }, inplace=True) del results_gxe2['alpha_power'] results_gxe2.set_index(["phen"], inplace=True) final0 = results_corr.join(results_gxe2, on='phen') #Rename some columns and join results pivot_table_plus.rename(columns={"P(e2)": "P(e2=0)"}, inplace=True) if len(pivot_table_plus) > 0: final1 = final0.join(pivot_table_plus, on='phen') else: final1 = final0.copy() final1['P(e2=0)'] = np.nan #Rename some columns and join results if permtimes_table_list and len(pivot_table_times) > 0: #not empty pivot_table_times.rename(columns={"P(gxe2)": "P(gxe2=0)"}, inplace=True) final2 = final1.join(pivot_table_times, on='phen') else: final2 = final1.copy() final2["P(gxe2=0)"] = np.nan #Rename 'phen' and select final columns final2.rename(columns={"phen": "phenotype"}, inplace=True) final3 = final2[[ "phenotype", "h2uncorr", "SE (h2uncorr)", "h2corr", "SE (h2corr)", "P (diff=0)", "e2", "SE (e2)", "P(e2=0)", "alpha", "alpha_gxe2", "gxe2", "SE (gxe2)", "P(gxe2=0)" ]].copy() #Sort the phenotypes by name final3['lower'] = [pheno_one.lower() for pheno_one in final3.phenotype] final3.sort_values(['lower'], inplace=True) del final3['lower'] if cache_folder is not None: summary_final_table_fn = "{0}/summary.final.{1}.{2}.{3}.{4}.txt".format( cache_folder, pheno.sid_count, jackknife_count_actual, permute_plus_count, permute_times_count) _write_csv(final3, False, summary_final_table_fn) return final3
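# --- Example (added) ----------------------------------------------------
# A usage sketch for heritability_spatial_correction, assuming it is
# importable from fastlmm.association (as in released versions) and using
# the toy files bundled with fastlmm. The tiny jackknife/permutation
# counts only make the demo fast; as the docstring warns, such small
# counts leave the SEs and P values unusable. The coordinates are made up.
def _example_heritability_spatial_correction():
    import numpy as np
    from pysnptools.snpreader import Bed
    from fastlmm.association import heritability_spatial_correction  # assumed import path
    from fastlmm.util import example_file  # downloads and returns a local file name

    bed_fn = example_file("fastlmm/feature_selection/examples/toydata.5chrom.*", "*.bed")
    pheno_fn = example_file("fastlmm/feature_selection/examples/toydata.phe")
    snps = Bed(bed_fn, count_A1=False)
    rng = np.random.RandomState(0)
    spatial_coor = rng.uniform(0, 4e6, size=(snps.iid_count, 2))  # fake locations
    alpha_list = [int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 5)]
    df = heritability_spatial_correction(
        snps, spatial_coor, snps.iid, alpha_list, alpha_power=2, pheno=pheno_fn,
        jackknife_count=2, permute_plus_count=1, permute_times_count=1,
        count_A1=False)
    return df[["phenotype", "h2uncorr", "h2corr"]]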
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None): """ Method for predicting from a fitted :class:`FastLMM` predictor. If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected. :param X: testing covariate information, optional: If you give a string, it should be the file name of a PLINK phenotype-formatted file. :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string. :param K0_whole_test: Must be None. Represents the identity similarity matrix. :type K0_whole_test: None :param K1_whole_test: Must be None. Represents the identity similarity matrix. :type K1_whole_test: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__ :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided. :type iid_if_none: an ndarray of two strings :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True. :type count_A1: bool :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a :class:`KernelData` of the covariance """ with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _: assert self.is_fitted, "Can only predict after predictor has been fitted" assert K0_whole_test is None or isinstance( K0_whole_test, KernelIdentity) # could also accept no snps assert K1_whole_test is None or isinstance( K1_whole_test, KernelIdentity) # could also accept no snps X = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1) X = X.read().standardize(self.covar_unit_trained) # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset X = SnpData(iid=X.iid, sid=FastLMM._new_snp_name(X), val=np.c_[X.read().val, np.ones((X.iid_count, 1))]) assert np.array_equal( X.sid, self.covar_sid ), "Expect covar sids to be the same in train and test." pheno_predicted = X.val.dot(self.beta).reshape(-1, 1) ret0 = SnpData(iid=X.iid, sid=self.pheno_sid, val=pheno_predicted, pos=np.array([[np.nan, np.nan, np.nan]]), name="linear regression Prediction" ) #!!!replace 'parent_string' with 'name' from pysnptools.kernelreader import KernelData ret1 = KernelData(iid=X.iid, val=np.eye(X.iid_count) * self.ssres / self.iid_count) return ret0, ret1
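# --- Example (added) ----------------------------------------------------
# A fit/predict round trip for this predictor, assuming it is the
# LinearRegression class exposed by fastlmm.inference; the in-memory
# SnpData inputs are made up.
def _example_fit_predict_round_trip():
    import numpy as np
    from pysnptools.snpreader import SnpData
    from fastlmm.inference import LinearRegression  # assumed import path

    rng = np.random.RandomState(0)
    iid = [["fam0", "iid{0}".format(i)] for i in range(50)]
    cov = SnpData(iid=iid, sid=["cov0"], val=rng.randn(50, 1))
    y = SnpData(iid=iid, sid=["pheno0"],
                val=2.0 * cov.val + 1.0 + 0.1 * rng.randn(50, 1))

    predictor = LinearRegression().fit(X=cov, y=y)
    mean, covariance = predictor.predict(X=cov)  # SnpData of means, KernelData of covariance
    # predictor.beta is approx. [2, 1] (slope on cov0, then the constant
    # offset), up to the Unit standardization applied to cov0.
    return predictor.beta, mean.val[:, 0]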
test_stat = test_stat.replace(r'[\[\] ]', '', regex=True) test_stat = pd.to_numeric(test_stat[0]) results_df['Full ID'] = results_df['Chr'].astype( 'str') + '_' + results_df['ChrPos'].astype('str') results_df = pd.concat( [results_df[['Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue']], test_stat], axis=1) results_df.columns = [ 'Chr', 'ChrPos', 'SNP', 'Full ID', 'PValue', 'F-test statistic' ] mybed = Bed(variants_to_test + '.bed') mysnpdata = mybed.read() pheno = _pheno_fixup(phenotype_data, count_A1=None).read() pheno = pheno.val[np.searchsorted(pheno.iid[:, 1], mysnpdata.iid[:, 1])] snpdata = mysnpdata.val diff = np.full(snpdata.shape[1], np.nan) #preallocate; on Python 3 a range() is not assignable maf = np.full(snpdata.shape[1], np.nan) n_alleles = np.full(snpdata.shape[1], np.nan) mean_major = np.full(snpdata.shape[1], np.nan) for i in range(snpdata.shape[1]): ref = [j for j, x in enumerate(snpdata[:, i]) if x == 2] alt = [j for j, x in enumerate(snpdata[:, i]) if x == 0] meanref = np.mean(pheno[ref]) meanalt = np.mean(pheno[alt]) if len(ref) > len(alt): diff[i] = meanref - meanalt maf[i] = float(len(alt)) / (len(ref) + len(alt)) n_alleles[i] = len(ref) + len(alt)
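# --- Example (added) ----------------------------------------------------
# A vectorized sketch of the per-SNP homozygote summary above, under the
# same assumptions (0/1/2 allele coding; only the two homozygote groups,
# coded 2 and 0, are compared). SNPs with no homozygotes yield NaN.
def _homozygote_summary(snpdata, pheno):
    import numpy as np
    pheno = np.asarray(pheno).reshape(-1)  # 1-D phenotype vector
    is_ref = snpdata == 2                  # homozygous, coded 2
    is_alt = snpdata == 0                  # homozygous, coded 0
    n_ref = is_ref.sum(axis=0)
    n_alt = is_alt.sum(axis=0)
    with np.errstate(invalid='ignore', divide='ignore'):
        mean_ref = (pheno[:, None] * is_ref).sum(axis=0) / n_ref
        mean_alt = (pheno[:, None] * is_alt).sum(axis=0) / n_alt
        maf = np.minimum(n_ref, n_alt) / (n_ref + n_alt)
    diff = np.where(n_ref > n_alt, mean_ref - mean_alt, mean_alt - mean_ref)
    return diff, maf, n_ref + n_alt
# e.g.: diff, maf, n_alleles = _homozygote_summary(snpdata, pheno)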
def single_snp_all_plus_select(test_snps, pheno, G=None, covar=None, k_list = None, n_folds=10, #1 is special and means test on train seed = 0, output_file_name = None, GB_goal=None, force_full_rank=False, force_low_rank=False, mixing=None, h2=None, do_plot=False, runner=None): """ Function performing single SNP GWAS based on two kernels. The first kernel is based on all SNPs. The second kernel is a similarity matrix constructed of the top *k* SNPs where the SNPs are ordered via the PValue from :meth:`.single_snp` and *k* is determined via out-of-sample prediction. All work is done via 'leave_out_one_chrom'; that is, one chromosome is tested and the kernels are constructed from the other chromosomes. Will reorder and intersect IIDs as needed. :param test_snps: SNPs to test. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files. (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header') :type test_snps: a :class:`.SnpReader` or a string :param pheno: A single phenotype: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`. If you give a string, it should be the file name of a PLINK phenotype-formatted file. Any IIDs with missing values will be removed. (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header') :type pheno: a :class:`.SnpReader` or a string :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps. Can be any :class:`.SnpReader`. If you give a string, it should be the base name of a set of PLINK Bed-formatted files. :type G: :class:`.SnpReader` or a string :param covar: covariate information, optional: Can be any :class:`.SnpReader`, for example, :class:`.Pheno` or :class:`.SnpData`. If you give a string, it should be the file name of a PLINK phenotype-formatted file. (For backwards compatibility, can also be a dictionary with keys 'vals', 'iid', 'header') :type covar: a :class:`.SnpReader` or a string :param k_list: Values of *k* (in addition to 0) to test. Defaults to [1,2,4,8,...,8192]. :type k_list: list of numbers :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Defaults to 10. :type n_folds: number :param seed: (optional) Random seed used to generate permutations for lrt G0 fitting. :type seed: number :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. :type output_file_name: file name :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel, which is memory efficient with little overhead on computation time. :type GB_goal: number :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True. :type force_full_rank: Boolean :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True. :type force_low_rank: Boolean :param mixing: A parameter to LMM learning telling how to combine the two kernels, optional. If not given, will search for the best value. :type mixing: number :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional. If not given, will search for the best value.
:type h2: number :param do_plot: If true, will plot, for each chrom, the negative loglikelihood vs k. :type do_plot: boolean :param runner: a runner, optional: Tells how to run locally, multi-processor, or on a cluster. If not given, the function is run locally. :type runner: a runner. :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue" :Example: >>> import logging >>> import numpy as np >>> from fastlmm.association import single_snp_all_plus_select >>> from pysnptools.snpreader import Bed >>> from pysnptools.util.mapreduce1.runner import LocalMultiProc >>> logging.basicConfig(level=logging.INFO) >>> pheno_fn = "../feature_selection/examples/toydata.phe" >>> snps = Bed("../feature_selection/examples/toydata.5chrom.bed")[:,::100] #To make example faster, run on only 1/100th of the data >>> chrom5_snps = snps[:,snps.pos[:,0]==5] # Test on only chrom5 >>> results_dataframe = single_snp_all_plus_select(test_snps=chrom5_snps,G=snps,pheno=pheno_fn,GB_goal=2,runner=LocalMultiProc(20,mkl_num_threads=5)) #Run multiproc >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)) null_9800 0.0793397 4 """ #================================================= # Start of definition of inner functions #================================================= def _best_snps_for_each_chrom(chrom_list, input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal): #logging.info("Doing GWAS_1K for each chrom and fold. Work_count={0}".format(len(chrom_list)*(n_folds+1))) max_k = int(max(k_list)) assert np.array_equal(G.iid,pheno.iid) and np.array_equal(G.iid,covar.iid), "real assert" def mapper_find_best_given_chrom(test_chr): G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader def mapper_gather_lots(i_fold_and_pair): i_fold, (train_idx, test_idx) = i_fold_and_pair logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold)) G_train = G_for_chrom[train_idx,:] #Precompute whole x whole standardized on train from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank) block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count) K_whole_unittrain = _SnpWholeWithTrain(whole=G_for_chrom,train_idx=train_idx, standardizer=Unit(), block_size=block_size).read() assert np.array_equal(K_whole_unittrain.iid,G_for_chrom.iid),"real assert" K_train = K_whole_unittrain[train_idx] single_snp_result = single_snp(test_snps=G_train, K0=K_train, pheno=pheno, #iid intersection means we can give the whole covariate and pheno covar=covar, leave_out_one_chrom=False, GB_goal=GB_goal, force_full_rank=force_full_rank, force_low_rank=force_low_rank, mixing=mixing, h2=h2) is_all = (i_fold == n_folds) if n_folds > 1 else True k_list_in = [0] + [int(k) for k in k_list if 0 < k and k < len(single_snp_result)] if is_all: top_snps = list(single_snp_result.SNP[:max_k]) else: top_snps = None if i_fold == n_folds: k_index_to_nLL = None else: k_index_to_nLL = [] for k in k_list_in: top_k = G_for_chrom[:,G_for_chrom.sid_to_index(single_snp_result.SNP[:k])] logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr,i_fold,k)) top_k_train = top_k[train_idx,:] if k > 0 else None fastlmm = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank,GB_goal=GB_goal) fastlmm.fit(K0_train=K_train, K1_train=top_k_train, X=covar,
y=pheno,mixing=mixing,h2=h2) #iid intersection means we can give the whole covariate and pheno top_k_test = top_k[test_idx,:] if k > 0 else None K0_whole_test = K_whole_unittrain[:,test_idx] nLL = fastlmm.score(K0_whole_test=K0_whole_test,K1_whole_test=top_k_test,X=covar,y=pheno) #iid intersection means we can give the whole covariate and pheno k_index_to_nLL.append(nLL) if i_fold > 0: k_list_in = None return k_list_in, top_snps, k_index_to_nLL def reducer_find_best(top_snps_and_k_index_to_nLL_sequence): #Starts fold_index+all -> k_index -> nll #Need: k_index -> sum(fold_index -> nll) k_index_to_sum_nll = None top_snps_all = None k_list_in_all = None for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(top_snps_and_k_index_to_nLL_sequence): if k_list_in is not None: assert k_list_in_all is None, "real assert" k_list_in_all = k_list_in k_index_to_sum_nll = np.zeros(len(k_list_in)) if top_snps is not None: assert top_snps_all is None, "real assert" top_snps_all = top_snps if k_index_to_nLL is not None: assert i_fold < n_folds or n_folds == 1, "real assert" for k_index, nLL in enumerate(k_index_to_nLL): k_index_to_sum_nll[k_index] += nLL #find best # top_snps best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)] logging.info("For chrom={0}, best_k={1}".format(test_chr,best_k)) if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll) #Return the top snps from all result = top_snps_all[:best_k] return result i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce( _kfold(G_for_chrom.iid_count, n_folds, seed, end_with_all=True), mapper=mapper_gather_lots, reducer=reducer_find_best) return i_fold_index_to_top_snps_and_k_index_to_nLL chrom_index_to_best_sid = map_reduce( chrom_list, nested=mapper_find_best_given_chrom, input_files=input_files, name="best snps for each chrom", runner=runner) return chrom_index_to_best_sid def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal): logging.info("Doing GWAS_2K for each chrom. 
    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]

            K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom, K0=G_for_chrom, K1=K1, pheno=pheno,
                                covar=covar, leave_out_one_chrom=False,
                                GB_goal=GB_goal, force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                mixing=mixing, h2=h2)
            return result

        def reducer_closure(frame_sequence):  #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)

            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))
            return frame

        frame = map_reduce(
            chrom_list,
            mapper=mapper_single_snp_2K_given_chrom,
            reducer=reducer_closure,
            input_files=input_files,
            name="single_snp with two K's for all chroms",
            runner=runner)
        return frame
    #=================================================
    # End of definition of inner functions
    #=================================================

    #!!!code similar to single_snp
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    if k_list is None:
        k_list = np.logspace(start=0, stop=13, num=14, base=2)

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps)
    G = _snps_fixup(G or test_snps)
    pheno = _pheno_fixup(pheno).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val == pheno.val)[:, 0], :]  # drop examples whose phenotype is missing (NaN != NaN)
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid)

    chrom_list = list(set(test_snps.pos[:, 0]))  # find the set of all chroms mentioned in test_snps, the main testing data
    G, test_snps, pheno, covar = pstutil.intersect_apply([G, test_snps, pheno, covar])
    common_input_files = [test_snps, G, pheno, covar]

    chrom_index_to_best_sid = _best_snps_for_each_chrom(chrom_list, common_input_files, runner, G, n_folds, seed, pheno, covar, force_full_rank, force_low_rank, mixing, h2, k_list, GB_goal)

    frame = _gwas_2k_via_loo_chrom(test_snps, chrom_list, common_input_files, runner, G, chrom_index_to_best_sid, pheno, covar, force_full_rank, force_low_rank, mixing, h2, output_file_name, GB_goal)

    return frame
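# The helper below is a minimal, self-contained sketch (illustrative only, not
# library code) of the idea behind _SnpWholeWithTrain used above: per-SNP unit
# standardization statistics are estimated on the *training* rows only and then
# applied to every row before the whole-by-whole kernel is formed. The function
# name and the exact scaling are assumptions for illustration; the real reader
# also works block by block to respect the GB_goal memory budget.
def _sketch_whole_kernel_standardized_on_train(val, train_idx):
    """val: (iid_count, sid_count) ndarray of SNP codes; train_idx: training row indices."""
    import numpy as np
    train = val[train_idx, :]
    mean = train.mean(axis=0)         # per-SNP mean from train rows only
    std = train.std(axis=0)
    std[std == 0] = 1.0               # guard against monomorphic SNPs
    z = (val - mean) / std            # apply the train statistics to all rows
    return z.dot(z.T) / val.shape[1]  # whole x whole kernel, scaled by SNP count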
def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None, count_A1=None):
    """
    Method for training a :class:`LinearRegression` predictor. If the examples in X, y,
    K0_train, K1_train are not the same, they will be reordered and intersected.

    :param X: training covariate information, optional: If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param y: training phenotype: If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_train: Must be None. Represents the identity similarity matrix.
    :type K0_train: None

    :param K1_train: Must be None. Represents the identity similarity matrix.
    :type K1_train: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`__

    :param h2: Ignored. Optional.
    :type h2: number

    :param mixing: Ignored. Optional.
    :type mixing: number

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should
         count the number of A1 alleles (the PLINK standard) or the number of A2 alleles.
         False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: self, the fitted Linear Regression predictor
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        self.is_fitted = True
        assert K0_train is None  # could also accept the identity kernel or no snps
        assert K1_train is None  # could also accept the identity kernel or no snps

        assert y is not None, "y must be given"

        y = _pheno_fixup(y, count_A1=count_A1)
        assert y.sid_count == 1, "Expect y to be just one variable"

        X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

        X, y = intersect_apply([X, y])
        y = y.read()
        X, covar_unit_trained = X.read().standardize(
            self.covariate_standardizer, return_trained=True)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.val, np.ones((X.iid_count, 1))])

        lsqSol = np.linalg.lstsq(X.val, y.val[:, 0], rcond=-1)
        bs = lsqSol[0]   # weights
        r2 = lsqSol[1]   # sum of squared residuals
        D = lsqSol[2]    # rank of design matrix
        N = y.iid_count

        self.beta = bs
        self.ssres = float(r2)
        self.sstot = ((y.val - y.val.mean())**2).sum()
        self.covar_unit_trained = covar_unit_trained
        self.iid_count = X.iid_count
        self.covar_sid = X.sid
        self.pheno_sid = y.sid
        return self
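# A minimal sketch (assumed helper name, not part of the library) of the
# ordinary-least-squares math that fit() performs above: append a column of
# ones for the constant offset, solve with np.linalg.lstsq, and keep the
# weights and residual sum of squares for later prediction and scoring.
def _sketch_ols_fit(X_val, y_val):
    """X_val: (n, p) covariate ndarray; y_val: (n,) phenotype ndarray."""
    import numpy as np
    X1 = np.c_[X_val, np.ones((X_val.shape[0], 1))]  # add the constant-offset column
    beta, ssres, rank, _ = np.linalg.lstsq(X1, y_val, rcond=-1)
    # Note: lstsq returns an empty ssres array when X1 is rank deficient.
    def predict(X_new):
        return np.c_[X_new, np.ones((X_new.shape[0], 1))].dot(beta)
    return beta, ssres, predict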