def test_some_std(self):
    """Check that kernel reads agree across readers and exercise the standardizers.

    The Unit-standardized kernel of ``self.snpdata`` is read twice and also
    read block-wise from the toydata Bed file; all three must agree to 10
    decimals. The string conversion of a SnpData (before and after
    standardizing) and of every standardizer is exercised as well.
    """
    k0 = self.snpdata.read_kernel(standardizer=Unit()).val
    from pysnptools.kernelreader import SnpKernel  # not referenced below; kept as in the original
    k1 = self.snpdata.read_kernel(standardizer=Unit())
    np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

    from pysnptools.snpreader import SnpData
    snpdata2 = SnpData(iid=self.snpdata.iid,
                       sid=self.snpdata.sid,
                       pos=self.snpdata.pos,
                       val=np.array(self.snpdata.val))
    # Only the conversions themselves matter here, so the results are discarded.
    str(snpdata2)
    snpdata2.standardize()
    str(snpdata2)

    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
    np.testing.assert_array_almost_equal(k0, k2, decimal=10)

    from pysnptools.standardizer.identity import Identity
    from pysnptools.standardizer.diag_K_to_N import DiagKtoN
    # Use the NumPy dtypes directly: the ``scipy.float64``/``scipy.float32``
    # aliases are deprecated re-exports of the very same NumPy objects.
    for dtype in [np.float64, np.float32]:
        for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
            str(std)
            np.random.seed(0)
            x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
            x2 = x[:, ::2]
            x2b = np.array(x2)
            #LATER what's this about? It doesn't do non-contiguous?
            #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #a,b = std.standardize(x2b),std.standardize(x2)
            #np.testing.assert_array_almost_equal(a,b)
    logging.info("done")
def _read_pstdata(self):
    """Read the dat file named by ``self.filename`` into a :class:`SnpData`.

    The iids come from ``SnpReader._read_fam`` and the sids/positions from
    ``SnpReader._read_map_or_bim``. The dat file is read as a tab-delimited
    table whose first column must equal the sid list; the first three columns
    are dropped and the remainder is transposed to become ``val``.

    :rtype: :class:`SnpData`
    """
    iid = SnpReader._read_fam(self.filename, remove_suffix="dat")
    sid, pos = SnpReader._read_map_or_bim(self.filename,
                                          remove_suffix="dat",
                                          add_suffix="map")

    # With no iids or no sids there is nothing to parse from the dat file.
    if len(iid) == 0 or len(sid) == 0:
        return SnpData(iid=iid, sid=sid, pos=pos,
                       val=np.empty([len(iid), len(sid)]))

    table = pd.read_csv(self.filename,
                        delimiter='\t',
                        header=None,
                        index_col=False,
                        skiprows=self.skiprows)
    if not np.array_equal(table[0], sid):
        raise Exception(
            "Expect snp list in map file to exactly match snp list in dat file"
        )

    # Drop the three leading columns; what is left is one column per iid.
    for leading_column in (0, 1, 2):
        del table[leading_column]
    assert len(iid) == table.shape[1], "Expect # iids in fam file to match dat file"

    return SnpData(iid=iid, sid=sid, pos=pos, val=table.values.T)
def test_some_std(self):
    """Check that kernel reads agree across readers and exercise the standardizers.

    The Unit-standardized kernel of ``self.snpdata`` is read twice and also
    read block-wise from the toydata Bed file; all three must agree to 10
    decimals. The string conversion of a SnpData (before and after
    standardizing) and of every standardizer is exercised as well.
    """
    k0 = self.snpdata.read_kernel(standardizer=Unit()).val
    from pysnptools.kernelreader import SnpKernel  # not referenced below; kept as in the original
    k1 = self.snpdata.read_kernel(standardizer=Unit())
    np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

    from pysnptools.snpreader import SnpData
    snpdata2 = SnpData(iid=self.snpdata.iid,
                       sid=self.snpdata.sid,
                       pos=self.snpdata.pos,
                       val=np.array(self.snpdata.val))
    # Only the conversions themselves matter here, so the results are discarded.
    str(snpdata2)
    snpdata2.standardize()
    str(snpdata2)

    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
    np.testing.assert_array_almost_equal(k0, k2, decimal=10)

    from pysnptools.standardizer.identity import Identity
    from pysnptools.standardizer.diag_K_to_N import DiagKtoN
    # Use the NumPy dtypes directly: the ``scipy.float64``/``scipy.float32``
    # aliases are deprecated re-exports of the very same NumPy objects.
    for dtype in [np.float64, np.float32]:
        for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
            str(std)
            np.random.seed(0)
            x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
            x2 = x[:, ::2]
            x2b = np.array(x2)
            #LATER what's this about? It doesn't do non-contiguous?
            #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #a,b = std.standardize(x2b),std.standardize(x2)
            #np.testing.assert_array_almost_equal(a,b)
    logging.info("done")
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None):
    """
    Predict phenotype means and covariance with the fitted linear regression.

    :param X: testing covariate information, optional. Passed through
        ``_pheno_fixup`` before use.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None or a :class:`KernelIdentity` (asserted).
    :type K0_whole_test: None or :class:`KernelIdentity`

    :param K1_whole_test: Must be None or a :class:`KernelIdentity` (asserted).
    :type K1_whole_test: None or :class:`KernelIdentity`

    :param iid_if_none: Forwarded to ``_pheno_fixup`` for the case that no X is given.
    :type iid_if_none: an ndarray of two strings

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    for kernel_arg in (K0_whole_test, K1_whole_test):
        assert kernel_arg is None or isinstance(kernel_arg, KernelIdentity)  # could also accept no snps

    covar = _pheno_fixup(X, iid_if_none=iid_if_none)
    covar = covar.read().standardize(self.covar_unit_trained)

    # Append a constant column of 1's so the model can fit an offset.
    sid_with_offset = FastLMM._new_snp_name(covar)
    val_with_offset = np.c_[covar.read().val, np.ones((covar.iid_count, 1))]
    covar = SnpData(iid=covar.iid, sid=sid_with_offset, val=val_with_offset)
    assert np.array_equal(covar.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    predicted = covar.val.dot(self.beta).reshape(-1, 1)
    mean = SnpData(iid=covar.iid,
                   sid=self.pheno_sid,
                   val=predicted,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")

    from pysnptools.kernelreader import KernelData
    # The covariance is the identity scaled by self.ssres / self.iid_count.
    covariance = KernelData(iid=covar.iid,
                            val=np.eye(covar.iid_count) * self.ssres / self.iid_count)
    return mean, covariance
def test_block_size_Snp2Dist(self):
    """``as_dist`` must give the same values with block_size=1 as with block_size=None."""
    from pysnptools.snpreader import SnpData
    from pysnptools.distreader._snp2dist import _Snp2Dist

    np.random.seed(0)
    sid_count = 20
    random_counts = np.random.randint(0, 3, size=[3, sid_count])
    snpreader = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                        sid=[str(sid_index) for sid_index in range(sid_count)],
                        val=np.array(random_counts, dtype=np.float64, order='F'))

    # Read one sid at a time first, then everything in one go.
    blockwise, whole = [
        snpreader.as_dist(max_weight=2, block_size=block_size).read()
        for block_size in (1, None)
    ]
    np.testing.assert_array_almost_equal(blockwise.val, whole.val, decimal=10)
def snpsA(seed, iid_count, sid_count, use_distributed):
    """Return cached test SNPs plus a random phenotype and two random covariates.

    The SNPs are generated with :class:`SnpGen` and written under the
    module-level ``cache_top`` directory the first time a given
    (seed, iid_count, sid_count, use_distributed) combination is requested;
    later calls reopen the file written earlier.

    :param seed: seed for :class:`SnpGen` and for the random pheno/covar values.
    :param iid_count: number of individuals to generate.
    :param sid_count: number of SNPs to generate.
    :param use_distributed: if true, store/read with :class:`DistributedBed`,
        otherwise with :class:`Bed`.
    :rtype: tuple of (test_snps, pheno, covar)
    """
    import numpy as np
    from pysnptools.snpreader import Bed
    from pysnptools.snpreader import DistributedBed
    from pysnptools.snpreader import SnpGen

    chrom_count = 10
    # The original declared ``global top_cache``, a name this function never
    # reads or writes; the cache directory actually used is ``cache_top``,
    # which needs no ``global`` statement because it is only read.
    if use_distributed:
        test_snp_path = (
            cache_top / f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}_db")
    else:
        test_snp_path = (
            cache_top / f"snpsA_{seed}_{chrom_count}_{iid_count}_{sid_count}.bed")
    count_A1 = False

    if not test_snp_path.exists():
        snpgen = SnpGen(
            seed=seed,
            iid_count=iid_count,
            sid_count=sid_count,
            chrom_count=chrom_count,
            block_size=1000,
        )
        if use_distributed:
            test_snps = DistributedBed.write(str(test_snp_path), snpgen)
        else:
            test_snps = Bed.write(str(test_snp_path),
                                  snpgen.read(dtype="float32"),
                                  count_A1=count_A1)
    else:
        if use_distributed:
            test_snps = DistributedBed(str(test_snp_path))
        else:
            test_snps = Bed(str(test_snp_path), count_A1=count_A1)

    from pysnptools.snpreader import SnpData

    # Seed once so that pheno and covar are reproducible for a given seed.
    np.random.seed(seed)
    pheno = SnpData(
        iid=test_snps.iid,
        sid=["pheno"],
        val=np.random.randn(test_snps.iid_count, 1) * 3 + 2,
    )
    covar = SnpData(
        iid=test_snps.iid,
        sid=["covar1", "covar2"],
        val=np.random.randn(test_snps.iid_count, 2) * 2 - 3,
    )

    return test_snps, pheno, covar
def test_cpp_std(self):
    """Compare standardization with ``force_python_only`` False and True.

    Every combination of memory order, dtype and standardizer (Unit, Beta) is
    run with and without an all-zero training column and with and without a
    missing value. Both paths must give the same values and trained stats,
    both when training and when applying the trained standardizer to other data.
    """
    for order in ['C', 'F']:  # Order C vs F
        for dtype in [np.float64, np.float32]:  # 32 vs 64
            # float32 results are only compared to 5 decimals.
            decimal = 10 if dtype == np.float64 else 5
            for std in [stdizer.Unit(), stdizer.Beta(2, 10)]:  # unit vs beta
                np.random.seed(0)
                snp_count = 20
                sid = [str(i) for i in range(snp_count)]
                snpreader0 = SnpData(
                    iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                    sid=sid,
                    val=np.array(np.random.randint(3, size=[3, snp_count]), dtype=dtype, order=order))
                snpreader1 = SnpData(
                    iid=[["3", "3"], ["4", "4"]],
                    sid=[str(i) for i in range(snp_count)],
                    val=np.array(np.random.randint(3, size=[2, snp_count]), dtype=dtype, order=order))

                # The edits below are made in place and persist into later iterations.
                for has_SNC_in_train in [False, True]:  # has SNC
                    if has_SNC_in_train:
                        snpreader0.val[:, 1] = 0

                    for has_missing_data in [False, True]:  # missing data
                        if has_missing_data:
                            snpreader0.val[0, 2] = np.nan
                            snpreader1.val[0, 2] = np.nan

                        # gather stats vs not
                        train_cpp, trained_cpp = snpreader0.read(order=order, dtype=dtype).standardize(
                            std, return_trained=True, force_python_only=False)
                        train_py, trained_py = snpreader0.read(order=order, dtype=dtype).standardize(
                            std, return_trained=True, force_python_only=True)
                        np.testing.assert_array_almost_equal(train_cpp.val, train_py.val, decimal=decimal)
                        np.testing.assert_array_almost_equal(trained_cpp.stats, trained_py.stats, decimal=decimal)

                        assert (np.inf in trained_cpp.stats[:, 1]) == has_SNC_in_train
                        assert (np.inf in trained_py.stats[:, 1]) == has_SNC_in_train

                        if has_SNC_in_train:
                            assert np.array_equal(train_cpp.val[:, 1], np.zeros([train_cpp.val.shape[0]]))
                            assert np.array_equal(train_py.val[:, 1], np.zeros([train_py.val.shape[0]]))

                        if has_missing_data:
                            assert train_cpp.val[0, 2] == 0
                            assert train_py.val[0, 2] == 0

                        # uses stats
                        test_cpp = snpreader1.read(order=order, dtype=dtype).standardize(
                            trained_cpp, force_python_only=False)
                        test_py = snpreader1.read(order=order, dtype=dtype).standardize(
                            trained_py, force_python_only=True)
                        np.testing.assert_array_almost_equal(test_cpp.val, test_py.val, decimal=decimal)
                        # Make sure we haven't messed up the train stats
                        np.testing.assert_array_almost_equal(trained_cpp.stats, trained_py.stats, decimal=decimal)

                        if has_SNC_in_train:
                            assert np.array_equal(test_cpp.val[:, 1], np.zeros([test_cpp.val.shape[0]]))
                            assert np.array_equal(test_py.val[:, 1], np.zeros([test_py.val.shape[0]]))

                        if has_missing_data:
                            assert test_cpp.val[0, 2] == 0
                            assert test_py.val[0, 2] == 0
    logging.info("done with 'test_cpp_std'")
def _read_pstdata(self):
    """Build a :class:`SnpData` from the phenotype input held in ``self.filename``.

    ``self.filename`` may be a string (loaded with ``pstpheno.loadPhen``),
    None (an empty phenotype over ``self._iid_if_none``), or an
    already-loaded dictionary with 'header', 'vals' and 'iid' entries.
    One-dimensional values become a single column, and missing headers are
    replaced by "pheno0", "pheno1", ...

    :rtype: :class:`SnpData`
    """
    #LATER switch it, so the main code is here rather than in loadPhen
    source = self.filename
    if isinstance(source, str):
        pheno_input = pstpheno.loadPhen(source, missing=self.missing)
    elif source is None:
        assert self._iid_if_none is not None, "If input is None then iid_if_none be given"
        pheno_input = {
            'header': np.empty((0), dtype='str'),
            'vals': np.empty((len(self._iid_if_none), 0)),
            'iid': self._iid_if_none,
        }
    else:
        pheno_input = source

    # Promote a 1-D vector of values to a single column (in a fresh dictionary).
    if len(pheno_input['vals'].shape) == 1:
        pheno_input = {
            'header': pheno_input['header'],
            'vals': np.reshape(pheno_input['vals'], (-1, 1)),
            'iid': pheno_input['iid'],
        }

    # Fill in default column names when the header is unnamed or empty.
    header = pheno_input['header']
    if len(header) > 0 and header[0] is None:
        pheno_input['header'] = ["pheno{0}".format(i) for i in range(len(header))]  #LATER move to reader?
    elif len(header) == 0:
        pheno_input['header'] = ["pheno{0}".format(i) for i in range(pheno_input['vals'].shape[1])]

    sid = np.array(pheno_input['header'], dtype='str')
    # The position columns are all NaN for phenotype data.
    pos = np.empty((len(sid), 3))
    pos.fill(np.nan)

    return SnpData(iid=pheno_input['iid'], sid=sid, pos=pos, val=pheno_input['vals'])
def test_merge_std(self):
    """Block-wise kernel reads must match whole reads for Beta and Unit standardizers."""

    def read_kernel_parts(reader, standardizer, block_size):
        # Returns (kerneldata, trained snp standardizer, trained kernel standardizer).
        return SnpKernel(reader, standardizer, block_size=block_size)._read_with_standardizing(
            to_kerneldata=True, return_trained=True)

    #unit vs beta
    for std in [stdizer.Beta(2, 10), stdizer.Unit()]:
        np.random.seed(0)
        sid_count = 20
        random_counts = np.random.randint(3, size=[3, sid_count])
        snpreader = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                            sid=[str(i) for i in range(sid_count)],
                            val=np.array(random_counts, dtype=np.float64, order='F'))

        kernel_blocked, trained_blocked, diag_blocked = read_kernel_parts(snpreader, std, 1)
        kernel_whole, trained_whole, diag_whole = read_kernel_parts(snpreader, std, None)

        np.testing.assert_array_almost_equal(kernel_blocked.val, kernel_whole.val, decimal=10)
        np.testing.assert_array_almost_equal(trained_blocked.stats, trained_whole.stats, decimal=10)
        assert abs(diag_blocked.factor - diag_whole.factor) < 1e-7
def test_pheno1(self):
    """A phenotype generated from gen2.bed with seed 5 must equal the stored reference."""
    from pysnptools.snpreader import Bed, SnpData, SnpNpz

    dataset_folder = self.currentFolder + "/../../tests/datasets/generate"
    some_snp_data = Bed(dataset_folder + "/gen2.bed", count_A1=False).read()

    generated = _generate_phenotype(some_snp_data, 10, genetic_var=.5, noise_var=.5, seed=5)
    gen_snpdata = SnpData(iid=some_snp_data.iid, sid=["pheno"], val=generated.reshape(-1, 1))

    #SnpNpz.write(r'c:\deldir\pheno1.snp.npz',gen_snpdata)
    ref_snpdata = SnpNpz(dataset_folder + "/pheno1.snp.npz").read()
    assert gen_snpdata == ref_snpdata
def _run_once(self):
    """Initialise this SnpData from ``_run_once_inner`` unless ``_ran_once`` is already true.

    Note that ``_ran_once`` is not assigned in this method.
    """
    if self._ran_once:
        return

    iid_ascii, sid_ascii, val, _row_property, pos = self._run_once_inner()

    # Both labels are converted to 'str' arrays, which copies them.
    iid = np.array(iid_ascii, dtype='str')  #!!!avoid this copy when not needed
    sid = np.array(sid_ascii, dtype='str')  #!!!avoid this copy when not needed

    SnpData.__init__(self,
                     iid=iid,
                     sid=sid,
                     val=val,
                     pos=pos,
                     name="np.memmap('{0}')".format(self._filename))
def _snps_fixup(snp_input, iid_if_none=None, count_A1=None):
    """Normalise the supported ways of specifying SNPs.

    :param snp_input: a Bed file name (string), a dictionary with 'iid',
        'header' and 'vals' entries, None, or anything else, which is
        returned unchanged.
    :param iid_if_none: iids for the empty SnpData created when
        ``snp_input`` is None; required in that case.
    :param count_A1: forwarded to :class:`Bed` when ``snp_input`` is a string.
    :rtype: a :class:`Bed`, a :class:`SnpData`, or ``snp_input`` itself
    """
    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        # An empty reader: all the given iids, zero sids.
        return SnpData(iid_if_none,
                       sid=np.empty((0), dtype='str'),
                       val=np.empty((len(iid_if_none), 0)),
                       pos=np.empty((0, 3)),
                       name="")  #todo: make a static factory method on SnpData

    if isinstance(snp_input, str):
        return Bed(snp_input, count_A1=count_A1)

    if isinstance(snp_input, dict):
        return SnpData(iid=snp_input['iid'],
                       sid=snp_input['header'],
                       val=snp_input['vals'])

    return snp_input
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Predict phenotype means and covariance with the fitted linear regression.

    :param X: testing covariate information, optional. Passed through
        ``_pheno_fixup`` before use.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None or a :class:`KernelIdentity` (asserted).
    :type K0_whole_test: None or :class:`KernelIdentity`

    :param K1_whole_test: Must be None or a :class:`KernelIdentity` (asserted).
    :type K1_whole_test: None or :class:`KernelIdentity`

    :param iid_if_none: Forwarded to ``_pheno_fixup`` for the case that no X is given.
    :type iid_if_none: an ndarray of two strings

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    for kernel_arg in (K0_whole_test, K1_whole_test):
        assert kernel_arg is None or isinstance(kernel_arg, KernelIdentity)  # could also accept no snps

    covar = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
    covar = covar.read().standardize(self.covar_unit_trained)

    # Append a constant column of 1's so the model can fit an offset.
    sid_with_offset = FastLMM._new_snp_name(covar)
    val_with_offset = np.c_[covar.read().val, np.ones((covar.iid_count, 1))]
    covar = SnpData(iid=covar.iid, sid=sid_with_offset, val=val_with_offset)
    assert np.array_equal(covar.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    predicted = covar.val.dot(self.beta).reshape(-1, 1)
    mean = SnpData(iid=covar.iid,
                   sid=self.pheno_sid,
                   val=predicted,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")

    from pysnptools.kernelreader import KernelData
    # The covariance is the identity scaled by self.ssres / self.iid_count.
    covariance = KernelData(iid=covar.iid,
                            val=np.eye(covar.iid_count) * self.ssres / self.iid_count)
    return mean, covariance
def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
    """Run ``single_snp_select`` on the synth data with auto-selected PCs as covariates.

    The PCs are computed with ``compute_auto_pcs`` and written to a Pheno
    file in ``self.tempout_dir``; the results are compared with the stored
    reference through ``self.compare_files``.

    :param h2: passed to ``single_snp_select``; also picks the result file name.
    :param force_low_rank: passed to ``single_snp_select``.
    :param force_full_rank: passed to ``single_snp_select``.
    :param count_A1: passed to ``compute_auto_pcs``.
    """
    do_plot = False
    use_cache = False

    # define file names
    synth_folder = self.pythonpath + "/tests/datasets/synth"
    bed_fn = synth_folder + "/all.bed"
    phen_fn = synth_folder + "/pheno_10_causals.txt"
    pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

    if use_cache and os.path.exists(pcs_fn):
        logging.info("Using top pcs's cache")
        covar = Pheno(pcs_fn)
    else:
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
        logging.info("selected number of PCs: {0}".format(covar["vals"].shape[1]))
        pcs_as_snpdata = SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals'])
        Pheno.write(pcs_fn, pcs_as_snpdata)

    mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    # Created as in the original, but not handed to single_snp_select below.
    runner = mf_to_runner_function(mf_name)(20)

    logging.info("Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
        h2, force_low_rank, force_full_rank))
    result_file_name = "sel_plus_pc_{0}".format("h2IsHalf" if h2 == .5 else "h2Search")
    output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

    k_list = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
        125, 160, 200, 250, 320, 400, 500, 630, 800, 1000
    ]
    # NOTE(review): count_A1 is hard-coded to False here although the PCs
    # above are computed with the caller's count_A1 -- confirm this is intended.
    results = single_snp_select(test_snps=bed_fn,
                                G=bed_fn,
                                pheno=phen_fn,
                                k_list=k_list,
                                h2=h2,
                                n_folds=synth_folder + "/DebugEmitFolds.txt",
                                covar=covar,
                                output_file_name=output_file_name,
                                force_low_rank=force_low_rank,
                                force_full_rank=force_full_rank,
                                GB_goal=2,
                                count_A1=False
                                #runner = runner
                                )

    logging.info(results.head())
    self.compare_files(results, result_file_name)
def read(self, order='F', dtype=np.float64, force_python_only=False, view_ok=False):
    """Read the SNP values into memory and return them as a new :class:`.SnpData`.

    :param order: {'F' (default), 'C', 'A'}, optional -- Memory layout of the returned ndarray.
        'F' gives an F-contiguous array (iid-index varies the fastest), 'C' gives a C-contiguous
        array (sid-index varies the fastest), and 'A' allows either layout.
    :type order: string or None

    :param dtype: {numpy.float64 (default), numpy.float32}, optional -- The data-type for the :attr:`SnpData.val` ndarray.
    :type dtype: data-type

    :param force_python_only: optional -- If False (default), may use outside library code. If True, requests that the read
        be done without outside library code.
    :type force_python_only: bool

    :param view_ok: optional -- If False (default), allocates new memory for the :attr:`SnpData.val`'s ndarray. If True,
        if practical and reading from a :class:`SnpData`, the result may share memory with the original
        :class:`SnpData`; combine it with "order='A'" to make sharing more likely. Use with care: a change to
        either ndarray (for example, via :meth:`.SnpData.standardize`) then affects the other. Whether memory
        is actually shared is left to ndarray's own mechanisms, so a new ndarray may be allocated anyway.
    :type view_ok: bool

    :rtype: :class:`.SnpData`

    Every call re-reads the SNP values and creates a new in-memory :class:`.SnpData`.
    If only a subset of the sids or iids is requested, then (to the degree practical) only that subset is read from disk.

    :Example:

    >>> from pysnptools.snpreader import Bed
    >>> snp_on_disk = Bed('../../tests/datasets/all_chr.maf0.001.N300.bed',count_A1=False) # Specify SNP data on disk
    >>> snpdata1 = snp_on_disk.read() # Read all the SNP data returning a SnpData instance
    >>> print(type(snpdata1.val).__name__) # The SnpData instance contains a ndarray of the data.
    ndarray
    >>> subset_snpdata = snp_on_disk[:,::2].read() # From the disk, read SNP values for every other sid
    >>> print(subset_snpdata.val[0,0]) # Print the first SNP value in the subset
    2.0
    >>> subsub_snpdata = subset_snpdata[:10,:].read(order='A',view_ok=True) # Create an in-memory subset of the subset with SNP values for the first ten iids. Share memory if practical.
    >>> import numpy as np
    >>> # print np.may_share_memory(subset_snpdata.val, subsub_snpdata.val) # Do the two ndarray's share memory? They could. Currently they won't.
    """
    # Normalise whatever was passed (a type, a string, ...) to a numpy dtype object.
    requested_dtype = np.dtype(dtype)
    values = self._read(None, None, order, requested_dtype, force_python_only, view_ok)

    from pysnptools.snpreader import SnpData
    return SnpData(self.iid, self.sid, values, pos=self.pos, name=str(self))
def _create_covar_chrom(covar, covar_by_chrom, chrom, count_A1=None):
    """Return the covariates to use for ``chrom``.

    When ``covar_by_chrom`` is None, ``covar`` is returned unchanged.
    Otherwise the entry for ``chrom`` is fixed up with ``_pheno_fixup``,
    intersected with ``covar``, and the two are joined column-wise into a
    new :class:`SnpData`.
    """
    if covar_by_chrom is None:
        return covar

    chrom_covar = _pheno_fixup(covar_by_chrom[chrom], iid_if_none=covar, count_A1=count_A1)
    covar_after, chrom_covar = pstutil.intersect_apply([covar, chrom_covar])

    combined_sid = np.r_[covar_after.sid, chrom_covar.sid]
    # view_ok because np.c_ will allocate new memory.
    combined_val = np.c_[covar_after.read(order='A', view_ok=True).val,
                         chrom_covar.read(order='A', view_ok=True).val]
    return SnpData(iid=covar_after.iid, sid=combined_sid, val=combined_val)
def g_mix(self, K0, K1):
    """Return a ``_SnpTrainTest`` of standardized (and, if needed, mixed) SNPs.

    If ``self.mixing`` is 1 or ``K0`` is a :class:`KernelIdentity`, only
    ``K1`` is used; if it is 0 or ``K1`` is a :class:`KernelIdentity`, only
    ``K0`` is used. Otherwise the standardized SNPs of both are combined by
    ``_mix_from_Gs`` into arrays holding the sids of ``K0`` followed by
    those of ``K1``.
    """

    def standardized(reader, snp_trained, kernel_trained):
        #!!!later this a good place to read?
        return reader.read().standardize(snp_trained).standardize(kernel_trained)

    def as_train_test(train, test):
        return _SnpTrainTest(train=train, test=test, standardizer=SS_Identity(), block_size=None)

    def combined_snpdata(iid, val, part0, part1):
        # sid and pos always come from the *train* readers, K0's first.
        return SnpData(iid=iid,
                       sid=np.concatenate((K0.train.sid, K1.train.sid), axis=0),
                       val=val,
                       name="{0}&{1}".format(part0, part1),
                       pos=np.concatenate((K0.train.pos, K1.train.pos), axis=0))

    mixing = self.mixing

    if mixing == 1 or isinstance(K0, KernelIdentity):
        assert K1.standardizer is self.snp_trained1, "real assert"
        G_train = standardized(K1.train, self.snp_trained1, self.kernel_trained1)
        G_test = standardized(K1.test, self.snp_trained1, self.kernel_trained1)
        return as_train_test(G_train, G_test)

    if mixing == 0 or isinstance(K1, KernelIdentity):
        assert K0.standardizer is self.snp_trained0, "real assert"
        G_train = standardized(K0.train, self.snp_trained0, self.kernel_trained0)
        G_test = standardized(K0.test, self.snp_trained0, self.kernel_trained0)
        return as_train_test(G_train, G_test)

    #!!!later why are we processing the training data again????
    assert K0.standardizer is self.snp_trained0, "real assert"
    assert isinstance(K0, _SnpTrainTest), "Expect K0 to be a _SnpTrainTest"
    assert K1.standardizer is self.snp_trained1, "real assert"
    G0_train = standardized(K0.train, self.snp_trained0, self.kernel_trained0)
    G1_train = standardized(K1.train, self.snp_trained1, self.kernel_trained1)
    G0_test = standardized(K0.test, self.snp_trained0, self.kernel_trained0)
    G1_test = standardized(K1.test, self.snp_trained1, self.kernel_trained1)

    # Output buffers that _mix_from_Gs fills in.
    mixed_train = np.empty((K0.iid0_count, K0.train.sid_count + K1.train.sid_count))
    mixed_test = np.empty((K0.iid1_count, K0.train.sid_count + K1.train.sid_count))
    _mix_from_Gs(mixed_train, G0_train.val, G1_train.val, self.mixing)
    _mix_from_Gs(mixed_test, G0_test.val, G1_test.val, self.mixing)

    G_train = combined_snpdata(K0.iid0, mixed_train, G0_train, G1_train)
    G_test = combined_snpdata(K0.iid1, mixed_test, G0_test, G1_test)
    return as_train_test(G_train, G_test)
def _snps_fixup(snp_input, iid_if_none=None):
    """Normalise the supported ways of specifying SNPs.

    :param snp_input: a Bed file name (string), None, or anything else,
        which is returned unchanged.
    :param iid_if_none: iids for the empty SnpData created when
        ``snp_input`` is None; required in that case.
    :rtype: a :class:`Bed`, a :class:`SnpData`, or ``snp_input`` itself
    """
    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        # An empty reader: all the given iids, zero sids.
        no_sids = np.empty((0), dtype='str')
        no_vals = np.empty((len(iid_if_none), 0))
        return SnpData(iid_if_none,
                       sid=no_sids,
                       val=no_vals,
                       pos=np.empty((0, 3)),
                       parent_string="")  #todo: make a static factory method on SnpData

    if isinstance(snp_input, str):
        return Bed(snp_input)

    return snp_input
def test_covar_by_chrom_mixing(self):
    """Run ``single_snp`` with both a global covariate and per-chromosome covariates.

    The global covariate is the covariate file's values renamed to
    "pheno-1"; the covariate file itself is used for each of chromosomes 1-5.
    The resulting frame is compared with the "covar_by_chrom_mixing" reference.
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
    test_snps = Bed(self.bedbase)
    pheno = self.phen_fn

    # (The original first bound ``covar`` to self.cov_fn and overwrote it on
    # the next line; that dead assignment is dropped.)
    covar = Pheno(self.cov_fn).read()
    covar = SnpData(iid=covar.iid, sid=["pheno-1"], val=covar.val)

    # ``range`` instead of the Python-2-only ``xrange``; the keys are the same.
    covar_by_chrom = {chrom: self.cov_fn for chrom in range(1, 6)}

    output_file = self.file_name("covar_by_chrom_mixing")
    frame = single_snp(test_snps,
                       pheno,
                       covar=covar,
                       covar_by_chrom=covar_by_chrom,
                       output_file_name=output_file)

    self.compare_files(frame, "covar_by_chrom_mixing")
def _read_pstdata(self):
    """Read the ped file named by ``self.filename`` into a :class:`SnpData`.

    The first two text columns become the iids; the columns from index 6 on
    are read as pairs, one pair per SNP. A row whose first column of the
    pair equals ``self.missing`` gets NaN. Every other row gets the number
    of entries in its pair that equal the first entry of the first
    non-missing row.

    :rtype: :class:`SnpData`
    """
    sid, pos = SnpReader._read_map_or_bim(self.filename,
                                          remove_suffix="ped",
                                          add_suffix="map")
    table = np.loadtxt(self.filename, dtype='str', comments=None)
    table = table.reshape(-1, table.shape[-1])  #Turns 1-d row into 2-d

    iid = table[:, 0:2]
    allele_text = table[:, 6::]
    is_missing = allele_text == self.missing

    snp_count = allele_text.shape[1] // 2
    val = np.zeros((allele_text.shape[0], snp_count))
    for snp_index in range(snp_count):
        first_col = 2 * snp_index
        missing_rows = is_missing[:, first_col]
        present_rows = ~missing_rows

        val[missing_rows, snp_index] = np.nan
        pairs = allele_text[present_rows, first_col:first_col + 2]
        if pairs.shape[0] > 0:
            # Count, per row, the entries equal to the reference entry.
            val[present_rows, snp_index] += (pairs == pairs[0, 0]).sum(1)

    return SnpData(iid=iid, sid=sid, pos=pos, val=val)
def too_slow_test_write_bedbig(self):
    """Write a 100000 x 50000 all-zero SnpData to Bed and check it reads back unchanged."""
    from pysnptools.snpreader import SnpData

    iid_count, sid_count = 100000, 50000
    iid = np.array([[str(i), str(i)] for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i, i, i] for i in range(sid_count)])
    np.random.seed(0)
    all_zero = np.zeros((iid_count, sid_count))
    snpdata = SnpData(iid, sid, all_zero, pos=pos)  #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))

    output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, count_A1=False)

    round_tripped = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, round_tripped.val, decimal=10)
def test_respect_inputs(self):
    """``read_kernel`` must match a reference kernel for every requested order and dtype.

    The reference is computed by standardizing in Python and taking
    ``val.dot(val.T)``. Kernels are then read with block_size=1 for each
    goal dtype/order and compared to the precision of the less precise of
    the starting and goal dtypes.
    """
    precisions = [(np.float32, 5), (np.float64, 10)]
    orders = ['F', 'C', 'A']

    np.random.seed(0)
    for dtype_start, decimal_start in precisions:
        for order_start in orders:
            for sid_count in [20, 2]:
                start_val = np.array(np.random.randint(3, size=[3, sid_count]),
                                     dtype=dtype_start,
                                     order=order_start)
                snpdataX = SnpData(iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                                   sid=[str(i) for i in range(sid_count)],
                                   val=start_val)
                for stdx in [stdizer.Beta(1, 25), stdizer.Identity(), stdizer.Unit()]:
                    for snpreader0 in [snpdataX, snpdataX[:, 1:]]:
                        snpreader1 = snpreader0[1:, :]

                        refdata0, trained_standardizer = snpreader0.read().standardize(
                            stdx, return_trained=True, force_python_only=True)
                        refval0 = refdata0.val.dot(refdata0.val.T)
                        #LATER why aren't these used?
                        refdata1 = snpreader1.read().standardize(
                            trained_standardizer, force_python_only=True)
                        refval1 = refdata0.val.dot(refdata1.val.T)

                        for dtype_goal, decimal_goal in precisions:
                            for order_goal in orders:
                                kernel = snpreader0.read_kernel(standardizer=stdx,
                                                                block_size=1,
                                                                order=order_goal,
                                                                dtype=dtype_goal)
                                # NOTE(review): the result of this call is not
                                # asserted or otherwise used -- confirm intent.
                                PstReader._array_properties_are_ok(kernel.val, order_goal, dtype_goal)
                                np.testing.assert_array_almost_equal(
                                    refval0, kernel.val,
                                    decimal=min(decimal_start, decimal_goal))
def test_one(self):
    '''
    Lock in results on arbitrary data -- because meaningful runs take too long to run.

    The phenotype is the whole phenotype duplicated into two columns and the
    spatial coordinates are (i, -i) for each individual. The output is
    written to "one.txt" and compared with the stored reference file.
    '''
    fn = "one.txt"
    logging.info(fn)
    tmpOutfile = self.file_name(fn)

    half = self.pheno_whole.read().val
    pheno = SnpData(iid=self.pheno_whole.iid, sid=["pheno0", "pheno1"], val=np.c_[half, half])

    # ``range`` instead of the Python-2-only ``xrange``; the list is identical.
    spatial_coor = [[i, -i] for i in range(self.snpreader_whole.iid_count)]
    alpha_list = [int(v) for v in np.logspace(2, np.log10(4000), 2)]
    dataframe = heritability_spatial_correction(self.snpreader_whole,
                                                spatial_coor,
                                                self.snpreader_whole.iid,
                                                alpha_list,
                                                2,
                                                pheno,
                                                jackknife_count=2,
                                                permute_plus_count=1,
                                                permute_times_count=1,
                                                just_testing=True)

    dataframe.to_csv(tmpOutfile, sep="\t", index=False)
    referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/" + fn)
    # ``tolerance`` is not defined in this function; it must come from the enclosing module.
    out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
    self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))
def gen_Test_Bed(filename, n0, n1, m):
    """Write a synthetic Bed file set and then rewrite column 6 of its .fam file.

    The first ``n1`` individuals get the value 2.0 for every SNP and "2" in
    the sixth .fam column; the remaining ``n0`` individuals get 0.0 and "1".

    :param filename: base name handed to ``Bed.write``; ``filename + ".fam"``
        is rewritten afterwards.
    :param n0: number of individuals with all-0.0 values.
    :param n1: number of individuals with all-2.0 values.
    :param m: number of SNPs.
    """
    n = n0 + n1
    iid = [["fam_" + str(i), "iid_" + str(i)] for i in range(n)]
    sid = ["snp_" + str(i) for i in range(m)]
    X = [[2.0] * m for _ in range(n1)]
    X.extend([0.0] * m for _ in range(n0))
    dat = SnpData(iid=iid, sid=sid, val=X)
    Bed.write(filename, dat)

    fam_path = filename + ".fam"
    # ``with`` closes the file even if reading or writing raises.
    with open(fam_path) as fam_file:
        lines = fam_file.readlines()

    with open(fam_path, "w") as fam_file:
        for i, line in enumerate(lines):
            fields = line.strip().split()
            fields[5] = "2" if i < n1 else "1"
            fam_file.write(" ".join(fields) + "\n")
def test_multipheno(self):
    """``single_snp_scale`` on several phenotypes must match per-phenotype ``single_snp`` runs.

    For 2, 5 and 1 random phenotypes, the p-values that ``single_snp_scale``
    reports for each SNP are compared (as sorted sets, ignoring which
    phenotype produced which value) with those from running ``single_snp``
    once per phenotype.
    """
    logging.info("test_multipheno")

    random_state = RandomState(29921)
    pheno_reference = Pheno(self.phen_fn).read()
    for pheno_count in [2, 5, 1]:
        val = random_state.normal(loc=pheno_count,
                                  scale=pheno_count,
                                  size=(pheno_reference.iid_count, pheno_count))
        pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
        pheno_multi = SnpData(iid=pheno_reference.iid, sid=pheno_col, val=val)

        reference = pd.concat([
            single_snp(test_snps=self.bed,
                       pheno=pheno_multi[:, pheno_index],
                       covar=self.cov_fn)
            for pheno_index in range(pheno_count)
        ])
        frame = single_snp_scale(test_snps=self.bed,
                                 pheno=pheno_multi,
                                 covar=self.cov_fn)

        # The original message formatted an undefined name (``reffile``), so
        # a length mismatch surfaced as a NameError instead of this message.
        assert len(frame) == len(reference), (
            "# of pairs differs from the single_snp reference: {0} vs {1}".format(
                len(frame), len(reference)))

        for sid in sorted(set(reference.SNP)):  #This ignores which pheno produces which pvalue
            pvalue_frame = np.array(sorted(frame[frame['SNP'] == sid].PValue))
            pvalue_reference = np.array(sorted(reference[reference['SNP'] == sid].PValue))
            # ``.all`` must be *called*: the bare bound method is always
            # truthy, which made the original assertion vacuous.
            assert (abs(pvalue_frame - pvalue_reference) < 1e-5).all(), \
                "pair {0} differs too much from reference".format(sid)
def generate_and_analyze(seed, N, do_shuffle, just_testing=True, map_function=None, cache_folder=None):
    """Generate SNPs, locations and a phenotype, then run ``heritability_spatial_correction``.

    The phenotype is the sum of a causal part drawn with the SNP kernel as
    covariance, a noise part, and a location part drawn with the spatial
    kernel as covariance.

    :param seed: seed for ``snp_gen`` and every random draw below.
    :param N: number of individuals to generate.
    :param do_shuffle: if true, the rows of the location part are permuted.
    :param just_testing: if true, only the generating alpha is tried rather
        than the full alpha list; also forwarded to the analysis.
    :param map_function: forwarded to ``heritability_spatial_correction``.
    :param cache_folder: forwarded to ``heritability_spatial_correction``.
    :return: the dataframe produced by ``heritability_spatial_correction``.
    """
    #Generate SNPs
    snpdata = snp_gen(fst=.1, dfr=0, iid_count=N, sid_count=1000, chr_count=10,
                      label_with_pop=True, seed=seed)
    K_causal = snpdata.read_kernel(Unit()).standardize()

    #Generate geo-spatial locations and K_loc
    distance_between_centers = 2500000
    sd = distance_between_centers / 4.
    spatial_iid = snpdata.iid
    # One centre per value of iid_item[0] ("0" or "1"); same y, x offset by one distance.
    center_dict = {
        "0": (distance_between_centers * 0.5, distance_between_centers),
        "1": (distance_between_centers * 1.5, distance_between_centers),
    }
    centers = np.array([center_dict[iid_item[0]] for iid_item in spatial_iid])

    np.random.seed(seed)
    logging.info("Generating positions for seed {0}".format(seed))
    scatter = np.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], size=len(centers))
    spatial_coor = SnpData(iid=snpdata.iid,
                           sid=["x", "y"],
                           val=centers + scatter * sd,
                           parent_string="'spatial_coor_gen_original'")
    alpha = distance_between_centers
    spatial_val = spatial_similarity(spatial_coor.val, alpha, power=2)
    K_loc = KernelData(iid=snpdata.iid, val=spatial_val).standardize()

    #Generate phenotype
    iid = K_causal.iid
    iid_count = K_causal.iid_count

    np.random.seed(seed)
    causal_draw = np.random.multivariate_normal(np.zeros(iid_count), K_causal.val)
    pheno_causal = SnpData(iid=iid, sid=["causal"], val=causal_draw.reshape(-1, 1),
                           parent_string="causal")

    np.random.seed(seed ^ 998372)
    noise_draw = np.random.normal(size=iid_count)
    pheno_noise = SnpData(iid=iid, sid=["noise"], val=noise_draw.reshape(-1, 1),
                          parent_string="noise")

    np.random.seed(seed ^ 12230302)
    loc_draw = np.random.multivariate_normal(np.zeros(iid_count), K_loc.val)
    pheno_loc_original = SnpData(iid=iid, sid=["loc_original"], val=loc_draw.reshape(-1, 1),
                                 parent_string="loc_original")

    if not do_shuffle:
        pheno_loc = pheno_loc_original
    else:
        idx = np.arange(iid_count)
        np.random.seed(seed)
        np.random.shuffle(idx)
        pheno_loc = pheno_loc_original.read(view_ok=True)  #don't need to copy, because the next line will be fresh memory
        pheno_loc.val = pheno_loc.val[idx, :]

    pheno = SnpData(iid=iid, sid=["pheno_all"],
                    val=pheno_causal.val + pheno_noise.val + pheno_loc.val)

    #Analyze data
    alpha_list = [int(v) for v in np.logspace(np.log10(100), np.log10(1e10), 100)]
    alphas_to_try = [alpha] if just_testing else alpha_list
    dataframe = heritability_spatial_correction(snpdata,
                                                spatial_coor.val,
                                                spatial_iid,
                                                alpha_list=alphas_to_try,
                                                pheno=pheno,
                                                alpha_power=2,
                                                jackknife_count=0,
                                                permute_plus_count=0,
                                                permute_times_count=0,
                                                just_testing=just_testing,
                                                map_function=map_function,
                                                cache_folder=cache_folder)

    logging.info(dataframe)
    return dataframe
# Fragment: builds random 2-D coordinates around one of two centres per
# individual and optionally plots them. ``snpdata``, ``seed`` and ``do_plot``
# are defined outside this fragment.
distance_between_centers = 2500000
x0 = distance_between_centers * 0.5
x1 = distance_between_centers * 1.5
y0 = distance_between_centers
y1 = distance_between_centers
sd = distance_between_centers / 4.

spatial_iid = snpdata.iid
# The centre is chosen by the first element of each iid ("0" or "1").
center_dict = {"0": (x0, y0), "1": (x1, y1)}
centers = np.array([center_dict[iid_item[0]] for iid_item in spatial_iid])
np.random.seed(seed)
logging.info("Generating positions for seed {0}".format(seed))
# Each position is its centre plus 2-D standard-normal noise scaled by sd.
spatial_coor_gen_original = SnpData(
    iid=snpdata.iid,
    sid=["x", "y"],
    val=centers +
    np.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], size=len(centers)) * sd,
    parent_string="'spatial_coor_gen_original'")

if do_plot:
    import matplotlib.pyplot as plt
    # One plotting colour per value of the first iid element.
    color_dict = {"0": "r", "1": "b", "2": "g"}
    colors = [color_dict[iid_item] for iid_item in snpdata.iid[:, 0]]
    plt.axis('equal')
    plt.scatter(spatial_coor_gen_original.val[:, 0],
                spatial_coor_gen_original.val[:, 1],
                c=colors)
    plt.show()

# NOTE(review): assumed to sit outside the ``if do_plot:`` body -- the
# collapsed source does not show the indentation; confirm.
from fastlmm.association.heritability_spatial_correction import spatial_similarity
from pysnptools.kernelreader import KernelData
# [ 2. 2. 2. ..., 1. 2. 2.]
# [ 2. 2. 2. ..., 1. 2. 2.]
# [ 2. 2. 2. ..., 2. 0. 2.]]

# snpdata.val is a NumPy array. Can apply any np functions
# (print is written as a single-argument call so that the script prints the
# same text on Python 2 and also runs on Python 3.)
print(np.mean(snpdata.val))
#1.478588

#If all you want is to read data in a Numpy array, here it is one line:
print(np.mean(Bed("all.bed").read().val))

#You can also create a SnpData object from scratch (without reading from a SnpReader)
from pysnptools.snpreader import SnpData
snpdata1 = SnpData(iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
                   sid=['snp1', 'snp2'],
                   val=[[0, 1], [2, .5], [.5, np.nan]])
print(np.nanmean(snpdata1.val))
# 0.8

#Review SnpReader and Bed and SnpData, and common attributes including val

#Topics: Reading subsets of data, reading with re-ordering iids & sids (rows & cols), stacking

#Reading just one snp
snpreader = Bed("all.bed")
snp0reader = snpreader[:, 0]
# Format the four items into one space-separated string, matching what the
# Python 2 statement ``print a, b, c, d`` wrote.
print("{0} {1} {2} {3}".format(snp0reader,
                               snp0reader.iid_count,
                               snp0reader.sid_count,
                               snp0reader.sid))
# Bed('all.bed')[:,0] 500 1 ['snp625_m0_.03m1_.07']
print(snpreader)
# Bed("all.bed")
def score(self, X=None, y=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, return_mse_too=False, return_per_iid=False, count_A1=None):
    """
    Method for calculating the negative log likelihood of testing examples.
    If the examples in X,y, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param y: testing phenotype:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
           the test SNPs needed to construct such a similarity matrix.
           Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
           Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
    :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
           the test SNPs needed to construct such a similarity matrix.
           Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
           Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
    :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :param return_mse_too: If true, will also return the mean squared error.
           Note: the value actually computed is the *sum* of squared errors
           (it is not divided by the number of examples).
    :type return_mse_too: bool

    :param return_per_iid: If true, returns one negative log likelihood per example
           (each computed from that example's own predicted mean and variance,
           i.e. the diagonal of the predicted covariance) instead of a single joint value.
           Cannot currently be combined with ``return_mse_too``.
    :type return_per_iid: bool

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: a float of the negative log likelihood and, optionally, a float of the mean squared error.
            With ``return_per_iid``, a :class:`SnpData` with one row per example and a single column named 'nLL'.

    :raises Exception: if both ``return_per_iid`` and ``return_mse_too`` are true (not implemented).
    """
    mean0, covar0 = self.predict(K0_whole_test=K0_whole_test, K1_whole_test=K1_whole_test, X=X, iid_if_none=iid_if_none, count_A1=count_A1)
    y = _pheno_fixup(y, iid_if_none=covar0.iid, count_A1=count_A1)
    # Put the predictions and the observed phenotype on the same set/order of iids.
    mean, covar, y = intersect_apply([mean0, covar0, y])
    mean = mean.read(order='A', view_ok=True).val
    covar = covar.read(order='A', view_ok=True).val
    y_actual = y.read().val

    if not return_per_iid:
        var = multivariate_normal(mean=mean.reshape(-1), cov=covar)
        # Use logpdf rather than log(pdf): the joint density of even a few
        # hundred examples underflows to 0.0, which would turn the score into inf.
        nll = -var.logpdf(y_actual.reshape(-1))
        if not return_mse_too:
            return nll
        else:
            mse = ((y_actual - mean) ** 2).sum()
            return nll, mse
    else:
        if not return_mse_too:
            result = SnpData(iid=y.iid, sid=['nLL'], val=np.empty((y.iid_count, 1)), name="nLL")
            for iid_index in xrange(y.iid_count):
                # Marginal (univariate) likelihood of one example: only the
                # diagonal entry of the predicted covariance is used.
                var = multivariate_normal(mean=mean[iid_index], cov=covar[iid_index, iid_index])
                nll = -var.logpdf(y_actual[iid_index])
                result.val[iid_index, 0] = nll
            return result
        else:
            raise Exception("need code for mse_too")
def combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=False, force_low_rank=False,snp_standardizer=None,kernel_standardizer=None,block_size=None):
    """
    Combine two square kernels into one, choosing a low-rank (SNP based) or
    full-rank (kernel based) representation from the SNP counts of the inputs.

    Three regimes are distinguished, using ``_Mixer.sid_counter`` on each kernel:

    * neither kernel has SNPs: ``K0`` is read as is; ``h2`` and the mixing weight default to 0.
    * the total SNP count is below the number of iids, or ``force_low_rank``:
      the SNP matrices are mixed and the result is wrapped in a ``SnpKernel`` (``mixer.do_g`` is True).
    * otherwise: the kernel values are mixed into a ``KernelData`` (``mixer.do_g`` is False).

    When only one kernel has SNPs, that kernel is returned and the mixing weight
    defaults to 0 (only K0) or 1 (only K1).

    :param K0: first kernel; must be square (``iid0 is iid1``).
    :param K1: second kernel; must be square and have the same iids as ``K0``.
    :param covar: passed through to the mixing search when ``mixing`` is None.
    :param y: passed through to the mixing search when ``mixing`` is None.
    :param mixing: weight used to mix the two kernels, or None to search for it.
    :param h2: passed to, and possibly updated by, the mixing search.
    :param force_full_rank: forwarded to ``_Mixer.sid_counter``.
    :param force_low_rank: forwarded to ``_Mixer.sid_counter``; also forces the low-rank regime.
    :param snp_standardizer: accepted for interface compatibility; not used in this function.
    :param kernel_standardizer: required (must not be None); used when standardizing in the low-rank regime.
    :param block_size: forwarded to ``SnpKernel`` when a mixed SNP kernel is built.

    :rtype: tuple ``(K, h2, mixer)`` -- the combined kernel, the (possibly updated) h2, and the
            ``_Mixer`` recording the mixing weight and trained standardizers.
    """
    # Check for None *before* dereferencing the kernels; otherwise a missing
    # kernel surfaces as an AttributeError and these checks can never fire.
    assert K0 is not None
    assert K1 is not None
    assert K0.iid0 is K0.iid1, "Expect K0 to be square"
    assert K1.iid0 is K1.iid1, "Expect K1 to be square"
    assert np.array_equal(K0.iid,K1.iid), "Expect K0 and K1 to having matching iids"
    assert kernel_standardizer is not None, "expect values for kernel_standardizer"

    from pysnptools.kernelstandardizer import Identity as KS_Identity

    mixer = _Mixer(False,KS_Identity(),KS_Identity(),mixing)

    sid_count_0 = _Mixer.sid_counter(K0, force_full_rank, force_low_rank)
    sid_count_1 = _Mixer.sid_counter(K1, force_full_rank, force_low_rank)

    #################################
    # Both Identity (or not given)
    #################################
    if sid_count_0 + sid_count_1 == 0:
        h2 = h2 or 0
        mixer.mixing = mixer.mixing or 0
        K = K0.read() #would be nice to use LinearRegression or low-rank with 0 snps

    #################################
    # Low rank: work with the SNP matrices (G) rather than the kernels
    #################################
    elif sid_count_0 + sid_count_1 < K0.iid_count or force_low_rank:
        mixer.do_g = True
        #!!!there is no need for block_size here because we want G0 in full. But if starting with SNPs and not low-rank then batches are needed and the two standardizers must be remembered for use later

        if sid_count_0 > 0:
            K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)
        if sid_count_1 > 0:
            K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=not mixer.do_g, kernel_standardizer=kernel_standardizer, return_trained=True)

        if sid_count_1 == 0:
            mixer.mixing = mixer.mixing or 0
            K = K0
        elif sid_count_0 == 0:
            mixer.mixing = mixer.mixing or 1
            K = K1
        else:
            # mixer.do_g was set to True at the top of this branch, so this
            # condition always holds here; it is kept as in the original.
            if mixer.do_g:
                G = np.empty((K0.iid_count, K0.sid_count + K1.sid_count))
                if mixer.mixing is None:
                    mixer.mixing, h2 = _find_mixing_from_Gs(G, covar, K0.snpreader.val, K1.snpreader.val, h2, y)

                if mixer.mixing == 0:
                    K = K0
                elif mixer.mixing == 1:
                    K = K1
                else:
                    _mix_from_Gs(G, K0.snpreader.val, K1.snpreader.val, mixer.mixing)
                    G = SnpData(iid=K0.iid,
                                sid=["K0_{0}".format(i) for i in xrange(K0.sid_count)]+["K1_{0}".format(i) for i in xrange(K1.sid_count)], #rename the sids so that they can't collide.
                                val=G,name="{0}&{1}".format(K0.snpreader,K1.snpreader),
                                pos=np.concatenate((K0.pos,K1.pos),axis=0)
                                )
                    K = SnpKernel(G,SS_Identity(),block_size=block_size)

    #################################
    # Full rank: work with the kernel values
    #################################
    else:
        mixer.do_g = False
        if sid_count_0 > 0: #!!!but what if we have SNP data but still need to remember the standardizer?
            K0, mixer.snp_trained0, mixer.kernel_trained0 = K0._read_with_standardizing(to_kerneldata=True,return_trained=True)#!!!pass in a new argument, the kernel_standardizer(???)

        if sid_count_1 > 0:
            K1, mixer.snp_trained1, mixer.kernel_trained1 = K1._read_with_standardizing(to_kerneldata=True,return_trained=True)

        if sid_count_1 == 0:
            mixer.mixing = mixer.mixing or 0
            K = K0
        elif sid_count_0 == 0:
            mixer.mixing = mixer.mixing or 1
            K = K1
        else:
            K = np.empty(K0.val.shape)
            if mixer.mixing is None:
                mixer.mixing, h2 = _find_mixing_from_Ks(K, covar, K0.val, K1.val, h2, y)
            _mix_from_Ks(K, K0.val, K1.val, mixer.mixing)
            # A standardized kernel is square with a trace equal to its size.
            assert K.shape[0] == K.shape[1] and abs(np.diag(K).sum() - K.shape[0]) < 1e-7, "Expect mixed K to be standardized"
            K = KernelData(val=K,iid=K0.iid)

    return K, h2, mixer
# Ad-hoc driver: (optionally) build a synthetic Bed file, split its SNPs into
# parts, and run single_snp on the SNP pairs of the first part.
from pysnptools.snpreader import Pheno, Bed
import pysnptools.util as pstutil

# Raw string, like the other Windows paths below: the backslashes are literal
# path separators, not escape sequences. (The value is unchanged.)
data_file = r'd:\OneDrive\programs\epiCornell\syndata.bed'

# One-time generation of the synthetic data file; disabled once the file exists.
# NOTE(review): the extent of this disabled section was reconstructed from the
# data flow (it ends once the file has been written) -- confirm against the original layout.
if False:
    from pysnptools.snpreader import SnpData
    import numpy as np
    bed1 = Bed("../../tests/datasets/synth/all")
    print(bed1.iid_count, bed1.sid_count, bed1.iid_count * bed1.sid_count)
    #goal 1500 individuals x 27000 SNP
    snpdata1 = bed1.read()
    iid = bed1.iid
    sid = ['sid{0}'.format(i) for i in xrange(27000)]
    # Tile the source values 3x down and 6x across, then keep the first 27000 columns.
    val = np.tile(snpdata1.val,(3,6))[:,:27000].copy()
    #snpdata = Pheno('pysnptools/examples/toydata.phe').read() # Read data from Pheno format
    snpdata2 = SnpData(iid, sid, val)
    print(snpdata2.iid_count, snpdata2.sid_count, snpdata2.iid_count * snpdata2.sid_count)
    Bed.write(snpdata2,data_file,count_A1=False)

synbed = Bed(data_file)
print(synbed.iid_count, synbed.sid_count, synbed.iid_count * synbed.sid_count)

part_count = 1000
part_list = list(split_on_sids(synbed,part_count))

# Test only the pairs formed within the first part.
pairs00 = _Pairs(part_list[0])
from fastlmm.association import single_snp
pheno_fn = r"d:\OneDrive\programs\epiCornell\pheno.txt"
cov_fn = r"d:\OneDrive\programs\epiCornell\cov.txt"
results_df = single_snp(pairs00, K0=synbed, pheno=pheno_fn, covar=cov_fn, leave_out_one_chrom=False, count_A1=True)
def predict(self,X=None,K0_whole_test=None,K1_whole_test=None,iid_if_none=None):
    """
    Method for predicting from a fitted :class:`FastLMM` predictor.
    If the examples in X, K0_whole_test, K1_whole_test are not the same, they will be reordered and intersected.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: A similarity matrix from all the examples to the test examples. Alternatively,
           the test SNPs needed to construct such a similarity matrix.
           Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
           Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
    :type K0_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param K1_whole_test: A second similarity matrix from all the examples to the test examples. Alternatively,
           the test SNPs needed to construct such a similarity matrix.
           Can be any :class:`.SnpReader`. If you give a string, can be the name of a PLINK-formatted Bed file.
           Can be PySnpTools :class:`.KernelReader`. If you give a string it can be the name of a :class:`.KernelNpz` file.
    :type K1_whole_test: :class:`.SnpReader` or a string or :class:`.KernelReader`

    :param iid_if_none: Examples to predict for if no X, K0_whole_test, K1_whole_test is provided.
    :type iid_if_none: an ndarray of two strings

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    #assert K0_whole_test is not None, "K0_whole_test must be given"

    #!!!later is it too wasteful to keep both G0_train, G1_train, and lmm.G when storing to disk?
    #!!!later all _kernel_fixup's should use block_size input

    # Normalize the inputs. K1 and X take their default iids from the
    # fixed-up K0, so K0 has to be processed first.
    K0_whole_test_b = _kernel_fixup(K0_whole_test, train_snps=self.G0_train, iid_if_none=iid_if_none, standardizer=self.mixer.snp_trained0, test=K0_whole_test, test_iid_if_none=None, block_size=self.block_size)
    K1_whole_test = _kernel_fixup(K1_whole_test, train_snps=self.G1_train, iid_if_none=K0_whole_test_b.iid0, standardizer=self.mixer.snp_trained1, test=K1_whole_test, test_iid_if_none=K0_whole_test_b.iid1, block_size=self.block_size)
    X = _pheno_fixup(X,iid_if_none=K0_whole_test_b.iid1)
    # Reorder/intersect so all three inputs describe the same test examples.
    K0_whole_test_c, K1_whole_test, X = intersect_apply([K0_whole_test_b, K1_whole_test, X],intersect_before_standardize=True,is_test=True)
    # Standardize the test covariates with the standardizer trained at fit time.
    X = X.read().standardize(self.covar_unit_trained)
    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    X = SnpData(iid=X.iid,
                          sid=self._new_snp_name(X),
                          val=np.c_[X.read().val,np.ones((X.iid_count,1))])
    assert np.array_equal(X.sid,self.covar_sid), "Expect covar sids to be the same in train and test."

    # Slice the whole-by-test kernels into train-by-test blocks (rows are the
    # training iids) ...
    train_idx0 = K0_whole_test_c.iid0_to_index(self.K_train_iid)
    K0_train_test = K0_whole_test_c[train_idx0,:]
    train_idx1 = K1_whole_test.iid0_to_index(self.K_train_iid)
    K1_train_test = K1_whole_test[train_idx1,:]
    # ... and test-by-test blocks (rows are the test iids, taken from K0's iid1).
    test_idx0 = K0_whole_test_c.iid0_to_index(K0_whole_test_c.iid1)
    K0_test_test = K0_whole_test_c[test_idx0,:]
    # Internal sanity check: the test-by-test block must be square (same iid object on both axes).
    if K0_test_test.iid0 is not K0_test_test.iid1:
        raise Exception("real assert")
    test_idx1 = K1_whole_test.iid0_to_index(K0_whole_test_c.iid1)
    K1_test_test = K1_whole_test[test_idx1,:]

    if self.mixer.do_g:
        ###################################################
        # low rank from Rasmussen eq 2.9 + noise term added to covar
        ###################################################
        # NOTE(review): Gstar.test presumably holds the mixed test-side SNP values -- confirm against _Mixer.g_mix.
        Gstar = self.mixer.g_mix(K0_train_test,K1_train_test)
        # Split the total variance sigma2 into a genetic part (fraction h2) and a noise part.
        varg = self.h2 * self.sigma2
        vare = (1.-self.h2) * self.sigma2
        Ainv = LA.inv((1./vare) * np.dot(self.G.T,self.G) + (1./varg)*np.eye(self.G.shape[1]))
        testAinv = np.dot(Gstar.test.val, Ainv)
        # Fixed-effect prediction plus the random-effect term computed from the training residuals.
        pheno_predicted = np.dot(X.val,self.beta) + (1./vare) * np.dot(np.dot(testAinv,self.G.T),self.y-np.dot(self.X,self.beta))
        pheno_predicted = pheno_predicted.reshape(-1,1)
        # Predictive covariance; the vare * I term is the noise added to the diagonal.
        covar  = np.dot(testAinv,Gstar.test.val.T) + vare * np.eye(Gstar.test.val.shape[0])

    else:
        # Full rank: rebuild an LMM from the state stored at fit time and let it do the prediction.
        lmm = LMM()
        lmm.U = self.U
        lmm.S = self.S
        lmm.G = self.G
        lmm.y = self.y
        lmm.Uy = self.Uy
        lmm.X = self.X
        lmm.UX = self.UX

        Kstar = self.mixer.k_mix(K0_train_test,K1_train_test) #!!!later do we need/want reads here? how about view_OK?
        lmm.setTestData(Xstar=X.val, K0star=Kstar.val.T)

        Kstar_star = self.mixer.k_mix(K0_test_test,K1_test_test) #!!!later do we need/want reads here?how about view_OK?
        pheno_predicted, covar = lmm.predict_mean_and_variance(beta=self.beta, h2=self.h2,sigma2=self.sigma2, Kstar_star=Kstar_star.val)

    #pheno_predicted = lmm.predictMean(beta=self.beta, h2=self.h2,scale=self.sigma2).reshape(-1,1)
    # Package the results: predicted means keyed by the test iids and the
    # phenotype name, and the predictive covariance over the test iids.
    ret0 = SnpData(iid = X.iid, sid=self.pheno_sid,val=pheno_predicted,pos=np.array([[np.nan,np.nan,np.nan]]),name="lmm Prediction")

    from pysnptools.kernelreader import KernelData
    ret1 = KernelData(iid=K0_test_test.iid,val=covar)
    return ret0, ret1
def test_lr_real(self):
    """
    Fit LinearRegression on a synthetic phenotype (2*covar + 100 + noise),
    round-trip the model through joblib, and compare the predictions and
    covariances (test-on-train and test-on-test) with the stored "lr2"
    reference files. An ordinary least-squares fit is also computed for the
    optional plots.
    """
    do_plot = False
    import pylab
    logging.info("TestLinRegTrain test_lr_real")

    # The first 10 iids are held out for testing; the rest are used for training.
    test_idx = np.r_[0:10]
    train_idx = np.r_[10:self.snpreader_whole.iid_count]

    # The covariate is just the sequence 0, 1, 2, ...
    covar_all = self.covariate_whole.read()
    covar_all.val = np.array([[float(row)] for row in xrange(covar_all.iid_count)])
    covariate_train = covar_all[train_idx, :].read()
    covariate_test = covar_all[test_idx, :].read()
    K0_test_test = KernelIdentity(covariate_test.iid)

    # Phenotype: pheno = 2*covar + 100 + normal(0,1)*10
    pheno_all = self.pheno_whole.read()
    np.random.seed(0)
    noise = np.random.normal(size=covar_all.val.shape)
    pheno_all.val = covar_all.val * 2.0 + 100 + noise * 10
    pheno_train = pheno_all[train_idx, :].read()
    pheno_test = pheno_all[test_idx, :].read()

    if do_plot:
        #Plot training x and y, testing x and y
        pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
        pylab.suptitle("Plot training x and y, testing x and y")
        pylab.show()

    # Closed-form least squares on [covariate, 1] for comparison in the plots.
    Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count, 1))]
    Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count, 1))]
    lstsq_result = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
    weights = lstsq_result[0]        # regression weights
    residual_ss = lstsq_result[1]    # squared residuals
    rank = lstsq_result[2]           # rank of design matrix
    train_count = pheno_train.iid_count

    REML = False
    if REML:
        sigma2 = float(residual_ss / (train_count - rank))
        nLL = train_count * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * residual_ss
        nLL -= 0.5 * rank * np.log(2 * np.pi * sigma2) #REML term
    else:
        sigma2 = float(residual_ss / train_count)
        nLL = train_count * 0.5 * np.log(2 * np.pi * sigma2) + train_count * 0.5

    predicted = Xtest.dot(weights)
    yerr = [np.sqrt(sigma2)] * len(predicted)
    if do_plot:
        pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
        pylab.xlim([-1, 10])
        pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
        pylab.suptitle("real linear regression: actual to prediction")
        pylab.show()

    #These should all give the same result
    first_name = None
    kernel_cases = [("Identity Kernel", None, None)]
    for name, K0_train, K0_whole_test in kernel_cases:
        first_name = first_name or name

        #Learn model, save, load
        fitted = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_lr_real.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fitted, filename)
        model = joblib.load(filename)

        do_test_on_train = True
        if do_test_on_train:
            #Predict with model (test on train)
            predicted_pheno, pred_covar = model.predict(K0_whole_test=K0_train, X=covariate_train)
            Dat.write(self.file_name("lr_reala_" + name), predicted_pheno)
            #kludge to write kernel to text format
            covar2 = SnpData(iid=pred_covar.row, sid=pred_covar.col[:, 1], val=pred_covar.val)
            Dat.write(self.file_name("lr_reala.cov_" + name), covar2)

            yerr = np.sqrt(np.diag(pred_covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                pylab.xlim([0, 50])
                pylab.ylim([100, 200])
                pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                pylab.suptitle(name + ": test on train: train X to true target (green) and prediction (red)")
                pylab.show()

            # Reference files are keyed by the first kernel name because every
            # variant is expected to give the same result.
            self.compare_files(predicted_pheno, "lr2a_" + first_name)
            self.compare_files(covar2, "lr2a.cov_" + first_name)

        #Predict with model (test on test)
        predicted_pheno, pred_covar = model.predict(K0_whole_test=K0_whole_test, X=covariate_test)
        Dat.write(self.file_name("lr_realb_" + name), predicted_pheno)
        #kludge to write kernel to text format
        covar2 = SnpData(iid=pred_covar.row, sid=pred_covar.col[:, 1], val=pred_covar.val)
        Dat.write(self.file_name("lr_realb.cov_" + name), covar2)

        yerr = np.sqrt(np.diag(pred_covar.val))
        predicted = predicted_pheno.val
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle(name + ": test on test: test X to true target (green) and prediction (red)")
            pylab.show()
            ## Plot y and predicted y (test on train)
            #pylab.plot(pheno_test.val,predicted_pheno.val,".")
            #pylab.suptitle(name+": test on test: true target to prediction")
            #pylab.show()

        self.compare_files(predicted_pheno, "lr2b_" + first_name)
        self.compare_files(covar2, "lr2b.cov_" + first_name)
def snp_gen(fst, dfr, iid_count, sid_count, maf_low=.05, maf_high=.5, seed=0, sibs_per_family=10, freq_pop_0=.5, chr_count=None, label_with_pop=False):
    """Generates a random :class:`.SnpData`

    :param fst: Degree of Population Structure, e.g. 0 (a special case), 0.005, 0.01, 0.05, 0.1
    :type fst: float

    :param dfr: Degree of Family Relatedness, the fraction of individuals belonging to a family, ie. fracSibs, e.g. 0.0, 0.5, 0.6, 0.7, 0.8, 0.9
    :type dfr: float

    :param iid_count: The number of individuals to generate. Because of rounding the actual number may be less.
    :type iid_count: int

    :param sid_count: The number of snps to generate.
    :type sid_count: int

    :param maf_low: (default .05) lower bound of uniformly-generated Minor allele frequency
    :type maf_low: float

    :param maf_high: (default .5) upper bound of uniformly-generated Minor allele frequency
    :type maf_high: float

    :param seed: (default 0) Random seed. If None, the random generator is not re-seeded.
    :type seed: int

    :param sibs_per_family: (default 10) number of siblings in each family
    :type sibs_per_family: int

    :param freq_pop_0: (default .5) Fraction of individuals in population 0 (the rest will be in population 1)
    :type freq_pop_0: float

    :param chr_count: (default one chromosome per SNP) Number of chromosomes to which SNPs should be assigned. The SNPs will
     be assigned as evenly as possible. Chromosome names are integers starting with 1. SNP positions within a chromosome are sequential
     integers starting with 1.
    :type chr_count: int

    :param label_with_pop: (default False) If False, each iid is ``["i_<n>", "f_<n>"]``. If True, the first iid field is
     "0" (parents and children of population 0), "1" (parents and children of population 1) or "2" (children whose parents
     may come from either population), and the second field is an index within that group.
    :type label_with_pop: bool

    :rtype: :class:`.SnpData`
    :Example:

    >>> snpdata = snp_gen(fst=.1,dfr=.5,iid_count=200,sid_count=20,maf_low=.05,seed=6)
    >>> print int(snpdata.iid_count), int(snpdata.sid_count) #because of rounding got 190 individuals
    190 20

    """
    assert 0 <= freq_pop_0 and freq_pop_0 <= 1.0, "assert 0 <= freq_pop_0 and freq_pop_0 <=1.0"

    if seed is not None:
        # np.random.seed only accepts values in 0 .. 2**32-1, but sys.maxint is
        # larger than that on 64-bit builds, so reduce a second time. Any seed
        # the original reduction already mapped into range is left unchanged.
        np.random.seed(int(seed % sys.maxint) % (2 ** 32))

    iid_solo_count = iid_count - iid_count * dfr
    family_count = int(iid_count * dfr / (2 * sibs_per_family))

    ancestral = np.random.uniform(maf_low, maf_high, sid_count) #sample ancestral allele frequencies

    # snp_list ends up with five pieces, in this order:
    #   parents pop 0, kids pop 0, parents pop 1, kids pop 1, kids of mixed parents
    snp_list = []
    for population_index, freq_pop in enumerate([freq_pop_0, 1.0 - freq_pop_0]):
        logging.info("Simulating SNPs from a population %i" % population_index)
        snps_parents = _generate_snps(ancestral, fst, int(iid_solo_count * freq_pop), sid_count)
        snp_list.append(snps_parents)
        snp_list.append(_generate_kids(parent_snps=snps_parents, family_count=int(freq_pop * family_count), sibs_per_family=sibs_per_family))

    # Children whose parents are drawn from everything generated so far.
    snp_list.append(_generate_kids(parent_snps=np.concatenate(snp_list), family_count=family_count, sibs_per_family=sibs_per_family))
    val = np.concatenate(snp_list)

    if not label_with_pop:
        iid = np.array([["i_{0}".format(iid_index), "f_{0}".format(iid_index)] for iid_index in xrange(val.shape[0])], dtype=str).reshape(-1, 2)
    else:
        assert len(snp_list) == 5, "real assert"
        iid0 = [["0", str(iid_index)] for iid_index in xrange(len(snp_list[0]) + len(snp_list[1]))] #parents and children of pop 0
        iid1 = [["1", str(iid_index)] for iid_index in xrange(len(snp_list[2]) + len(snp_list[3]))] #parents and children of pop 1
        iid2 = [["2", str(iid_index)] for iid_index in xrange(len(snp_list[4]))] #children with parents in any pop
        iid = np.array(iid0 + iid1 + iid2, dtype=str).reshape(-1, 2)

    sid = np.array(["snp_{0}".format(sid_index) for sid_index in xrange(val.shape[1])], dtype=str)

    if chr_count is None:
        chr_count = len(sid)

    assert len(sid) == 0 or chr_count > 0, "chr_count must be at least 1 (unless sid_count is 0)"
    # np.ceil replaces the deprecated scipy alias sp.ceil (same function).
    sid_per_chrom = int(np.ceil(float(len(sid)) / max(1, chr_count)))
    # pos columns: chromosome, then the within-chromosome index twice
    # (both 1-based).
    pos = np.array([[1 + sid_index // sid_per_chrom, 1 + sid_index % sid_per_chrom, 1 + sid_index % sid_per_chrom]
                    for sid_index in xrange(len(sid))])
    if len(sid) == 0: #make it work when no sids are wanted
        pos = pos.reshape(len(sid), 3)

    snpdata = SnpData(
        iid=iid, sid=sid, val=val, pos=pos,
        parent_string="snp_gen(fst={0}, dfr={1}, iid_count={2}, sid_count={3}, maf_low={4}, maf_high={5}, seed={6}, sibs_per_family={7}, freq_pop_0={8})"
        .format(fst, dfr, iid_count, sid_count, maf_low, maf_high, seed, sibs_per_family, freq_pop_0))

    if snpdata.iid_count != iid_count:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Because of rounding the actual number of iids is {0} rather than the requested {1}".format(snpdata.iid_count, iid_count))

    return snpdata
def test_cpp_std(self):
    """
    Check that the native and the pure-Python standardization paths agree.

    Covers every combination of memory order (C/F), dtype (float64/float32),
    standardizer (Unit/Beta), a constant SNP column in the training data, and
    missing values -- both when the statistics are gathered (training data)
    and when already-trained statistics are applied (second data set).
    """
    sid_count = 20

    #Order C vs F
    for order in ['C', 'F']:
        #32 vs 64
        for dtype in [np.float64, np.float32]:
            # float32 results can only be expected to agree to fewer decimals
            decimal = 10 if dtype == np.float64 else 5

            #unit vs beta
            for std in [stdizer.Unit(), stdizer.Beta(2, 10)]:
                np.random.seed(0)
                train_data = SnpData(
                    iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                    sid=[str(i) for i in range(sid_count)],
                    val=np.array(np.random.randint(3, size=[3, sid_count]), dtype=dtype, order=order))
                other_data = SnpData(
                    iid=[["3", "3"], ["4", "4"]],
                    sid=[str(i) for i in range(sid_count)],
                    val=np.array(np.random.randint(3, size=[2, sid_count]), dtype=dtype, order=order))

                # Note: the modifications below are applied in place and so
                # persist for the remaining iterations of the inner loops.
                #has SNC
                for has_SNC_in_train in [False, True]:
                    if has_SNC_in_train:
                        train_data.val[:, 1] = 0

                    #missing data
                    for has_missing_data in [False, True]:
                        if has_missing_data:
                            train_data.val[0, 2] = np.nan
                            other_data.val[0, 2] = np.nan

                        #gather stats vs not
                        cpp_train, cpp_trained = train_data.read(order=order, dtype=dtype).standardize(
                            std, return_trained=True, force_python_only=False)
                        py_train, py_trained = train_data.read(order=order, dtype=dtype).standardize(
                            std, return_trained=True, force_python_only=True)
                        np.testing.assert_array_almost_equal(cpp_train.val, py_train.val, decimal=decimal)
                        np.testing.assert_array_almost_equal(cpp_trained.stats, py_trained.stats, decimal=decimal)

                        assert (np.inf in cpp_trained.stats[:, 1]) == has_SNC_in_train
                        assert (np.inf in py_trained.stats[:, 1]) == has_SNC_in_train
                        if has_SNC_in_train:
                            assert np.array_equal(cpp_train.val[:, 1], np.zeros([cpp_train.val.shape[0]]))
                            assert np.array_equal(py_train.val[:, 1], np.zeros([py_train.val.shape[0]]))
                        if has_missing_data:
                            assert cpp_train.val[0, 2] == 0
                            assert py_train.val[0, 2] == 0

                        #uses stats
                        cpp_applied = other_data.read(order=order, dtype=dtype).standardize(
                            cpp_trained, force_python_only=False)
                        py_applied = other_data.read(order=order, dtype=dtype).standardize(
                            py_trained, force_python_only=True)
                        np.testing.assert_array_almost_equal(cpp_applied.val, py_applied.val, decimal=decimal)
                        #Make sure we haven't messed up the train stats
                        np.testing.assert_array_almost_equal(cpp_trained.stats, py_trained.stats, decimal=decimal)

                        if has_SNC_in_train:
                            assert np.array_equal(cpp_applied.val[:, 1], np.zeros([cpp_applied.val.shape[0]]))
                            assert np.array_equal(py_applied.val[:, 1], np.zeros([py_applied.val.shape[0]]))
                        if has_missing_data:
                            assert cpp_applied.val[0, 2] == 0
                            assert py_applied.val[0, 2] == 0

    logging.info("done with 'test_cpp_std'")