def divideData(filename, direct, num=5, mph=3, delet=True):
    """Run ``varRes`` on ``num`` disjoint slices of the samples.

    The phenotype and SNP readers returned by ``getData(filename, mph=mph)``
    are reordered with the index vector returned by ``perm(n)`` (presumably a
    random permutation -- confirm against ``perm``), cut into ``num``
    consecutive slices, and each slice is written below ``direct`` with
    ``Bed.write`` (prefix ``tempFile_<i>``) and ``Pheno.write``
    (``tempFile_<i>.phen``) before ``varRes`` is called on that prefix.

    Args:
        filename: forwarded to ``getData``.
        direct: directory receiving the temporary files; also forwarded to
            ``varRes``.
        num: number of slices.
        mph: forwarded to ``getData``.
        delet: when true, delete every file starting with the slice's
            ``tempFile_<i>`` prefix once its estimate has been collected.

    Returns:
        list: one ``varRes`` result per slice, in slice order.
    """
    import glob

    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count

    # Apply the same reordering to phenotypes and SNPs so rows stay aligned.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]

    # num + 1 cut points; slice i covers rows div[i] .. div[i + 1] - 1.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]
        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())
        varEsts.append(varRes(fileTemp, direct))
        if delet:
            # Expand the "<prefix>*" pattern in-process instead of shelling
            # out to `rm`: works on every platform and is not confused by
            # spaces or shell metacharacters in `direct`.
            for leftover in glob.glob(fileTemp + "*"):
                try:
                    os.remove(leftover)
                except OSError:
                    # Cleanup is best effort; a file that cannot be removed
                    # must not discard the estimates already computed.
                    pass
    return varEsts
def test_c_reader_pheno(self):
    """Check that Pheno reads, writes and re-reads the toy phenotype data consistently.

    Steps exercised:
      * read ``examples/toydata.phe`` and check the value dtype is float64;
      * write it back out with one value set to NaN and compare the re-read
        values with what was written;
      * build ``Pheno`` from the dictionary returned by
        ``pstpheno.loadOnePhen`` (with and without ``vectorize=True``);
      * build ``Pheno`` from ``None`` plus ``iid_if_none``;
      * read ``toydata.id.phe`` and ``toydata.fid.phe`` and compare their
        values with those of ``toydata.phe``.
    """
    toy_phe = self.currentFolder + "/examples/toydata.phe"

    snpdata1 = Pheno(toy_phe).read()
    self.assertEqual(np.float64, snpdata1.val.dtype)

    # Inject a missing value to test writing and reading missing values.
    # `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
    snpdata1.val[1, 0] = np.nan
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)
    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    str(snpreader)  # only checks that the string conversion does not raise
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

    # Reload the unmodified data as the reference for the remaining checks.
    snpdata1 = Pheno(toy_phe).read()
    import pysnptools.util.pheno as pstpheno

    # Named `pheno_dict` so the builtin `dict` is not shadowed.
    pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="")
    snpdata3 = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="", vectorize=True)
    # unittest assertions are used instead of `assert`, which `python -O` strips.
    self.assertEqual(1, len(pheno_dict['vals'].shape), "test 1-d array of values")
    snpdata3 = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    self.assertTrue((snpdata4.row == snpdata1.row).all())
    self.assertEqual(0, snpdata4.col_count)

    snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)

    snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
def divideData(filename, direct, num=5, mph=3, delet=True):
    """Split the samples into ``num`` slices and collect ``varRes`` for each.

    ``getData(filename, mph=mph)`` supplies a phenotype reader and a SNP
    reader. Both are reordered with the indices returned by ``perm(n)``
    (presumably a random permutation -- confirm against ``perm``) and then
    cut into ``num`` consecutive slices. Each slice is written under
    ``direct`` as ``tempFile_<i>`` via ``Bed.write`` and ``tempFile_<i>.phen``
    via ``Pheno.write``, and ``varRes`` is run on that file prefix.

    Args:
        filename: forwarded to ``getData``.
        direct: directory for the temporary files; also given to ``varRes``.
        num: number of slices.
        mph: forwarded to ``getData``.
        delet: if true, remove all files whose name starts with the slice's
            ``tempFile_<i>`` prefix after its estimate is stored.

    Returns:
        list: the ``varRes`` results, one per slice, in slice order.
    """
    import glob

    # One parenthesised argument keeps the output identical on Python 2 and 3.
    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count

    # One shared index vector keeps phenotype rows and SNP rows aligned.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]

    # Slice boundaries: div[i] is the first row of slice i, div[num] == n.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]
        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())
        varEsts.append(varRes(fileTemp, direct))
        if delet:
            # Delete by globbing in Python rather than building an `rm`
            # shell command: portable, and safe when `direct` contains
            # spaces or characters the shell would interpret.
            for stale_path in glob.glob(fileTemp + "*"):
                try:
                    os.remove(stale_path)
                except OSError:
                    # Best-effort cleanup; never lose the computed estimates
                    # because a temporary file could not be deleted.
                    pass
    return varEsts
def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
    """Run ``single_snp_select`` on the synthetic data with auto-selected PCs as covariates.

    The PCs come from ``compute_auto_pcs`` and are written to
    ``sel_plus_pc.pcs.txt`` in ``self.tempout_dir``. The selection results are
    logged and handed to ``self.compare_files`` together with the result file
    name, which is ``sel_plus_pc_h2IsHalf`` when ``h2 == .5`` and
    ``sel_plus_pc_h2Search`` otherwise.
    """
    do_plot = False  # not read anywhere in this method
    use_cache = False  # with False, the PCs are recomputed on every run

    # define file names
    bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
    phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    folds_fn = self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt"
    pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

    if use_cache and os.path.exists(pcs_fn):
        logging.info("Using top pcs's cache")
        covar = Pheno(pcs_fn)
    else:
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
        pc_count = covar["vals"].shape[1]
        logging.info("selected number of PCs: {0}".format(pc_count))
        pcs_as_snpdata = SnpData(iid=covar['iid'],
                                 sid=covar['header'],
                                 val=covar['vals'])
        Pheno.write(pcs_fn, pcs_as_snpdata)

    mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    # Built but not passed on: the runner argument below is commented out.
    runner = mf_to_runner_function(mf_name)(20)

    logging.info(
        "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
            h2, force_low_rank, force_full_rank))

    if h2 == .5:
        h2_tag = "h2IsHalf"
    else:
        h2_tag = "h2Search"
    result_file_name = "sel_plus_pc_{0}".format(h2_tag)
    output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

    # 0..10 in steps of 1, 20..100 in steps of 10, then a coarser tail.
    k_values = (list(range(0, 11)) + list(range(20, 101, 10)) +
                [125, 160, 200, 250, 320, 400, 500, 630, 800, 1000])

    results = single_snp_select(
        test_snps=bed_fn,
        G=bed_fn,
        pheno=phen_fn,
        k_list=k_values,
        h2=h2,
        n_folds=folds_fn,
        covar=covar,
        output_file_name=output_file_name,
        force_low_rank=force_low_rank,
        force_full_rank=force_full_rank,
        GB_goal=2,
        # NOTE(review): fixed to False here although this method's count_A1
        # argument is forwarded to compute_auto_pcs -- confirm this is intended.
        count_A1=False
        #runner = runner
    )

    logging.info(results.head())
    self.compare_files(results, result_file_name)
def test_c_reader_pheno(self):
    """Exercise the Pheno reader/writer on the toy phenotype files.

    Covers, in order: reading ``examples/toydata.phe`` (values must be
    float64); writing it with an injected NaN and reading it back; building
    ``Pheno`` from the dictionary produced by ``pstpheno.loadOnePhen`` (plain
    and with ``vectorize=True``); building ``Pheno`` from ``None`` with
    ``iid_if_none``; and reading ``toydata.id.phe`` / ``toydata.fid.phe``,
    whose values must match those of ``toydata.phe``.
    """
    examples_phe = self.currentFolder + "/examples/toydata.phe"

    snpdata1 = Pheno(examples_phe).read()
    self.assertEqual(np.float64, snpdata1.val.dtype)

    # Inject a missing value to test writing and reading missing values.
    # Spelled `np.nan`: NumPy 2 dropped the `np.NaN` alias.
    snpdata1.val[1, 0] = np.nan
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)
    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    # The result is not needed; this only verifies that str() works.
    str(snpreader)
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val,
                                         snpdata2.val,
                                         decimal=10)

    # Fresh, unmodified copy used as the reference from here on.
    snpdata1 = Pheno(examples_phe).read()
    import pysnptools.util.pheno as pstpheno

    # `loaded` rather than `dict`, so the builtin stays usable.
    loaded = pstpheno.loadOnePhen(examples_phe, missing="")
    snpdata3 = Pheno(loaded).read()
    np.testing.assert_array_almost_equal(snpdata1.val,
                                         snpdata3.val,
                                         decimal=10)

    loaded = pstpheno.loadOnePhen(examples_phe, missing="", vectorize=True)
    # TestCase assertions still run under `python -O`, unlike bare `assert`.
    self.assertEqual(1, len(loaded['vals'].shape), "test 1-d array of values")
    snpdata3 = Pheno(loaded).read()
    np.testing.assert_array_almost_equal(snpdata1.val,
                                         snpdata3.val,
                                         decimal=10)

    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    self.assertTrue((snpdata4.row == snpdata1.row).all())
    self.assertEqual(0, snpdata4.col_count)

    snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val,
                                         snpdata5.val,
                                         decimal=10)

    snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val,
                                         snpdata6.val,
                                         decimal=10)
def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
    """Select SNPs on the synthetic data set using automatically chosen PCs as covariates.

    ``compute_auto_pcs`` provides the covariates, which are also saved with
    ``Pheno.write`` to ``sel_plus_pc.pcs.txt`` under ``self.tempout_dir``.
    ``single_snp_select`` is then run and its result is logged and passed to
    ``self.compare_files`` under the name ``sel_plus_pc_h2IsHalf`` (when
    ``h2 == .5``) or ``sel_plus_pc_h2Search``.
    """
    do_plot = False  # never read below
    use_cache = False  # False forces the PCs to be recomputed

    # define file names
    bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
    phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

    cache_usable = use_cache and os.path.exists(pcs_fn)
    if not cache_usable:
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
        logging.info(
            "selected number of PCs: {0}".format(covar["vals"].shape[1]))
        Pheno.write(
            pcs_fn,
            SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals']))
    else:
        logging.info("Using top pcs's cache")
        covar = Pheno(pcs_fn)

    mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    # Constructed only; the call below leaves its runner argument commented out.
    runner = mf_to_runner_function(mf_name)(20)

    progress_msg = "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
        h2, force_low_rank, force_full_rank)
    logging.info(progress_msg)

    result_file_name = "sel_plus_pc_{0}".format(
        "h2IsHalf" if h2 == .5 else "h2Search")
    output_file_name = os.path.join(self.tempout_dir,
                                    result_file_name) + ".txt"

    candidate_ks = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        20, 30, 40, 50, 60, 70, 80, 90, 100,
        125, 160, 200, 250, 320, 400, 500, 630, 800, 1000,
    ]
    fold_file = self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt"

    results = single_snp_select(
        test_snps=bed_fn,
        G=bed_fn,
        pheno=phen_fn,
        k_list=candidate_ks,
        h2=h2,
        n_folds=fold_file,
        covar=covar,
        output_file_name=output_file_name,
        force_low_rank=force_low_rank,
        force_full_rank=force_full_rank,
        GB_goal=2,
        # NOTE(review): always False, while the count_A1 parameter only
        # reaches compute_auto_pcs -- verify that this is deliberate.
        count_A1=False
        #runner = runner
    )

    logging.info(results.head())
    self.compare_files(results, result_file_name)
from pysnptools.snpreader import Pheno

# Open the phenotype file and print a one-line summary of the reader.
phenoreader = Pheno("pheno_10_causals.txt")
summary_fields = (phenoreader, phenoreader.iid_count, phenoreader.sid_count,
                  phenoreader.sid, phenoreader.pos)
# str() of each field joined by single spaces, followed by a newline.
print(" ".join(str(field) for field in summary_fields))
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]]

# Load the values and show them.
phenodata = phenoreader.read()
print(phenodata.val)
#[[ 4.85339514e-01]
# [ -2.07698457e-01]
# [ 1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it to Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a snpdata on the fly and write to Bed
snpdata1 = SnpData(
    iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
    sid=['snp1', 'snp2'],
    val=[[0, 1], [2, 1], [1, np.nan]],
)
Bed.write("deleteme1.bed", snpdata1)

# Pheno is slow because it's text. Bed format can only hold 0,1,2,missing.
# Use SnpNpz for fastest read/write times, smallest file size
from pysnptools.snpreader import SnpNpz

SnpNpz.write("deleteme1010.snp.npz", snpdata1010)
from pysnptools.snpreader import Pheno

# Create a reader for the phenotype file and print its key properties.
phenoreader = Pheno("pheno_10_causals.txt")
reader_facts = [
    phenoreader,
    phenoreader.iid_count,
    phenoreader.sid_count,
    phenoreader.sid,
    phenoreader.pos,
]
# One line: the str() of every entry, separated by single spaces.
print(" ".join(str(fact) for fact in reader_facts))
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]]

# Read the phenotype values and print them.
phenodata = phenoreader.read()
print(phenodata.val)
#[[ 4.85339514e-01]
# [ -2.07698457e-01]
# [ 1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it to Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a snpdata on the fly and write to Bed
snpdata1 = SnpData(iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
                   sid=['snp1', 'snp2'],
                   val=[[0, 1], [2, 1], [1, np.nan]])
Bed.write("deleteme1.bed", snpdata1)

# Pheno is slow because it's text. Bed format can only hold 0,1,2,missing.
# Use SnpNpz for fastest read/write times, smallest file size
from pysnptools.snpreader import SnpNpz

SnpNpz.write("deleteme1010.snp.npz", snpdata1010)

# Use SnpHdf5 for random-access reads, good speed and size, and compatibility outside Python