Beispiel #1
0
def divideData(filename, direct, num=5, mph=3, delet=True):
    """Shuffle the data, split it into ``num`` parts and run ``varRes`` on each.

    The two readers returned by ``getData`` are reordered with one shared
    permutation and cut into ``num`` contiguous slices. Each slice is written
    to ``<direct>/tempFile_<i>`` (via ``Bed.write``) and
    ``<direct>/tempFile_<i>.phen`` (via ``Pheno.write``) and then passed to
    ``varRes``.

    Args:
        filename: Forwarded unchanged to ``getData``.
        direct: Directory that receives the per-component temporary files.
        num: Number of components to split the individuals into.
        mph: Forwarded to ``getData`` as its ``mph`` keyword.
        delet: When true, every ``tempFile_<i>*`` file in ``direct`` is removed
            after component ``i`` has been processed.

    Returns:
        list: The value returned by ``varRes`` for each component, in order.
    """
    import glob  # local import: only needed for the temp-file cleanup below

    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count

    # One permutation applied to both readers keeps their rows aligned.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]

    # num + 1 cut points; slice i covers rows div[i] .. div[i + 1] - 1.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

    varEsts = []

    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]

        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())

        varEsts.append(varRes(fileTemp, direct))

        if delet:
            # Delete the files directly instead of running "rm" through a
            # shell: a shell command breaks on paths containing spaces or
            # metacharacters and is not available on every platform.
            for stale in glob.glob(fileTemp + "*"):
                try:
                    os.remove(stale)
                except OSError as err:
                    # Cleanup stays best-effort, as it was with "rm".
                    print("Could not remove " + stale + ": " + str(err))

    return varEsts
Beispiel #2
0
    def test_c_reader_pheno(self):
        """Exercise the ``Pheno`` reader: round trip, dict input and id variants.

        Steps, in order:
          1. Read ``toydata.phe`` and check the values are ``float64``.
          2. Inject a NaN, write the data with ``Pheno.write``, read it back and
             compare.
          3. Load the same file through ``pstpheno.loadOnePhen`` (both default
             and ``vectorize=True``) and compare.
          4. Build a ``Pheno`` from ``None`` with ``iid_if_none`` and check it
             has the same rows and no columns.
          5. Read ``toydata.id.phe`` and ``toydata.fid.phe`` and compare their
             values with the original.
        """
        toy_phe = self.currentFolder + "/examples/toydata.phe"
        snpdata1 = Pheno(toy_phe).read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values.
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
        snpdata1.val[1, 0] = np.nan
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)  # only checks that str() on the reader does not raise
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        snpdata1 = Pheno(toy_phe).read()
        import pysnptools.util.pheno as pstpheno
        # Named pheno_dict rather than dict so the builtin is not shadowed.
        pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="")
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="", vectorize=True)
        # TestCase assertions are not stripped when Python runs with -O.
        self.assertEqual(len(pheno_dict['vals'].shape), 1, "test 1-d array of values")
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        self.assertTrue((snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0)

        snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
        snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
Beispiel #3
0
def divideData(filename, direct, num=5, mph=3, delet=True):
    """Randomly partition the data into ``num`` components and evaluate each.

    ``getData`` supplies two readers. Both are reordered with the same
    permutation, sliced into ``num`` contiguous chunks, written to temporary
    files named ``<direct>/tempFile_<i>`` (``Bed.write``) and
    ``<direct>/tempFile_<i>.phen`` (``Pheno.write``), and handed to ``varRes``.

    Args:
        filename: Forwarded unchanged to ``getData``.
        direct: Directory in which the temporary files are created.
        num: Number of components.
        mph: Forwarded to ``getData`` as its ``mph`` keyword.
        delet: When true, the ``tempFile_<i>*`` files of a component are
            removed as soon as that component has been processed.

    Returns:
        list: One ``varRes`` result per component, in component order.
    """
    import glob  # local import: used only by the cleanup step

    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count

    # Shuffle both readers with the same permutation so rows stay matched.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]

    # Cut points of the partition: component i spans div[i] .. div[i + 1] - 1.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

    varEsts = []

    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]

        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())

        varEsts.append(varRes(fileTemp, direct))

        if delet:
            # Remove the files from Python rather than via os.system("rm ..."):
            # the shell command fails on paths with spaces or metacharacters
            # and only exists on POSIX systems.
            for stale in glob.glob(fileTemp + "*"):
                try:
                    os.remove(stale)
                except OSError as err:
                    # Keep cleanup best-effort; report and continue.
                    print("Could not remove " + stale + ": " + str(err))

    return varEsts
Beispiel #4
0
    def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
        """Run ``single_snp_select`` on the synthetic data set with PC covariates.

        Principal components are computed with ``compute_auto_pcs`` and written
        to ``sel_plus_pc.pcs.txt`` in ``self.tempout_dir`` (or reloaded from
        there when the local ``use_cache`` switch is on), then supplied as
        covariates. The resulting frame is passed to ``self.compare_files``.

        Args:
            h2: Forwarded to ``single_snp_select``; the value ``.5`` selects the
                ``h2IsHalf`` result name, anything else ``h2Search``.
            force_low_rank: Forwarded to ``single_snp_select``.
            force_full_rank: Forwarded to ``single_snp_select``.
            count_A1: Forwarded to ``compute_auto_pcs`` only.
                NOTE(review): ``single_snp_select`` always receives
                ``count_A1=False`` -- confirm this is intended.
        """
        do_plot = False
        use_cache = False

        # Input files of the synthetic data set.
        synth_dir = self.pythonpath + "/tests/datasets/synth"
        bed_fn = synth_dir + "/all.bed"
        phen_fn = synth_dir + "/pheno_10_causals.txt"

        pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")
        if use_cache and os.path.exists(pcs_fn):
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
            pc_count = covar["vals"].shape[1]
            logging.info("selected number of PCs: {0}".format(pc_count))
            pcs_as_snpdata = SnpData(iid=covar['iid'],
                                     sid=covar['header'],
                                     val=covar['vals'])
            Pheno.write(pcs_fn, pcs_as_snpdata)

        # Other names seen here: "lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        mf_name = "lmp"
        # The runner is built but not handed to single_snp_select below.
        runner = mf_to_runner_function(mf_name)(20)

        progress_msg = "Working on h2={0},force_low_rank={1},force_full_rank={2}"
        logging.info(progress_msg.format(h2, force_low_rank, force_full_rank))

        suffix = "h2IsHalf" if h2 == .5 else "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(suffix)
        output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

        # 0..10 in steps of one, then a coarser grid up to 1000.
        k_grid = list(range(0, 11)) + [
            20, 30, 40, 50, 60, 70, 80, 90, 100, 125,
            160, 200, 250, 320, 400, 500, 630, 800, 1000,
        ]
        folds_fn = self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt"

        results = single_snp_select(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=k_grid,
            h2=h2,
            n_folds=folds_fn,
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False,
        )
        logging.info(results.head())
        self.compare_files(results, result_file_name)
Beispiel #5
0
    def test_c_reader_pheno(self):
        """Check that ``Pheno`` reads, writes and re-reads phenotype data consistently.

        Covers a write/read round trip with an injected missing value, loading
        through ``pstpheno.loadOnePhen`` (default and ``vectorize=True``), the
        ``Pheno(None, iid_if_none=...)`` form, and the ``toydata.id.phe`` /
        ``toydata.fid.phe`` example files, each compared with ``toydata.phe``.
        """
        examples_dir = self.currentFolder + "/examples"
        toy_phe = examples_dir + "/toydata.phe"

        snpdata1 = Pheno(toy_phe).read()
        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
        snpdata1.val[1, 0] = np.nan
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)  # only checks that str() on the reader does not raise
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        snpdata1 = Pheno(toy_phe).read()
        import pysnptools.util.pheno as pstpheno

        # Avoid the name "dict" so the builtin type stays usable in this scope.
        loaded = pstpheno.loadOnePhen(toy_phe, missing="")
        snpdata3 = Pheno(loaded).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        loaded = pstpheno.loadOnePhen(toy_phe, missing="", vectorize=True)
        # unittest assertions still run under "python -O", unlike bare asserts.
        self.assertEqual(len(loaded['vals'].shape), 1, "test 1-d array of values")
        snpdata3 = Pheno(loaded).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        self.assertTrue((snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0)

        snpdata5 = Pheno(examples_dir + "/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
        snpdata6 = Pheno(examples_dir + "/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
    def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
        """Select SNPs with ``single_snp_select`` using auto-computed PCs as covariates.

        The PCs come from ``compute_auto_pcs`` and are saved as
        ``sel_plus_pc.pcs.txt`` under ``self.tempout_dir``; with the local
        ``use_cache`` flag enabled an existing file is loaded instead. The
        result frame is logged and checked through ``self.compare_files``.

        Args:
            h2: Forwarded to ``single_snp_select``; ``.5`` yields the result
                name ``sel_plus_pc_h2IsHalf``, otherwise ``sel_plus_pc_h2Search``.
            force_low_rank: Forwarded to ``single_snp_select``.
            force_full_rank: Forwarded to ``single_snp_select``.
            count_A1: Used for ``compute_auto_pcs`` only.
                NOTE(review): the ``single_snp_select`` call hard-codes
                ``count_A1=False`` -- verify that this is deliberate.
        """
        do_plot = False
        use_cache = False

        # define file names
        data_root = self.pythonpath + "/tests/datasets/synth"
        bed_fn = data_root + "/all.bed"
        phen_fn = data_root + "/pheno_10_causals.txt"
        pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

        cache_hit = use_cache and os.path.exists(pcs_fn)
        if cache_hit:
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
            logging.info("selected number of PCs: {0}".format(covar["vals"].shape[1]))
            Pheno.write(
                pcs_fn,
                SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals']),
            )

        mf_name = "lmp"  # "lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        # Constructed here but not forwarded to single_snp_select.
        runner = mf_to_runner_function(mf_name)(20)

        logging.info(
            "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
                h2, force_low_rank, force_full_rank
            )
        )

        if h2 == .5:
            h2_tag = "h2IsHalf"
        else:
            h2_tag = "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(h2_tag)
        output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

        select_kwargs = dict(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                    20, 30, 40, 50, 60, 70, 80, 90, 100,
                    125, 160, 200, 250, 320, 400, 500, 630, 800, 1000],
            h2=h2,
            n_folds=self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt",
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False,
        )
        results = single_snp_select(**select_kwargs)

        logging.info(results.head())
        self.compare_files(results, result_file_name)
Beispiel #7
0
# Example: reading phenotype data with Pheno and writing SNP data in several formats.
# Bed, SnpData and numpy are imported here because the code below uses them.
import numpy as np
from pysnptools.snpreader import Bed, Pheno, SnpData

phenoreader = Pheno("pheno_10_causals.txt")
# A single formatted string prints the same text on Python 2 and Python 3.
print("{0} {1} {2} {3} {4}".format(phenoreader, phenoreader.iid_count,
                                   phenoreader.sid_count, phenoreader.sid,
                                   phenoreader.pos))
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
phenodata = phenoreader.read()
print(phenodata.val)
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write the first 10 iids and sids of the Bed data in Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it in Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a SnpData on the fly and write it to Bed
snpdata1 = SnpData(iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
                   sid=['snp1', 'snp2'],
                   val=[[0, 1], [2, 1], [1, np.nan]])
Bed.write("deleteme1.bed", snpdata1)

# Pheno is slow because it is a text format. Bed format can only hold 0, 1, 2 and missing.
# Use SnpNpz for the fastest read/write times and the smallest file size
from pysnptools.snpreader import SnpNpz

SnpNpz.write("deleteme1010.snp.npz", snpdata1010)
# Example: read a phenotype file with Pheno, then write SNP data as Pheno, Bed and SnpNpz.
# The script uses Bed, SnpData and np, so they are imported alongside Pheno.
import numpy as np
from pysnptools.snpreader import Bed, Pheno, SnpData

phenoreader = Pheno("pheno_10_causals.txt")
# Formatting into one string keeps the output identical on Python 2 and Python 3.
reader_summary = "{0} {1} {2} {3} {4}".format(
    phenoreader,
    phenoreader.iid_count,
    phenoreader.sid_count,
    phenoreader.sid,
    phenoreader.pos,
)
print(reader_summary)
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
phenodata = phenoreader.read()
print(phenodata.val)
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write the first 10 iids and sids of the Bed data in Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it in Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a SnpData on the fly and write it to Bed
snpdata1 = SnpData(
    iid=[['f1', 'c1'], ['f1', 'c2'], ['f2', 'c1']],
    sid=['snp1', 'snp2'],
    val=[[0, 1], [2, 1], [1, np.nan]],
)
Bed.write("deleteme1.bed", snpdata1)


# Pheno is slow because it is a text format. Bed format can only hold 0, 1, 2 and missing.
# Use SnpNpz for the fastest read/write times and the smallest file size
from pysnptools.snpreader import SnpNpz
SnpNpz.write("deleteme1010.snp.npz", snpdata1010)

# Use SnpHdf5 for random-access reads, good speed and size, and compatibility outside Python