Esempio n. 1
0
    def test_some_std(self):
        """Smoke-test Unit-standardized kernels and the standardizers' string forms.

        Verifies that the kernel computed from the in-memory ``self.snpdata``
        is reproducible and matches the one computed from the toydata Bed
        file when read in blocks of 500, both to 10 decimals. Also exercises
        ``str()`` on a ``SnpData`` copy (before and after standardizing) and
        on every standardizer.
        """
        k0 = self.snpdata.read_kernel(standardizer=Unit()).val
        from pysnptools.kernelreader import SnpKernel
        k1 = self.snpdata.read_kernel(standardizer=Unit())
        np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

        from pysnptools.snpreader import SnpData
        snpdata2 = SnpData(iid=self.snpdata.iid,sid=self.snpdata.sid,pos=self.snpdata.pos,val=np.array(self.snpdata.val))
        s = str(snpdata2)  # str() must work before ...
        snpdata2.standardize()
        s = str(snpdata2)  # ... and after in-place standardization

        snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
        k2 = snpreader.read_kernel(standardizer=Unit(),block_size=500).val
        np.testing.assert_array_almost_equal(k0, k2, decimal=10)

        from pysnptools.standardizer.identity import Identity
        from pysnptools.standardizer.diag_K_to_N import DiagKtoN
        # Use NumPy's dtypes directly: the ``sp.float64``/``sp.float32``
        # aliases in SciPy's root namespace are deprecated.
        for dtype in [np.float64,np.float32]:
            for std in [Unit(),Beta(1,25),Identity(),DiagKtoN()]:
                s = str(std)
                np.random.seed(0)
                x = np.array(np.random.randint(3,size=[60,100]),dtype=dtype)
                x2 = x[:,::2]
                x2b = np.array(x2)
                #LATER what's this about? It doesn't do non-contiguous?
                #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
                #a,b = std.standardize(x2b),std.standardize(x2)
                #np.testing.assert_array_almost_equal(a,b)
        logging.info("done")
Esempio n. 2
0
def divideData(filename,direct,num=5,mph=3,delet=True):
    """Estimate a variance component on each of ``num`` random partitions of the data.

    The individuals returned by ``getData`` are shuffled with ``perm`` and
    cut into ``num`` contiguous, near-equal slices. Each slice is written to
    ``direct`` as a temporary Bed/phenotype pair and handed to ``varRes``.

    Parameters:
        filename: dataset path forwarded to ``getData``.
        direct: directory that receives the temporary files.
        num: number of partitions.
        mph: forwarded to ``getData`` as ``mph``.
        delet: when true, remove each partition's temporary files after use.

    Returns:
        list with one ``varRes`` result per partition.
    """
    import glob  # local import: only needed for the temp-file cleanup

    print("Estimating heritability using "+str(num)+" components")
    [yFil,sFil]=getData(filename,mph=mph)
    n=sFil.iid_count
    reOrd=perm(n)
    yFil=yFil[reOrd,:]
    sFil=sFil[reOrd,:]

    # Partition boundaries: div[i]..div[i+1] is the i-th slice.
    div=[int(math.ceil( i*n/float(num) )) for i in range(0,num+1)]

    varEsts=[]

    for i in range(0,num):
        print("For component "+str(i))
        sFilTemp=sFil[div[i]:div[i+1],:]
        yFilTemp=yFil[div[i]:div[i+1],:]

        fileTemp=direct+"/tempFile_"+str(i)
        Bed.write(fileTemp,sFilTemp.read())
        Pheno.write(fileTemp+".phen",yFilTemp.read())

        varEsts.append(varRes(fileTemp,direct))

        if delet:
            # Delete every file sharing this component's prefix without going
            # through a shell (paths with spaces or metacharacters are safe,
            # and it works where no `rm` binary exists).
            for tempPath in glob.glob(fileTemp+"*"):
                try:
                    os.remove(tempPath)
                except OSError:
                    # Best-effort cleanup, like the `rm` call it replaces.
                    pass

    return varEsts
    def test_match_cpp(self):
        r'''
        Check that single_snp p-values match the reference output of the C++ tool.

        match
            FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

        Each (G0, G1) arrangement is run once with h2=.5 and once with
        log_delta=0; every resulting p-value must be within a relative
        difference of .035 of the reference value.
        '''
        logging.info("TestSingleSnp test_match_cpp")
        snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
        pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
        sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
        test_idx = snps.sid_to_index(test_sid)

        # The reference table does not depend on the loop variables, so it is
        # loaded once instead of once per frame.
        referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
        reference = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file

        # The SNP-based similarity is tried in both kernel slots; the other slot gets an identity kernel.
        for G0,G1 in [(snps[:,sim_idx],KernelIdentity(snps.iid)),(KernelIdentity(snps.iid),snps[:,sim_idx])]:
            frame_h2 = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,h2=.5,leave_out_one_chrom=False,count_A1=False)
            frame_log_delta = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,log_delta=0,leave_out_one_chrom=False,count_A1=False)
            for frame in [frame_h2, frame_log_delta]:
                assert len(frame) == len(reference)
                for _, row in reference.iterrows():
                    sid = row.SNP
                    pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                    reldiff = abs(row.Pvalue - pvalue)/row.Pvalue
                    assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(sid,None,row.Pvalue,pvalue,reldiff)
Esempio n. 4
0
    def setUpClass(self):
        """Load the mouse dataset once and expose it as read-only arrays.

        Sets ``self.G`` (Unit-standardized genotypes), ``self.y`` (first
        phenotype column) and ``self.G_cov`` (a single all-ones column;
        no covariate file is loaded).
        """
        here = os.path.dirname(os.path.realpath(__file__))
        self.snp_fn = here + "/../../tests/datasets/mouse/alldata"
        self.pheno_fn = here + "/../../tests/datasets/mouse/pheno_10_causals.txt"

        # Read both sources, then restrict them to the sample ids they share.
        bed_source = Bed(self.snp_fn)
        pheno_source = pstpheno.loadOnePhen(self.pheno_fn)
        snp_reader, pheno = pysnptools.util.intersect_apply([bed_source, pheno_source])

        self.G = snp_reader.read(order='C').val
        self.G = stdizer.Unit().standardize(self.G)
        self.y = pheno['vals'][:,0]
        self.G_cov = np.ones((len(self.y), 1))

        # Freeze everything so individual tests cannot corrupt the shared fixtures.
        self.G.flags.writeable = False
        self.y.flags.writeable = False
        self.G_cov.flags.writeable = False
Esempio n. 5
0
 def __init__(self,args):
     """Select the SNPs to analyse from a Bed file and compute their scores.

     SNPs are kept when their allele frequency lies strictly between
     ``args.maf`` and ``1 - args.maf``. If both ``args.from_bp`` and
     ``args.to_bp`` are given, their position (third ``pos`` column) must
     also lie strictly between the two. If ``args.extract`` is given, they
     must additionally be listed in that file (one SNP id per line).

     ``args.afile``, when given, is read as a whitespace-separated table with
     columns ``id1``, ``id2``, ``theta`` and passed on to ``self.compute``.

     Raises:
         ValueError: if ``args.window_type`` is neither 'BP' nor 'SNP'.
     """
     if args.window_type not in ['BP','SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile)
     af1 = self.get_allele_frequency(bed_1,args)
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1>args.maf)&(af1<1-args.maf)
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
         snps_1 = snps_1&k
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         # Close the extract file deterministically instead of leaking the handle.
         with open(args.extract,'r') as extract_file:
             keep = np.array([l.strip() for l in extract_file])
         snps_to_use = np.intersect1d(snps_to_use,keep)
         print(len(snps_to_use),"SNPs remaining after extraction")
     # Sorted so that the selected SNPs stay in file order.
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
     pos = bed_1.pos[bed_1_index]
     bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                         names=['chm','id','pos_mb','pos_bp','a1','a2'])
     af = af1[bed_1_index]
     if args.afile is not None:
         # Raw string: same pattern as before, without the invalid "\s" escape.
         a1 =  pd.read_table(args.afile,header=None,sep=r'\s*',
                             names=['id1','id2','theta'])
     else:
         a1 = None
     self.af = af
     self.M = len(bed_1_index)
     self.windows = self.get_windows(pos,args)
     self.chr = pos[:,0]
     self.pos = pos[:,2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1,bed_1_index,af,a1,args)
Esempio n. 6
0
 def setUpClass(self):
     """Read the toydata genotypes into memory for the tests in this class.

     Stores the folder of this file, the phenotype file name, the loaded
     ``SnpData`` (Fortran order, pure-Python reader) and its value array.
     """
     self.currentFolder = os.path.dirname(os.path.realpath(__file__))
     #TODO: get data set with NANs!
     toy_reader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
     self.pheno_fn = self.currentFolder + "/examples/toydata.phe"
     loaded = toy_reader.read(order='F',force_python_only=True)
     self.snpdata = loaded
     self.snps = loaded.val
Esempio n. 7
0
 def read_plink(self, fn_plink = None):
     """
     Read a PLINK Bed dataset and copy its contents onto this object.

     ``GT`` receives the value matrix, ``POS`` the first two columns of the
     position table, ``SID`` the second column of the iid table, and
     ``isNormalised`` is reset to False.
     """
     loaded = Bed(fn_plink).read()
     self.GT = loaded.val
     self.POS = loaded.pos[:,[0,1]]
     self.SID = loaded.iid[:,1]
     self.isNormalised = False
Esempio n. 8
0
    def factory(snpreader, num_snps_in_memory, standardizer, blocksize):
        """Build an ``InMemory`` or ``FromDisk`` wrapper around ``snpreader``.

        A string ``snpreader`` is opened as a Bed file. When
        ``num_snps_in_memory`` covers every SNP, the data is read,
        standardized, made read-only and wrapped in ``InMemory``; otherwise a
        ``FromDisk`` instance is returned.
        """
        if isinstance(snpreader, str):
            snpreader = Bed(snpreader)

        # Not everything fits in the allowed budget: defer to the disk-backed reader.
        if not (num_snps_in_memory >= snpreader.sid_count):
            return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)

        standardized = snpreader.read(order='C').standardize(standardizer)
        in_memory = InMemory(standardized, standardizer, blocksize)
        in_memory._snpreader.val.flags.writeable = False
        in_memory._val = in_memory._snpreader.val
        return in_memory
Esempio n. 9
0
 def test_write_x_x_cpp(self):
     """Round-trip toydata through Bed.write for each memory order and float width."""
     source = Bed(self.currentFolder + "/examples/toydata")
     combos = [(order, dtype) for order in ['C','F'] for dtype in [np.float32,np.float64]]
     for order, dtype in combos:
         original = source.read(order=order,dtype=dtype)
         original.val[-1,0] = float("NAN")  # plant one missing value
         width = "32" if dtype==np.float32 else "64"
         output = "tempdir/toydata.{0}{1}.cpp".format(order,width)
         create_directory_if_necessary(output)
         Bed.write(original, output)
         reloaded = Bed(output).read()
         assert TestLoader.is_same(original, reloaded) #!!!define an equality method on snpdata?
Esempio n. 10
0
def process_data(input_path, output_path, name):
    """Preprocess one Bed dataset and store the result in an HDF5 file.

    Reads ``<input_path>/<name>``, runs ``pysnp_genpreproc`` on the values,
    asserts that the result contains no NaN, and saves it under the key
    ``name`` in ``<output_path>/<name>.h5py``.

    Returns:
        dict with the subject/SNP counts and ids plus the location
        (path and key) of the stored, preprocessed data.
    """
    snp_data = Bed(os.path.join(input_path, name)).read()
    preproc_vals = pysnp_genpreproc(snp_data.val)
    assert not np.any(np.isnan(preproc_vals))

    target = os.path.join(output_path, name + ".h5py")
    path, keys = h5_save(path=target, data_obj={name:preproc_vals}, dt='f')

    summary = {}
    summary['n_subjects'] = snp_data.iid_count
    summary['subject_ids'] = snp_data.iid
    summary['n_snps'] = snp_data.sid_count
    summary['snp_ids'] = snp_data.sid
    summary['data_preprocessed_location'] = {'path':path, 'key':keys}
    return summary
Esempio n. 11
0
 def test_write_x_x_cpp(self):
     """Write toydata with each order/dtype combination and compare the re-read values."""
     toy_reader = Bed(self.currentFolder + "/examples/toydata")
     for layout in ['C','F']:
         for float_type in [np.float32,np.float64]:
             written = toy_reader.read(order=layout,dtype=float_type)
             # Include a missing value so NaN handling is part of the round trip.
             written.val[-1,0] = float("NAN")
             bits = "32" if float_type==np.float32 else "64"
             output = "tempdir/toydata.{0}{1}.cpp".format(layout,bits)
             create_directory_if_necessary(output)
             Bed.write(output, written)
             read_back = Bed(output).read()
             np.testing.assert_array_almost_equal(written.val, read_back.val, decimal=10)
Esempio n. 12
0
 def test_subset_view(self):
     """Check when ``read`` copies the data and when ``view_ok=True`` shares it.

     A plain ``read()`` of in-memory data must return values that do not
     share memory with the source, while ``read(view_ok=True)`` must
     return values that do.
     """
     # ``may_share_memory`` is a NumPy function; the alias in SciPy's root
     # namespace is deprecated, so call NumPy directly.
     import numpy as np

     snpreader2 = Bed(self.currentFolder + "/examples/toydata",count_A1=False)[:,:]
     result = snpreader2.read(view_ok=True)
     self.assertFalse(snpreader2 is result)
     result2 = result[:,:].read()
     self.assertFalse(np.may_share_memory(result2.val,result.val))
     result3 = result[:,:].read(view_ok=True)
     self.assertTrue(np.may_share_memory(result3.val,result.val))
     result4 = result3.read()
     self.assertFalse(np.may_share_memory(result4.val,result3.val))
     result5 = result4.read(view_ok=True)
     self.assertTrue(np.may_share_memory(result4.val,result5.val))
Esempio n. 13
0
 def test_npz(self):
     """Write a Unit-standardized kernel to .npz and confirm it reads back unchanged."""
     logging.info("in test_npz")
     toy_reader = Bed(self.currentFolder + "/../examples/toydata",count_A1=False)
     original = toy_reader.read_kernel(standardizer=stdizer.Unit())
     s = str(original)  # the string form must be computable
     output = "tempdir/kernelreader/toydata.kernel.npz"
     create_directory_if_necessary(output)
     KernelNpz.write(output,original)
     reloaded = KernelNpz(output).read()
     np.testing.assert_array_almost_equal(original.val, reloaded.val, decimal=10)
     logging.info("done with test")
Esempio n. 14
0
def main(args):
    """Compute, for every seed SNP, an IBS segment length and an IBD flag.

    Reads the seed SNP list (``args.seed_snps``), the typed SNP list
    (``args.typed_snps``) and the genotypes (``args.bfile``), fills the
    ``ibs_length`` and ``ibd`` columns via ``analyze_snp``, prints the first
    100 rows and writes the table tab-separated to ``args.outfile``.
    """
    print('reading seeed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices,2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        """Return ``(length, ibd_flag)`` for the SNP at column ``i``.

        The length is a difference of column 1 of ``col_property`` between
        the segment boundaries; the flag is 1 when more than 99% of the
        genotypes inside the segment agree between the two carriers.
        """
        # find first typed snp after query snp
        snp_bp = data.col_property[i,2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices)-1

        # NOTE(review): this unpacking requires exactly two individuals with
        # genotype 1 at SNP i; any other count raises ValueError.
        n1, n2 = np.where(X[:,i] == 1)[0]
        # Genotypes differing by 2 at the starting typed SNP: no shared segment.
        if (X[n1,typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(
                X[n1,typed_snps_indices],
                X[n2,typed_snps_indices],
                typed_i)
        typed_ir -= 1

        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - \
                data.col_property[il, 1]
        ibd = (np.mean(X[n1,il:ir] == X[n2,il:ir]) > 0.99)
        return cM, int(ibd)

    # ``zip`` replaces the Python-2-only ``itertools.izip`` and ``.loc``
    # replaces ``.ix``, which pandas removed in 1.0; both keep the same
    # label-based semantics.
    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
Esempio n. 15
0
    def test_subset(self):
        """Check that subsetting a SnpKernel matches subsetting the in-memory kernel."""
        logging.info("in test_subset")
        toy_reader = Bed(self.currentFolder + "/../examples/toydata",count_A1=False)
        lazy_kernel = SnpKernel(toy_reader,stdizer.Unit())

        # Two-index form: every other individual on both axes.
        from_two_index = lazy_kernel[::2,::2].read()
        expected = toy_reader.read_kernel(stdizer.Unit())[::2].read()
        np.testing.assert_array_almost_equal(from_two_index.val, expected.val, decimal=10)

        # Single-index form must give the same result.
        from_one_index = lazy_kernel[::2].read()
        np.testing.assert_array_almost_equal(from_one_index.val, expected.val, decimal=10)
        logging.info("done with test")
Esempio n. 16
0
 def too_slow_test_write_bedbig(self):
     """Round-trip a 100000 x 50000 all-zero dataset through Bed.write.

     Not collected as a test (the name lacks the ``test`` prefix) because it
     is too slow to run routinely.
     """
     iid_count = 100000
     sid_count = 50000
     from pysnptools.snpreader.snpdata import SnpData #!!! promote on level up innamespace
     # ``range`` works on both Python 2 and 3 (``xrange`` is Python 2 only).
     iid = np.array([[str(i),str(i)] for i in range(iid_count)])
     sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
     pos = np.array([[i,i,i] for i in range(sid_count)])
     # Call the seeding function; assigning to it would replace
     # ``np.random.seed`` with the integer 0 for the whole process.
     np.random.seed(0)
     snpdata = SnpData(iid,sid,pos,np.zeros((iid_count,sid_count))) #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
     create_directory_if_necessary(output)
     Bed.write(snpdata, output)
     snpdata2 = Bed(output).read()
     assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
Esempio n. 17
0
 def too_slow_test_write_bedbig(self):
     """Write a 100000 x 50000 all-zero dataset to Bed format and compare the re-read values."""
     from pysnptools.snpreader import SnpData
     iid_count = 100000
     sid_count = 50000
     individual_labels = [str(i) for i in range(iid_count)]
     iid = np.array([[label,label] for label in individual_labels])
     sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
     pos = np.array([[i]*3 for i in range(sid_count)])
     np.random.seed(0)
     zeros = np.zeros((iid_count,sid_count)) #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     snpdata = SnpData(iid,sid,zeros,pos=pos)
     output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
     create_directory_if_necessary(output)
     Bed.write(output, snpdata, count_A1=False)
     reloaded = Bed(output,count_A1=False).read()
     np.testing.assert_array_almost_equal(snpdata.val, reloaded.val, decimal=10)
Esempio n. 18
0
def main():
    """
    example that compares output to fastlmmc

    Runs ``WindowingGwas`` on the 5-chromosome toy data with a fixed delta,
    runs the reference ``GwasTest`` on the same files, plots the log
    p-values of both against each other, asserts that they agree to 3
    decimals, and finally draws a Manhattan plot.
    """


    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5
    
    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)    

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])
    
    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:,0]
    # Freeze the phenotype like the genotypes; the bare attribute access
    # that used to stand here had no effect.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    # Diagonal reference line: points on it mean identical p-values.
    pylab.plot(np.arange(-18, 0), np.arange(-18,0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)
    
    simple_manhattan_plot(pv)
    def test_SNC(self):
        """Run single_snp on data containing a constant SNP and compare with the stored result."""
        logging.info("TestSNC")
        pheno = pstpheno.loadOnePhen(self.phen_fn,vectorize=True)
        covar = pstpheno.loadPhen(self.cov_fn)
        snc = Bed(self.bedbase, count_A1=False).read()
        snc.val[:,2] = 0 # make SNP #2 have constant values (aka a SNC)

        # The first ten SNPs are tested against a similarity built from all SNPs.
        frame = single_snp(
            test_snps=snc[:,:10],
            pheno=pheno,
            G0=snc,
            mixing=0,
            leave_out_one_chrom=False,
            covar=covar,
            output_file_name=self.file_name("snc"),
            count_A1=False,
        )
        self.compare_files(frame,"snc")
Esempio n. 20
0
 def test_write_bed_f64cpp_5_python(self):
     """Write the first five individuals with the pure-Python Bed writer and re-read them."""
     toy_reader = Bed(self.currentFolder + "/examples/toydata")
     iid_index = 5
     logging.info("iid={0}".format(iid_index))
     # An individual count that is not a multiple of 4 makes a better test;
     # the check that used to enforce this is disabled.
     first_rows = toy_reader[0:iid_index,:]
     written = first_rows.read(order='F',dtype=np.float64)
     if written.iid_count > 0:
         written.val[-1,0] = float("NAN")  # plant one missing value
     output = "tempdir/toydata.F64python.{0}".format(iid_index)
     create_directory_if_necessary(output)
     Bed.write(written, output,force_python_only=True)
     read_back = Bed(output).read()
     assert TestLoader.is_same(written, read_back) #!!!define an equality method on snpdata?
Esempio n. 21
0
 def test_write_bed_f64cpp_5_python(self):
     """Round-trip the first five toydata individuals through the pure-Python Bed writer."""
     source = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
     iid_index = 5
     logging.info("iid={0}".format(iid_index))
     # An individual count divisible by 4 would be a weaker test; the guard
     # that used to enforce this is disabled.
     subset = source[0:iid_index,:].read(order='F',dtype=np.float64)
     has_rows = subset.iid_count > 0
     if has_rows:
         subset.val[-1,0] = float("NAN")
     output = "tempdir/toydata.F64python.{0}".format(iid_index)
     create_directory_if_necessary(output)
     Bed.write(output,subset, force_python_only=True)
     reread = Bed(output,count_A1=False).read()
     np.testing.assert_array_almost_equal(subset.val, reread.val, decimal=10)
    def setUpClass(self):
        """Create the temp output directory and open the synthetic dataset readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)

        # Three levels above the folder that holds this file.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(os.path.join(this_dir,"..","..",".."))

        synth_prefix = self.pythonpath + "/tests/datasets/synth/"
        self.snpreader_whole = Bed(synth_prefix + "all",count_A1=False)
        self.covariate_whole = Pheno(synth_prefix + "cov.txt")
        self.pheno_whole = Pheno(synth_prefix + "pheno_10_causals.txt")
Esempio n. 23
0
def genPheno(filename="../thinFam",
             per=.5,
             savename="fakePheno.txt",
             c=2.0,
             num=5):
    """Simulate a binary phenotype driven by ``num`` randomly chosen SNPs.

    For each individual the value is 1.0 with probability
    ``p0 * c ** (sum of that individual's genotypes at the chosen SNPs)``
    and 0.0 otherwise, where ``p0`` is scaled so that these probabilities
    sum to ``n * per`` over the ``n`` individuals. Genotypes other than
    0, 1 or 2 are treated as 0. Intermediate values are printed.

    Parameters:
        filename: Bed file prefix to read genotypes from.
        per: target fraction of individuals with value 1.0.
        savename: output file, one value per line; an empty string means
            "do not write, return the list instead".
        c: multiplicative effect per allele count.
        num: number of SNPs to draw (with replacement).

    Returns:
        The list of values when ``savename`` is empty, otherwise None.
    """
    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    print(m)
    print(n)
    I = [rand.randint(0, m - 1) for i in range(0, num)]
    SNP = [[D[j][i] for j in range(0, n)] for i in I]
    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)

    # Clamp negatives to 0, then zero anything that is not a valid count.
    SNP = [[max(i, 0.0) for i in s] for s in SNP]
    for i in range(0, num):
        for j in range(0, n):
            if not SNP[i][j] in [1.0, 0.0, 2.0]:
                SNP[i][j] = 0.0
    print([list(set(s)) for s in SNP])

    # Per-individual relative risk, computed once and reused below.
    weights = [c**(sum([SNP[j][i] for j in range(0, num)])) for i in range(0, n)]
    total = sum(weights)
    print(total)
    p0 = n * per / total
    print(p0)
    y = [float(rand.uniform(0, 1) < p0 * w) for w in weights]
    if len(savename) == 0:
        return y
    # The context manager closes the file even if a write fails.
    with open(savename, "w") as fil:
        for i in y:
            fil.write(str(i) + "\n")
Esempio n. 24
0
def load_plink_bed_bim_fam_dataset(path_dataset, snp_ids=None,
                                   subject_ids=None, count_A1=True):
    """
    Read a Plink bed/bim/fam dataset into memory, optionally restricted to
    selected snps and/or subjects so that only the needed genotypes are
    loaded.

    Parameters
    ----------
    path_dataset: str
        Path to the Plink bed/bim/fam dataset, with or without .bed extension.
    snp_ids: list/set of str, default None
        Snps to keep, when present in the dataset. None keeps all snps.
    subject_ids: list of str, default None
        Subjects to keep, when present in the dataset; they are matched
        against the second column of the iid table. None keeps all subjects.
    count_A1: bool, default True
        Genotypes are provided as allele counts, A1 if True else A2.

    Return
    ------
    snp_data: pysnptools object
        PLINK data loaded by the 'pysnptools' library.
    """

    # Opening the reader does not load the genotypes yet.
    reader = Bed(path_dataset, count_A1=count_A1)

    # Column filter: one boolean per snp of the dataset.
    if snp_ids is not None:
        wanted_snps = set(snp_ids)
        keep_columns = [(sid in wanted_snps) for sid in reader.sid]
        reader = reader[:, keep_columns]

    # Row filter: one boolean per subject of the dataset.
    if subject_ids is not None:
        wanted_subjects = set(subject_ids)
        keep_rows = [(iid in wanted_subjects) for iid in reader.iid[:, 1]]
        reader = reader[keep_rows, :]

    # Only now are the selected genotypes read.
    return reader.read()
Esempio n. 25
0
 def gen_and_compare(self, output_file, **kwargs):
     """Generate SNP data with ``snp_gen`` and assert it matches the stored reference.

     ``kwargs`` are forwarded to ``snp_gen``; the reference is read from
     ``expected/<output_file>``. Returns the generated data.
     """
     generated = snp_gen(**kwargs)
     #pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
     #Bed.write(gen_snpdata, self.currentFolder + "/tempdir/" + output_file)  #comment out
     reference_path = self.currentFolder + "/expected/" + output_file
     reference = Bed(reference_path).read()
     matches = TestSnpGen.is_same(generated, reference)
     assert matches, "Failure on " + output_file
     return generated
Esempio n. 26
0
 def test_load_and_standardize_hdf5(self):
     """Compare the snp-major HDF5 file with the iid-major one and with the Bed file."""
     examples = self.currentFolder + "/examples/"
     snp_major = SnpHdf5(examples + "toydata.snpmajor.snp.hdf5")
     iid_major = SnpHdf5(examples + "toydata.iidmajor.snp.hdf5")
     self.load_and_standardize(snp_major, iid_major)
     bed_reference = Bed(examples + "toydata", count_A1=False)
     self.load_and_standardize(snp_major, bed_reference)
Esempio n. 27
0
 def too_slow_test_write_bedbig(self):
     """Round-trip a large all-zero dataset (100000 x 50000) through Bed.write."""
     from pysnptools.snpreader import SnpData
     iid_count, sid_count = 100000, 50000

     iid = np.array([[str(n), str(n)] for n in range(iid_count)])
     sid = np.array(["sid_{0}".format(n) for n in range(sid_count)])
     pos = np.array([[n, n, n] for n in range(sid_count)])
     np.random.seed(0)
     values = np.zeros((iid_count, sid_count))  #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
     snpdata = SnpData(iid, sid, values, pos=pos)

     output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
     create_directory_if_necessary(output)
     Bed.write(output, snpdata, count_A1=False)

     read_back = Bed(output, count_A1=False).read()
     np.testing.assert_array_almost_equal(snpdata.val, read_back.val, decimal=10)
Esempio n. 28
0
    def test_subset(self):
        """Subsetting a lazy SnpKernel must agree with subsetting the computed kernel."""
        logging.info("in test_subset")
        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        snpreader = Bed(bed_path, count_A1=False)
        snpkernel = SnpKernel(snpreader, stdizer.Unit())

        two_axis = snpkernel[::2, ::2].read()
        expected = snpreader.read_kernel(stdizer.Unit())[::2].read()
        one_axis = snpkernel[::2].read()

        # Both indexing forms select every other individual.
        for actual in (two_axis, one_axis):
            np.testing.assert_array_almost_equal(actual.val, expected.val, decimal=10)
        logging.info("done with test")
Esempio n. 29
0
    def _run_once(self):
        """Open the underlying .bed file through the storage layer, once.

        On the first call this touches ``self.row`` and ``self.col``, opens
        the .bed file for reading, wraps it in a local ``Bed`` reader that
        reuses the already-known row/column information, and records the
        open handle in ``self._file_dict``. Later calls return immediately.
        """
        if self._ran_once:
            return
        self._ran_once = True

        # Accessed only for their side effect of loading the metadata.
        self.row  # get row info
        self.col  # get col info

        bed_name = SnpReader._name_of_other_file(self.path, remove_suffix="bed", add_suffix="bed")
        handle = self._storage.open_read(bed_name)
        # The context is entered by hand and kept in ``_file_dict``; it is
        # not exited here.
        local_path = handle.__enter__()
        self.local = Bed(local_path, count_A1=True, iid=self.row, sid=self.col,
                         pos=self.col_property, skip_format_check=True)
        self._file_dict["bed"] = handle
Esempio n. 30
0
 def test_write_bed_f64cpp_5_python(self):
     """Pure-Python Bed writer: the first five individuals must survive a write/read cycle."""
     snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
     iid_index = 5
     logging.info("iid={0}".format(iid_index))
     # A count divisible by 4 is a weak test; the assertion that used to
     # guard against it is disabled.
     head = snpreader[0:iid_index, :]
     expected = head.read(order='F', dtype=np.float64)
     if expected.iid_count > 0:
         # Make sure a missing value is part of what gets written.
         expected.val[-1, 0] = float("NAN")
     output = "tempdir/toydata.F64python.{0}".format(iid_index)
     create_directory_if_necessary(output)
     Bed.write(output, expected, force_python_only=True)
     actual = Bed(output, count_A1=False).read()
     np.testing.assert_array_almost_equal(expected.val, actual.val, decimal=10)
Esempio n. 31
0
 def __init__(self, path, shape, dtype=np.int8, count_A1=True):
     """Wrap a Bed file as an array-like object of shape (variants, samples, 2).

     ``shape`` is ``(n_variants, n_samples)``. Placeholder id/position
     arrays of matching sizes are handed to ``Bed`` so that it does not
     load the bim/map/fam files entirely into memory.
     """
     # n variants (sid = SNP id), n samples (iid = Individual id)
     n_sid, n_iid = shape
     placeholder_iid = np.empty((n_iid, 2), dtype="str")   # (n_sample, 2): FID and IID
     placeholder_sid = np.empty((n_sid, ), dtype="str")    # (n_variants,): SNP ids
     placeholder_pos = np.empty((n_sid, 3), dtype="int")   # (n_variants, 3): contig and positions
     self.bed = Bed(str(path), count_A1=count_A1, iid=placeholder_iid,
                    sid=placeholder_sid, pos=placeholder_pos)
     self.shape = (n_sid, n_iid, 2)
     self.dtype = dtype
     self.ndim = 3
Esempio n. 32
0
    def factory(snpreader,
                num_snps_in_memory,
                standardizer,
                blocksize,
                count_A1=None):
        """Return an ``InMemory`` or ``FromDisk`` wrapper for ``snpreader``.

        A string is opened as a Bed file (with ``count_A1``). If the memory
        budget ``num_snps_in_memory`` covers all SNPs, the data is read,
        standardized and frozen inside an ``InMemory`` object; otherwise
        reading is left to ``FromDisk``.
        """
        reader = snpreader
        if isinstance(reader, str):
            reader = Bed(reader, count_A1=count_A1)

        fits_in_memory = num_snps_in_memory >= reader.sid_count
        if not fits_in_memory:
            return FromDisk(reader, num_snps_in_memory, standardizer, blocksize, None)

        standardized = reader.read(order='C').standardize(standardizer)
        in_memory = InMemory(standardized, standardizer, blocksize)
        # Freeze the values before exposing them through ``_val``.
        in_memory._snpreader.val.flags.writeable = False
        in_memory._val = in_memory._snpreader.val
        return in_memory
Esempio n. 33
0
def getData(filename, mph=3):
    """Open the genotypes and load one phenotype column from the .fam file.

    Parameters:
        filename: Bed file prefix; ``<filename>.fam`` is read as the
            phenotype source.
        mph: index of the column of the loaded .fam values to use. The
            default 3 is the column the original code hard-coded, described
            there as the last .fam column holding the disease state.

    Returns:
        ``[y, sFil]`` where ``y`` is the selected column with 1 subtracted
        from every entry and ``sFil`` is the unread ``Bed`` object.
    """
    sFil = Bed(filename, count_A1=False)
    # Bed object
    yFil = Pheno(filename + ".fam")

    y = yFil.read().val[:, mph]
    # Shift every value down by one.
    y = [i - 1 for i in y]
    return [y, sFil]
 def setUpClass(self):
     """Prepare the temp output directory and the synthetic SNP/phenotype readers."""
     from fastlmm.util.util import create_directory_if_necessary
     create_directory_if_necessary(self.tempout_dir, isfile=False)

     # Three directory levels above this file.
     file_dir = os.path.dirname(os.path.realpath(__file__))
     self.pythonpath = os.path.abspath(os.path.join(file_dir, "..", "..", ".."))

     synth_dir = self.pythonpath + "/tests/datasets/synth/"
     self.snpreader_whole = Bed(synth_dir + "all")
     self.pheno_whole = Pheno(synth_dir + "pheno_10_causals.txt")
Esempio n. 35
0
    def _run_once(self):
        """Resolve file-name inputs into loaded objects and derive defaults, once.

        String inputs are replaced by ``Bed`` readers (SNP sources) or by
        loaded phenotype/covariate data. Missing sid lists default to all
        test SNPs. A column of ones is always appended to the covariates (or
        used alone when there are none). ``_ran_once`` is None while this is
        in progress and True once it has finished.
        """
        if self._ran_once:
            return
        self._ran_once = None

        test_snps_is_path = isinstance(self.test_snps, str)
        if test_snps_is_path:
            self.test_snps = Bed(self.test_snps)

        g0_is_path = isinstance(self.G0, str)
        if g0_is_path:
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True) #!! what about missing=-9?

        # None is never a str, so a single isinstance check covers both conditions.
        if isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar)#!! what about missing=-9?

        if isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        all_test_sids = self.test_snps.sid
        if self.sid_list_0 is None:
            self.sid_list_0 = all_test_sids
        if self.sid_list_1 is None:
            self.sid_list_1 = all_test_sids

        self.set_sid_sets()

        #!!Should fix up to add only of no constant columns - will need to add a test case for this
        ones_column = np.ones((self.test_snps.iid_count, 1))
        if self.covar is not None:
            self.covar = np.hstack((self.covar['vals'],ones_column))
        else:
            self.covar = ones_column
        self.n_cov = self.covar.shape[1]

        working_suffix = ".working"
        if self.output_file_or_none is not None:
            self.__tempdirectory = self.output_file_or_none + working_suffix
        else:
            self.__tempdirectory = working_suffix

        self._ran_once = True
Esempio n. 36
0
    def gen_and_compare(self, output_file, **kwargs):
        """Generate SNP data and assert it equals the stored reference dataset.

        ``kwargs`` go to ``snp_gen``; the reference is the Bed file
        ``output_file`` under ``tests/datasets/generate``. Returns the
        generated data.
        """
        from pysnptools.snpreader import Bed

        generated = snp_gen(**kwargs)
        #pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
        #Bed.write(gen_snpdata, self.currentFolder + "/tempdir/" + output_file)  #comment out
        reference_path = self.currentFolder + "/../../tests/datasets/generate/" + output_file
        reference = Bed(reference_path, count_A1=False).read()
        assert generated == reference, "Failure on " + output_file
        return generated
Esempio n. 37
0
def shuffle_bed(bed_file):
    """
    Shuffle the genotypes of individuals snp-by-snp.

    Every SNP column of the genotype matrix is permuted independently and in
    place with ``np.random.shuffle``; the result is written next to the input
    as ``bed_file + '_shuffle'``.

    :param bed_file: the prefix for plink binary file
    :return: 1 once the shuffled plink binary file has been written,
        0 if pysnptools could not be imported
    """
    try:
        from pysnptools.snpreader import Bed
    except Exception as e:
        # Best effort: report the import problem and signal failure via 0.
        print(e)
        return 0
    # logging.INFO is the integer level constant and is not callable;
    # logging.info() is the function that actually emits a message.
    logging.info('Read the plink file')
    data = Bed(bed_file, count_A1=False).read()
    num_snp = data.val.shape[1]
    logging.info("Start shuffle the genotypes snp-by-snp")
    for i in tqdm(range(num_snp)):
        # data.val[:, i] is a view, so the shuffle mutates data.val itself.
        np.random.shuffle(data.val[:, i])
    logging.info('Write the shuffled plink file')
    Bed.write(bed_file + '_shuffle', data, count_A1=False)
    return 1
Esempio n. 38
0
    def test_SNC(self):
        """Run ``single_snp`` on data with one constant SNP and compare to the "snc" reference."""
        logging.info("TestSNC")
        phenotype = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        covariates = pstpheno.loadPhen(self.cov_fn)

        snc = Bed(self.bedbase, count_A1=False).read()
        # make SNP #2 have constant values (aka a SNC)
        snc.val[:, 2] = 0

        out_name = self.file_name("snc")

        result = single_snp(
            test_snps=snc[:, :10],
            pheno=phenotype,
            G0=snc,
            mixing=0,
            leave_out_one_chrom=False,
            covar=covariates,
            output_file_name=out_name,
            count_A1=False,
        )
        self.compare_files(result, "snc")
Esempio n. 39
0
    def test_underscore_read2(self):
        """Check that slicing an ``Identity`` built from a reader's rows gives the matching sub-identity."""
        logging.info("in test_underscore_read2")
        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        snpreader = Bed(bed_path, count_A1=False)
        # For a SNP reader, ``iid`` must be the very same object as ``row``.
        assert snpreader.iid is snpreader.row

        kid = Identity(snpreader.row)
        every_other = kid[::2, ::2]
        full_identity = np.identity(kid.iid_count)
        expected = full_identity[::2, ::2]
        np.testing.assert_array_almost_equal(every_other.read().val, expected)

        logging.info("done with test")
Esempio n. 40
0
 def gen_reference(self, load_path):
     """Return a genotype reader for ``load_path`` chosen by ``self.gen_type``.

     ".bed" gives a ``Bed`` reader (``count_A1=True``); ".bgen" gives ``Bgen``
     when ``self._snp_tools`` is truthy and ``BgenObject`` otherwise. Any other
     ``gen_type`` raises ``Exception``.
     """
     if self.gen_type == ".bed":
         return Bed(load_path, count_A1=True)
     if self.gen_type != ".bgen":
         raise Exception("Unknown load type set")
     reader_cls = Bgen if self._snp_tools else BgenObject
     return reader_cls(load_path)
Esempio n. 41
0
def sim_zsc(bfile,
            nsample,
            start_chrom,
            end_chrom,
            pheno,
            legend,
            standardize,
            freq,
            nblock=40):
    """Compute per-SNP z-scores of ``pheno`` against genotypes, one chromosome at a time.

    For each chromosome ``i`` in ``[start_chrom, end_chrom]`` the Bed file
    ``'{bfile}{i}.bed'`` is read in blocks produced by
    ``create_block(0, nsnp - 1, nblock)``. Only the first ``nsample``
    individuals and the SNPs whose ``freq`` exceeds 0.01 are used.

    Args:
        bfile: prefix of the per-chromosome Bed files.
        nsample: number of individuals (leading rows) to use.
        start_chrom, end_chrom: inclusive chromosome range.
        pheno: phenotype vector multiplied against the genotype block.
        legend: table with a ``'CHR'`` column; its row count sizes the output.
        standardize: if equal to ``True`` genotypes are unit-standardized and
            the score is ``X.T @ pheno / sqrt(n)``; if equal to ``False`` they
            are mean-centered and divided by ``sqrt(n * var)``.
        freq: per-SNP frequencies, indexed like ``legend``.
        nblock: block count passed to ``create_block``.

    Returns:
        The z-scores (float32) of the SNPs with ``freq > 0.01``.
    """
    zsc_maf_thres = 0.01
    nindv = nsample

    zsc = np.zeros(legend.shape[0], dtype=np.float32)

    # The original code compared against the literals rather than testing
    # truthiness; keep that so a value equal to neither True nor False still
    # takes the same branches as before.
    is_raw = standardize == False  # noqa: E712
    is_std = standardize == True  # noqa: E712

    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(start_chrom, end_chrom + 1):

        snpdata = Bed('{}{}.bed'.format(bfile, i), count_A1=False)
        nsnp = snpdata.sid_count
        blocks = create_block(0, nsnp - 1, nblock)

        snp_idx = np.where(legend['CHR'] == i)[0]
        zsc_chrom = np.zeros(snp_idx.shape[0])

        # Boolean mask over this chromosome's SNPs that pass the MAF threshold.
        freq_chrom = freq[snp_idx]
        mask_chrom = np.zeros(nsnp, dtype=bool)
        mask_chrom[freq_chrom > zsc_maf_thres] = True

        for blk in blocks:
            use_idx = blk[mask_chrom[blk]]

            geno = snpdata[0:nindv, use_idx].read(dtype=np.float32)
            if is_raw:
                geno = geno.val
                geno -= geno.mean(axis=0)
            else:
                geno = geno.standardize(Unit()).val

            if is_std:
                zsc_chrom[use_idx] = np.dot(geno.T, pheno) / np.sqrt(nindv)
            else:
                sigmasq = geno.var(axis=0)
                zsc_chrom[use_idx] = np.dot(geno.T, pheno)
                zsc_chrom[use_idx] /= np.sqrt(nindv * sigmasq)

        zsc[snp_idx] = zsc_chrom

    return zsc[freq > zsc_maf_thres]
Esempio n. 42
0
 def __init__(self, gene, iso, sim):
     """Seed NumPy's RNG with 124 and load standardized genotypes for ``gene``.

     Genotypes are read from the Bed file ``gene + "_AFR.clean"``. ``self.geno``
     holds ``(g - 2f) / sqrt(2f(1 - f))`` where ``f`` is the column sum divided
     by twice the number of individuals.
     """
     np.random.seed(124)
     self.gene, self.num_iso, self.num_sim = gene, iso, sim

     raw = Bed(gene + "_AFR.clean", count_A1=False).read().val
     self.n_ind, self.n_SNP = raw.shape
     print(raw.shape)

     freq = np.sum(raw, axis=0) / (2 * self.n_ind)
     centered = raw - 2 * freq
     self.geno = centered / np.sqrt(2 * freq * (1 - freq))
Esempio n. 43
0
    def test_intersection_Snp2Dist(self):
        """Check the reader types produced by intersecting and slicing a SNP-backed dist reader."""
        from pysnptools.distreader._snp2dist import _Snp2Dist
        from pysnptools.snpreader import Pheno, Bed
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        examples = self.currentFolder + "/../examples/"
        dist = Bed(examples + "toydata.5chrom.bed", count_A1=True).as_dist(max_weight=2)

        # To test intersection we remove a iid from pheno
        pheno = Pheno(examples + "toydata.phe")[1:, :]

        dist_common, pheno = intersect_apply([dist, pheno])
        # The subsetting should be applied to the wrapped snpreader, not to the
        # dist reader itself.
        wraps_snp_subset = isinstance(dist_common.snpreader, _SnpSubset)
        assert wraps_snp_subset and not isinstance(dist_common, _DistSubset)

        # What happens with fancy selection?
        every_other = dist[::2, :]
        assert isinstance(every_other, _Snp2Dist)

        logging.info("Done with test_intersection")
Esempio n. 44
0
def getData(filename="", mph=3, UseCov=False):
    """Load SNPs and phenotypes for the dataset prefix ``filename``.

    SNPs come from ``Bed(filename)``; phenotypes default to ``filename.fam``
    and are replaced by ``filename.phen`` when that file exists. When
    ``UseCov`` is set and ``filename.cov`` exists, SNPs and phenotypes are
    first intersected with the covariate file (the covariates themselves are
    not returned). ``mph`` is accepted but not used here.

    Returns ``[phenotypes, snps]``.
    """
    snps = Bed(filename)
    phenos = Pheno(filename + ".fam")

    cov_path = filename + ".cov"
    if isfile(cov_path) and UseCov:
        snps, phenos, _ = intersect_apply([snps, phenos, Pheno(cov_path)])

    phen_path = filename + ".phen"
    if isfile(phen_path):
        snps, phenos = intersect_apply([snps, Pheno(phen_path)])

    return [phenos, snps]
Esempio n. 45
0
    def test_SNC(self):
        """Run ``single_snp`` on data with one constant SNP and compare to the "snc" reference."""
        logging.info("TestSNC")
        from pysnptools.snpreader import Bed

        phenotype = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        covariates = pstpheno.loadPhen(self.cov_fn)

        snc = Bed(self.bedbase).read()
        # make SNP #2 have constant values (aka a SNC)
        constant_column = [0] * snc.iid_count
        snc.val[:, 2] = constant_column

        out_name = self.file_name("snc")

        result = single_snp(
            test_snps=snc[:, :10],
            pheno=phenotype,
            G0=snc,
            mixing=0,
            covar=covariates,
            output_file_name=out_name,
        )
        self.compare_files(result, "snc")
Esempio n. 46
0
 def __init__(self, args):
     """Select SNPs from ``args.bfile`` and compute scores over them.

     SNPs are kept when their allele frequency lies strictly between
     ``args.maf`` and ``1 - args.maf``, optionally restricted to the open
     interval (``args.from_bp``, ``args.to_bp``) on ``pos[:, 2]`` and to the
     ids listed (one per line) in ``args.extract``.

     Raises:
         ValueError: if ``args.window_type`` is neither 'BP' nor 'SNP'.
     """
     if args.window_type not in ['BP', 'SNP']:
         raise ValueError('Window type not supported')
     bed_1 = Bed(args.bfile)
     af1 = self.get_allele_frequency(bed_1, args)
     print(len(af1), "SNPs in file 1")
     snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
     print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
     if (args.from_bp is not None) and (args.to_bp is not None):
         in_range = ((bed_1.pos[:, 2] > args.from_bp) &
                     (bed_1.pos[:, 2] < args.to_bp))
         snps_1 = snps_1 & in_range
     snps_to_use = bed_1.sid[snps_1]
     if args.extract is not None:
         # Context manager so the extract file is closed even on error.
         with open(args.extract, 'r') as extract_file:
             keep = np.array([line.strip() for line in extract_file])
         snps_to_use = np.intersect1d(snps_to_use, keep)
         print(len(snps_to_use), "SNPs remaining after extraction")
     # Sorted so the selected SNPs stay in file order.
     bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
     pos = bed_1.pos[bed_1_index]
     bim_1 = pd.read_table(
         bed_1.filename + '.bim',
         header=None,
         names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
     af = af1[bed_1_index]
     if args.afile is not None:
         # Raw string: '\s' is an invalid escape in a normal string literal.
         # NOTE(review): '\s*' can match the empty string; '\s+' may have been
         # intended -- left unchanged, confirm against the afile format.
         a1 = pd.read_table(args.afile,
                            header=None,
                            sep=r'\s*',
                            names=['id1', 'id2', 'theta'])
     else:
         a1 = None
     self.af = af
     self.M = len(bed_1_index)
     self.windows = self.get_windows(pos, args)
     self.chr = pos[:, 0]
     self.pos = pos[:, 2]
     self.id = bed_1.sid[bed_1_index]
     self.A1 = bim_1['a1'].loc[bed_1_index]
     self.A2 = bim_1['a2'].loc[bed_1_index]
     self.scores = self.compute(bed_1, bed_1_index, af, a1, args)
Esempio n. 47
0
    def __init__(self,args):
        """Load genotypes, covariates and phenotype, fit the null model and run ``compute``.

        SNPs are kept when their allele frequency lies strictly between
        ``args.maf`` and ``1 - args.maf``, optionally restricted to the open
        interval (``args.from_bp``, ``args.to_bp``) on ``pos[:, 2]`` and to the
        ids listed (one per line) in ``args.extract``.

        The null model is a logit fit when ``args.linear`` is false and the
        phenotype lies within [0, 1]; otherwise it is an OLS fit.

        Exits with status 1 if no phenotype file is given and
        ``args.bfile + '.pheno'`` cannot be read.
        """
        self.bed = Bed(args.bfile)
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile,header=None)
            self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid))
            self.ncov = self.cov.shape[1] # + constant
        else:
            self.cov = np.ones((self.N,1))
            self.ncov = 1 # Constant
        if args.phenofile is not None:
            Y = pd.read_table(args.phenofile,header=None,na_values='-9')
        else:
            try:
                Y = pd.read_table(args.bfile+'.pheno',header=None,na_values='-9')
            except IOError:
                print("Phenotype file not found.")
                exit(1)
        self.Y = ju._reorder(Y,self.bed.iid)
        af = ju.get_allele_frequency(self.bed,args)
        snps = (af>args.maf)&(af<1-args.maf)
        if (args.from_bp is not None) and (args.to_bp is not None):
            # Previously referenced an undefined name ``bed`` and stored the
            # result in an unused variable, so the range was never applied.
            in_range = ((self.bed.pos[:,2]>args.from_bp) &
                        (self.bed.pos[:,2]<args.to_bp))
            snps = snps & in_range
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            # Context manager so the extract file is closed even on error.
            with open(args.extract,'r') as extract_file:
                keep = np.array([line.strip() for line in extract_file])
            snps_to_use = np.intersect1d(snps_to_use,keep)
        # Sorted so the selected SNPs stay in file order.
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
        pos = self.bed.pos[self.bed_index]
        bim=pd.read_table(self.bed.filename+'.bim',header=None,
                          names=['chm','id','pos_mb','pos_bp','a1','a2'])
        self.af = af[self.bed_index]
        self.M = len(self.bed_index)
        self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type)
        self.pos = pos[:,2]
        self.chr = pos[:,0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.logistic = False
        self.chimin = stats.chi2.ppf(1-args.minp,2)

        # Fit null
        if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1):
            self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0)
            self.logistic = True
        else:
            self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0)
        if self.ncov > 1:
            # With real covariates, replace them by the null model's fitted
            # values (plus a constant column).
            self.cov = sm.add_constant(self.null.fittedvalues)
        self.marg_res, self.joint_res = self.compute(args)
Esempio n. 48
0
def compute(pgs, bedfile=None, bgenfile=None, par_gts_f=None, ped=None, sib=False, compute_controls=False, verbose=True):
    """Compute a polygenic score (PGS) for the individuals with observed genotypes and observed/imputed parental genotypes.

    Args:
        pgs : :class:`snipar.pgs`
            the PGS, defined by the weights for a set of SNPs and the alleles of those SNPs
        bedfile : :class:`str`
            path to bed file with observed genotypes
        bgenfile : :class:`str`
            path to bgen file with observed genotypes; if both ``bedfile`` and ``bgenfile`` are
            given, the bgen variant ids are the ones used for the overlap check
        par_gts_f : :class:`str`
            path to HDF5 file with imputed parental genotypes
        ped :
            passed through unchanged to ``get_gts_matrix``
        sib : :class:`bool`
            Compute the PGS for genotyped individuals with at least one genotyped sibling and observed/imputed parental genotypes. Default False.
        compute_controls : :class:`bool`
            Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False.
        verbose : :class:`bool`
            passed through unchanged to ``get_gts_matrix``

    Returns:
        pg : :class:`snipar.gtarray`
            Return the polygenic score as a genotype array with columns: individual's PGS, mean of their siblings' PGS, observed/imputed paternal PGS,
            observed/imputed maternal PGS. With ``compute_controls`` a list of four such results is returned.
            ``None`` is returned when no PGS variant is present in the observed genotypes.

    Raises:
        ValueError: if neither ``bedfile`` nor ``bgenfile`` is provided.

    """
    if bedfile is None and bgenfile is None:
        # Without this guard the code below failed with an UnboundLocalError.
        raise ValueError('Either bedfile or bgenfile must be provided')
    # Check for SNP overlap
    if bedfile is not None:
        bed = Bed(bedfile, count_A1=True)
        snp_ids = bed.sid
    if bgenfile is not None:
        bgen = open_bgen(bgenfile)
        snp_ids = bgen.ids
        # If every variant shares one id, fall back to the rsids instead.
        if np.unique(snp_ids).shape[0] == 1:
            snp_ids = bgen.rsids
    snp_set = set(snp_ids)
    in_snp_set = np.array([x in snp_set for x in pgs.snp_ids])
    if np.sum(in_snp_set) == 0:
        print('No overlap between variants in weights file and observed genotypes')
        return None

    # Get genotype matrix
    G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped, snp_ids=pgs.snp_ids, sib=sib, compute_controls=compute_controls, verbose=verbose)
    if sib:
        cols = np.array(['proband', 'sibling', 'paternal', 'maternal'])
    else:
        cols = np.array(['proband', 'paternal', 'maternal'])
    if not compute_controls:
        return pgs.compute(G, cols)

    # With controls, G holds four matrices: the first three use the full
    # column set and the last one uses a combined 'parental' column.
    pgs_out = [pgs.compute(x, cols) for x in G[0:3]]
    if sib:
        o_cols = np.array(['proband', 'sibling', 'parental'])
    else:
        o_cols = np.array(['proband', 'parental'])
    pgs_out.append(pgs.compute(G[3], o_cols))
    return pgs_out
Esempio n. 49
0
def get_beta_tildes(bed_file, mean_std_file, betas1, betas2, h1, h2,
                    chunk_size_snp):
    """Rescale two effect-size vectors so their genetic values have variance ``h1`` / ``h2``.

    The genotype matrix in ``bed_file`` is processed ``chunk_size_snp`` SNPs at
    a time. Each chunk is standardized with the per-SNP ``'mean'`` and
    ``'std'`` columns of the tab-separated ``mean_std_file`` (missing
    genotypes are first set to the SNP mean) and multiplied by the matching
    slice of ``betas1`` / ``betas2``.

    Returns:
        ``(beta_tildes1, beta_tildes2)``: ``betas1`` and ``betas2`` multiplied
        by ``sqrt(h / var(G_std @ betas))`` for the respective trait.
    """
    genotypes = Bed(bed_file, count_A1=False)
    snp_stats = pd.read_csv(mean_std_file, delimiter='\t')

    n_indiv = genotypes.row_count
    n_snps = genotypes.col_count

    # Running sums of (standardized genotypes) . betas for each trait.
    gv1 = np.zeros(n_indiv)
    gv2 = np.zeros(n_indiv)

    for start in range(0, n_snps, chunk_size_snp):
        chunk = slice(start, start + chunk_size_snp)

        block = genotypes[:, chunk].read().val
        means = snp_stats['mean'][chunk].values
        stds = snp_stats['std'][chunk].values

        # Replace missing genotypes by the mean of the corresponding SNP.
        nan_rows, nan_cols = np.where(np.isnan(block))
        block[nan_rows, nan_cols] = means[nan_cols]

        # nan_to_num guards against NaN/inf from the division (e.g. zero std).
        block_std = np.nan_to_num((block - means) / stds)

        gv1 += np.dot(block_std, betas1[chunk])
        gv2 += np.dot(block_std, betas2[chunk])

    # Re-scale so the variance of the dot product equals the heritability.
    scale1 = h1 / np.var(gv1)
    scale2 = h2 / np.var(gv2)
    beta_tildes1 = math.sqrt(scale1) * betas1
    beta_tildes2 = math.sqrt(scale2) * betas2

    return beta_tildes1, beta_tildes2
Esempio n. 50
0
def _snps_fixup(snp_input, iid_if_none=None):
    """Normalize ``snp_input`` into a SNP reader.

    * a ``str`` is treated as a Bed file name and wrapped in ``Bed``;
    * ``None`` becomes an empty ``SnpData`` (zero SNPs) over ``iid_if_none``,
      which must then be provided;
    * anything else is returned unchanged.
    """
    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        #todo: make a static factory method on SnpData
        return SnpData(
            iid_if_none,
            sid=np.empty((0), dtype='str'),
            val=np.empty((len(iid_if_none), 0)),
            pos=np.empty((0, 3)),
            parent_string="")

    return Bed(snp_input) if isinstance(snp_input, str) else snp_input
Esempio n. 51
0
 def test_gen5(self):
     """Generate the "gen5" dataset (seed 5) and check it differs from the stored "gen2" data."""
     settings = dict(fst=.1,
                     dfr=.5,
                     iid_count=200,
                     sid_count=20,
                     maf_low=.05,
                     maf_high=.4,
                     seed=5)
     generated = self.gen_and_compare("gen5", **settings)
     other = Bed(self.currentFolder + "/expected/gen2").read()
     same = TestSnpGen.is_same(generated, other)
     assert not same, "Expect different seeds to produce different results"
Esempio n. 52
0
def gen_Test_Bed(filename, n0, n1, m):
    """Write a synthetic Bed data set with ``n1`` + ``n0`` individuals and ``m`` SNPs.

    The first ``n1`` individuals have genotype 2.0 at every SNP and the
    remaining ``n0`` have 0.0. After ``Bed.write`` the ``.fam`` file is
    rewritten so that its sixth column is "2" for the first ``n1`` rows and
    "1" for the rest, with fields separated by single spaces.
    """
    n = n0 + n1
    iid = [["fam_" + str(i), "iid_" + str(i)] for i in range(n)]
    sid = ["snp_" + str(i) for i in range(m)]
    X = [[2.0] * m for _ in range(n1)]
    X.extend([0.0] * m for _ in range(n0))
    dat = SnpData(iid=iid, sid=sid, val=X)
    Bed.write(filename, dat)

    fam_path = filename + ".fam"
    # Context managers so the handles are closed even if rewriting fails.
    with open(fam_path) as fam:
        lines = fam.readlines()
    with open(fam_path, "w") as fam:
        for row, line in enumerate(lines):
            fields = line.strip().split()
            fields[5] = "2" if row < n1 else "1"
            fam.write(" ".join(fields) + "\n")
Esempio n. 53
0
def cluster_data(snpreader):
    """
    Compute and plot a hierarchical clustering of the individuals in a SNP data set.

    ``snpreader`` may be a reader object or a ``str``, which is opened with
    ``Bed``. The figure shows a dendrogram on the left and on top, plus the
    pairwise Euclidean distance matrix reordered by the left dendrogram's
    leaf order. Blocks on ``pylab.show()``.
    """
    reader = Bed(snpreader) if isinstance(snpreader, str) else snpreader
    genotypes = reader.read().standardize().val

    # Generate distance matrix
    from sklearn.metrics.pairwise import euclidean_distances
    distances = euclidean_distances(genotypes, genotypes)

    figure = pylab.figure(figsize=(8, 8))

    # Compute and plot first dendrogram.
    left_axes = figure.add_axes([0.09, 0.1, 0.2, 0.6])
    linkage = fc.linkage(distances, method='average')  # method="centroid" is cubic!
    left_tree = sch.dendrogram(linkage, orientation='right')
    left_axes.set_xticks([])
    left_axes.set_yticks([])

    # Compute and plot second dendrogram (same linkage, default orientation).
    top_axes = figure.add_axes([0.3, 0.71, 0.6, 0.2])
    sch.dendrogram(linkage)
    top_axes.set_xticks([])
    top_axes.set_yticks([])

    # Plot distance matrix, rows and columns ordered by the first dendrogram.
    matrix_axes = figure.add_axes([0.3, 0.1, 0.6, 0.6])
    leaf_order = left_tree['leaves']
    ordered = distances[leaf_order, :][:, leaf_order]
    matrix_axes.matshow(ordered, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    matrix_axes.set_xticks([])
    matrix_axes.set_yticks([])

    pylab.show()
Esempio n. 54
0
    def test1(self):
        """Exercise SnpMemMap: create, fill, flush, reopen, and write from Bed / SnpData.

        Runs with the directory of this file as the working directory and
        restores the previous working directory afterwards, even when an
        assertion fails.
        """
        from pysnptools.snpreader import Bed, SnpMemMap
        from pysnptools.util import example_file  # Download and return local file name

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            filename2 = "tempdir/tiny.snp.memmap"
            pstutil.create_directory_if_necessary(filename2)
            snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']],
                                         sid=['snp334', 'snp349', 'snp921'],
                                         filename=filename2,
                                         order="F",
                                         dtype=np.float64)
            assert isinstance(snpreader2.val, np.memmap)
            snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
            assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                                  np.array([[1.]]))
            snpreader2.flush()
            # Flushing must keep the memmap usable, and flushing twice is fine.
            assert isinstance(snpreader2.val, np.memmap)
            assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                                  np.array([[1.]]))
            snpreader2.flush()

            # Reopen the file written above and check the value round-trips.
            snpreader3 = SnpMemMap(filename2)
            assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val,
                                  np.array([[1.]]))
            assert isinstance(snpreader3.val, np.memmap)

            logging.info("in TestSnpMemMap test1")
            snpreader = SnpMemMap('tempdir/tiny.snp.memmap')
            assert snpreader.iid_count == 2
            assert snpreader.sid_count == 3
            assert isinstance(snpreader.val, np.memmap)

            snpdata = snpreader.read(view_ok=True)
            assert isinstance(snpdata.val, np.memmap)

            bed_file = example_file("pysnptools/examples/toydata.5chrom.*",
                                    "*.bed")
            bed = Bed(bed_file)
            pstutil.create_directory_if_necessary(
                "tempdir/toydata.5chrom.snp.memmap"
            )  #LATER should we just promise to create directories?
            SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap",
                            bed)  # Write bed in SnpMemMap format
            SnpMemMap.write(
                "tempdir/toydata.5chromsnpdata.snp.memmap",
                bed[:, ::2].read())  # Write snpdata in SnpMemMap format
        finally:
            # Restore the cwd so a failure here cannot affect later tests.
            os.chdir(old_dir)
Esempio n. 55
0
    def factory_iterator():
        """Yield ``NaNCNCTestCases`` for every combination of reader, subset, standardizer, dtype and order.

        For each iid subset, SNP subset and standardizer, a reference result
        is computed once from the Bed reader (float64, "C" order); then one
        test case is yielded per reader factory, dtype, order and
        ``force_python_only`` value.

        The generator runs with this file's directory as the working
        directory and restores the previous one when it finishes or is closed.
        """
        snp_reader_factory_bed = lambda: Bed("examples/toydata",
                                             count_A1=False)
        snp_reader_factory_snpmajor_hdf5 = lambda: SnpHdf5(
            "examples/toydata.snpmajor.snp.hdf5")
        snp_reader_factory_iidmajor_hdf5 = lambda: SnpHdf5(
            "examples/toydata.iidmajor.snp.hdf5")
        snp_reader_factory_dat = lambda: Dat("examples/toydata.dat")

        previous_wd = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            snpreader0 = snp_reader_factory_bed()
            S_original = snpreader0.sid_count
            N_original = snpreader0.iid_count

            snps_to_read_count = min(S_original, 100)

            # Floor division: range() rejects the float that "/" yields on
            # Python 3.
            for iid_index_list in [
                    list(range(N_original)),
                    list(range(N_original // 2)),
                    list(range(N_original - 1, 0, -2))
            ]:
                for snp_index_list in [
                        list(range(snps_to_read_count)),
                        list(range(snps_to_read_count // 2)),
                        list(range(snps_to_read_count - 1, 0, -2))
                ]:
                    for standardizer in [Unit(), Beta(1, 25)]:
                        # NOTE(review): the string "False" (truthy) is passed
                        # where the test cases below pass a bool -- kept
                        # as-is; confirm whether the literal False was meant.
                        reference_snps, reference_dtype = NaNCNCTestCases(
                            iid_index_list, snp_index_list, standardizer,
                            snp_reader_factory_bed(), sp.float64, "C", "False",
                            None, None).read_and_standardize()
                        for snpreader_factory in [
                                snp_reader_factory_bed,
                                snp_reader_factory_snpmajor_hdf5,
                                snp_reader_factory_iidmajor_hdf5,
                                snp_reader_factory_dat
                        ]:
                            for dtype in [sp.float64, sp.float32]:
                                for order in ["C", "F"]:
                                    for force_python_only in [False, True]:
                                        snpreader = snpreader_factory()
                                        test_case = NaNCNCTestCases(
                                            iid_index_list, snp_index_list,
                                            standardizer, snpreader, dtype, order,
                                            force_python_only, reference_snps,
                                            reference_dtype)
                                        yield test_case
        finally:
            # Runs on exhaustion and on early close of the generator.
            os.chdir(previous_wd)
    def __init__(self, snp_fn, out_prefix):
        """Remember the SNP file and output prefix and open a ``Bed`` reader on the SNP file.

        ``eigen_fn`` is set to ``snp_fn + "_pcs.pickle"``; ``force_recompute``
        starts out False.
        """
        from pysnptools.snpreader import Bed

        self.force_recompute = False

        self.snp_fn = snp_fn
        self.out_prefix = out_prefix

        self.snp_reader = Bed(snp_fn)
        self.eigen_fn = self.snp_fn + "_pcs.pickle"
Esempio n. 57
0
def genPheno(filename="../thinFam",per=.5,savename="fakePheno.txt",c=2.0,num=5):
    """Pick ``num`` random SNPs from the Bed file ``filename`` and clean their genotype values.

    Negative values are clamped to 0.0, and any value that is not 0.0, 1.0 or
    2.0 after clamping is replaced by 0.0. Diagnostic output is printed along
    the way. ``per``, ``savename`` and ``c`` are not used in the part of the
    function visible here.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3;
    # the old print statements were a SyntaxError on Python 3.
    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    print(m)
    print(n)
    # Column indices of the randomly chosen SNPs (drawn with replacement).
    I = [rand.randint(0, m - 1) for _ in range(num)]
    SNP = [[D[j][i] for j in range(n)] for i in I]
    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)

    # Clamp negatives to 0.0, then zero out anything that is not a valid
    # genotype count.
    SNP = [[max(v, 0.0) for v in s] for s in SNP]
    for i in range(num):
        for j in range(n):
            if SNP[i][j] not in [1.0, 0.0, 2.0]:
                SNP[i][j] = 0.0
    print([list(set(s)) for s in SNP])
Esempio n. 58
0
    def test_match_cpp(self):
        r'''
        match
            FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

        Every SNP pair in the reference output must also appear (in either
        order) in the Python result, with a relative p-value difference
        below 0.035.
        '''
        logging.info("TestEpistasis test_match_cpp")
        from pysnptools.snpreader import Bed
        snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
        pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
        sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
        test_idx = snps.sid_to_index(test_sid)

        frame = epistasis(snps[:,test_idx], pheno,covar=covar, G0 = snps[:,sim_idx],log_delta=0)
        sid0,sid1,pvalue_list =np.array(frame['SNP0']),np.array(frame['SNP1']),np.array(frame['PValue'])

        referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")

        import pandas as pd
        table = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file
        assert len(pvalue_list) == len(table)
        for row in table.iterrows():
            snp0cpp,snp1cpp,pvaluecpp,i1,i2 = row[1]
            # Initialised once per reference row so it is always defined and
            # never carries over from the previous row.
            found = False
            # range() instead of xrange(), which does not exist on Python 3.
            for i in range(len(pvalue_list)):
                pvaluepy = pvalue_list[i]
                snp0py = sid0[i]
                snp1py = sid1[i]
                # The pair may be reported in either order.
                if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                    found = True
                    diff = abs(pvaluecpp - pvaluepy)/pvaluecpp
                    assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp,snp1cpp,pvaluecpp,pvaluepy,diff)
                    break
            assert found
Esempio n. 59
0
 def __init__(self,args):
     """Load genotypes and covariates, select SNPs, and draw the sampled max statistics.

     SNPs are kept when their allele frequency lies strictly between
     ``args.maf`` and ``1 - args.maf``, optionally restricted to the open
     interval (``args.from_bp``, ``args.to_bp``) on ``pos[:, 2]`` and to the
     ids listed (one per line) in ``args.extract``.

     After ``self.sample(args)`` the sampled statistics are converted to
     p-values with chi-square survival functions (2 d.o.f. for ``JMaxStats``,
     1 d.o.f. for ``ZMaxStats**2``); ``minP`` is their element-wise minimum.
     """
     self.bed = Bed(args.bfile)
     self.N = self.bed.iid_count
     if args.covfile is not None:
         cov = pd.read_table(args.covfile,header=None)
         self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid))
         self.ncov = self.cov.shape[1] # + constant
     else:
         self.cov = np.ones((self.N,1))
         self.ncov = 1 # Constant
     af = ju.get_allele_frequency(self.bed,args)
     snps = (af>args.maf)&(af<1-args.maf)
     if (args.from_bp is not None) and (args.to_bp is not None):
         # Previously referenced an undefined name ``bed`` and stored the
         # result in an unused variable, so the range was never applied.
         in_range = ((self.bed.pos[:,2]>args.from_bp) &
                     (self.bed.pos[:,2]<args.to_bp))
         snps = snps & in_range
     snps_to_use = self.bed.sid[snps]
     if args.extract is not None:
         # Context manager so the extract file is closed even on error.
         with open(args.extract,'r') as extract_file:
             keep = np.array([line.strip() for line in extract_file])
         snps_to_use = np.intersect1d(snps_to_use,keep)
     # Sorted so the selected SNPs stay in file order.
     self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
     pos = self.bed.pos[self.bed_index]
     bim=pd.read_table(self.bed.filename+'.bim',header=None,
                       names=['chm','id','pos_mb','pos_bp','a1','a2'])
     self.af = af[self.bed_index]
     self.M = len(self.bed_index)
     self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type)
     self.sample_windows = ju.get_windows(pos,self.M,args.sample_window_size,
                                          args.sample_window_type)
     self.pos = pos[:,2]
     self.chr = pos[:,0]
     self.id = self.bed.sid[self.bed_index]
     self.A1 = bim['a1'].loc[self.bed_index]
     self.A2 = bim['a2'].loc[self.bed_index]
     self.numSamples = args.numSamples
     self.JMaxStats, self.ZMaxStats = self.sample(args)
     self.JMinP = stats.chi2.sf(self.JMaxStats,2)
     self.ZMinP = stats.chi2.sf(self.ZMaxStats**2,1)
     self.minP = np.minimum(self.JMinP,self.ZMinP)