def work_sequence(self):
    """Yield one deferred work item per left-out chromosome.

    The phenotype and SNP data are loaded via ``load_intersect`` and a
    plain linear regression is run over all SNPs up front.  The SNPs are
    then split with ``LeaveOneChromosomeOut`` (keyed on column 0 of
    ``self.snp_reader.pos``), and for every split a zero-argument callable
    is yielded which, when invoked, calls ``self.dowork`` for that split.

    Only the first work item carries the shared ``result`` dictionary
    (placeholder p-values of -1, the linear-regression p-values, the SNP
    names and the SNP positions); every later item receives ``None``.

    Yields:
        A callable taking no required arguments that evaluates
        ``self.dowork(i, train_snp_idx, test_snp_idx, result, G, y)``.

    Raises:
        Exception: if the number of splits differs from
            ``self.chrom_count``.
    """
    # is it OK to do the intersect and the linear regression 23 extra times?
    G, y, snp_name, _ = load_intersect(self.snp_reader, self.pheno_fn)

    # compute linear regression
    _, p_values_lin = f_regression(G, y, center=True)

    # get chr names/id
    chr_ids = self.snp_reader.pos[:, 0]

    loco = LeaveOneChromosomeOut(chr_ids, indices=True)

    # Compare by value.  An identity test (`is not`) on integers only
    # happens to work for small cached ints and would wrongly raise when
    # chrom_count is, e.g., a NumPy integer or a float holding the same
    # value.
    if len(loco) != self.chrom_count:
        raise Exception("The snp reader has {0} chromosome, not {1} as specified".format(len(loco), self.chrom_count))

    for i, (train_snp_idx, test_snp_idx) in enumerate(loco):
        if i == 0:
            # set up the (still empty) return structures once, on the
            # first work item only
            result = {
                "p_values": -np.ones(len(snp_name)),
                "p_values_lin": p_values_lin,
                "rs": snp_name,
                "pos": self.snp_reader.pos,
            }
        else:
            result = None

        # Binding the loop variables as default arguments freezes their
        # current values; a plain closure would look them up late and see
        # whatever the loop assigned last by the time the lambda is called.
        yield lambda i=i, train_snp_idx=train_snp_idx, test_snp_idx=test_snp_idx, result=result, G=G, y=y: self.dowork(i, train_snp_idx, test_snp_idx, result, G, y)
def test_results_identical_with_fastlmmc(self):
    """
    make sure gwas yields same results as fastlmmC

    Steps, in order:

    1. Load the toy data set located relative to this file and split the
       SNPs by index: the first 5000 are used to build the similarity
       (``idx_sim``) and the next 5000 are tested (``idx_test``).
    2. Run ``GwasTest`` with ``REML=False`` and compare its p-values, in
       log space, with ``GwasPrototype`` (also ``REML=False``).
    3. Run ``GwasTest`` with ``REML=True`` and compare its p-values, in
       log space, with ``FastGwas``.
    4. Run ``single_snp`` with a fixed ``h2 = 1 / (delta + 1)`` and compare
       its p-values with the results from steps 2 and 3.
    5. Repeat the ``GwasTest`` / ``single_snp`` comparison with the
       variance parameter left unspecified (``delta=None`` / ``h2=None``).
    """
    currentFolder = os.path.dirname(os.path.realpath(__file__))

    bed_fn = os.path.join(currentFolder, "../../feature_selection/examples/toydata")
    pheno_fn = os.path.join(currentFolder, "../../feature_selection/examples/toydata.phe")

    snp_reader = Bed(bed_fn)
    G, y, _, _ = load_intersect(snp_reader, pheno_fn)

    snp_pos = snp_reader.rs

    idx_sim = range(0, 5000)
    idx_test = range(5000, 10000)

    snp_pos_sim = snp_pos[idx_sim]
    snp_pos_test = snp_pos[idx_test]

    G_chr1, G_chr2 = G[:, idx_sim], G[:, idx_test]
    delta = 1.0

    ###################################
    # REML IN lmm.py is BROKEN!!

    # we compare REML=False in lmm.py to fastlmmc
    REML = False
    gwas_c_reml = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta, REML=REML)
    gwas_c_reml.run_gwas()

    gwas = GwasPrototype(G_chr1, G_chr2, y, delta, REML=REML)
    gwas.run_gwas()

    # check p-values in log-space!
    np.testing.assert_array_almost_equal(np.log(gwas.p_values), np.log(gwas_c_reml.p_values), decimal=3)

    # we compare lmm_cov.py to fastlmmc; note that fastlmmc is run with
    # REML=True for this comparison
    gwas_c = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta, REML=True)
    gwas_c.run_gwas()
    gwas_f = FastGwas(G_chr1, G_chr2, y, delta, findh2=False)
    gwas_f.run_gwas()
    np.testing.assert_array_almost_equal(np.log(gwas_c.p_values), np.log(gwas_f.p_values_F), decimal=2)

    # additional testing code for the new wrapper functions

    # Fix delta
    from pysnptools.snpreader import Bed as BedSnpReader
    from fastlmm.association.single_snp import single_snp
    snpreader = BedSnpReader(bed_fn, count_A1=False)
    # delta is expressed as h2 = 1 / (delta + 1) for single_snp
    frame = single_snp(test_snps=snpreader[:, idx_test], pheno=pheno_fn, G0=snpreader[:, idx_sim], h2=1.0/(delta+1.0), leave_out_one_chrom=False, count_A1=False)
    _, pvalue_list = frame['SNP'].values, frame['PValue'].values

    np.testing.assert_allclose(gwas_f.sorted_p_values_F, pvalue_list, rtol=1e-10)

    # the frame is re-sorted by genomic position before comparing against
    # the unsorted p_values attributes
    p_vals_by_genomic_pos = frame.sort_values(["Chr", "ChrPos"])["PValue"].tolist()
    np.testing.assert_allclose(gwas_c_reml.p_values, p_vals_by_genomic_pos, rtol=.1)
    np.testing.assert_allclose(gwas_c_reml.p_values, gwas_f.p_values_F, rtol=.1)
    np.testing.assert_allclose(gwas_f.sorted_p_values_F, gwas_c_reml.sorted_p_values, rtol=.1)

    # Search over delta
    gwas_c_reml_search = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta=None, REML=True)
    gwas_c_reml_search.run_gwas()

    frame_search = single_snp(test_snps=snpreader[:, idx_test], pheno=pheno_fn, G0=snpreader[:, idx_sim], h2=None, leave_out_one_chrom=False, count_A1=False)
    _, pvalue_list_search = frame_search['SNP'].values, frame_search['PValue'].values

    p_vals_by_genomic_pos = frame_search.sort_values(["Chr", "ChrPos"])["PValue"].tolist()
    np.testing.assert_allclose(gwas_c_reml_search.p_values, p_vals_by_genomic_pos, rtol=.001)
    np.testing.assert_allclose(gwas_c_reml_search.sorted_p_values, pvalue_list_search, rtol=.001)