def test_G0_has_reader(self):
    """Run ``single_snp`` with the Bed reader supplied as ``G0``, then as ``G1``.

    First call: reader as ``G0`` with ``mixing=0``.  Second call: reader as
    ``G1`` next to a ``KernelIdentity`` ``G0`` with ``mixing=1``.  Each
    resulting frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_G0_has_reader")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G0_has_reader")

    # Keyword arguments common to both calls.
    shared = dict(pheno=phen_fn, covar=cov_fn, leave_out_one_chrom=False,
                  output_file_name=out_path)

    reader_as_g0 = single_snp(test_snps=bed[:, :10], G0=bed, mixing=0,
                              **shared)
    self.compare_files(reader_as_g0, "one")

    reader_as_g1 = single_snp(test_snps=bed[:, :10],
                              G0=KernelIdentity(bed.iid), G1=bed, mixing=1,
                              **shared)
    self.compare_files(reader_as_g1, "one")
def test_file_cache(self):
    """Call ``single_snp`` twice with the same ``cache_file``.

    The first call supplies ``G0``/``G1``; the second passes ``None`` for
    both.  Both frames are handed to ``self.compare_files`` under "G1".
    """
    logging.info("TestSingleSnp test_file_cache")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G1")
    npz_path = self.file_name("cache_file") + ".npz"
    # Drop any cache file left behind by an earlier run.
    if os.path.exists(npz_path):
        os.remove(npz_path)

    # Keyword arguments common to both calls.
    shared = dict(pheno=phen_fn, covar=cov_fn, mixing=.5,
                  output_file_name=out_path, cache_file=npz_path)

    with_kernels = single_snp(test_snps=bed[:, :10], G0=bed[:, 10:100],
                              G1=bed[:, 100:200], **shared)
    self.compare_files(with_kernels, "G1")

    without_kernels = single_snp(test_snps=bed[:, :10], G0=None, G1=None,
                                 **shared)
    self.compare_files(without_kernels, "G1")
def test_file_cache(self):
    """Call ``single_snp`` twice with identical arguments and a ``cache_file``.

    Any existing cache file is removed first.  The frame from each call is
    handed to ``self.compare_files`` under the name "G1".
    """
    logging.info("TestSingleSnp test_file_cache")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G1")
    npz_path = self.file_name("cache_file") + ".npz"
    # Drop any cache file left behind by an earlier run.
    if os.path.exists(npz_path):
        os.remove(npz_path)

    # Two passes with the very same arguments.
    for _ in (0, 1):
        frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=bed[:, 10:100],
                           G1=bed[:, 100:200],
                           covar=cov_fn,
                           mixing=.5,
                           leave_out_one_chrom=False,
                           output_file_name=out_path,
                           cache_file=npz_path)
        self.compare_files(frame, "G1")
def test_gb_goal(self):
    """Run ``single_snp`` with ``GB_goal=0`` and then ``GB_goal=.12``.

    Each run writes to its own output file; both frames are handed to
    ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_gb_goal")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn

    # (output file stem, GB_goal) for each run, in the original order.
    for stem, gb_goal in (("gb_goal", 0), ("gb_goal2", .12)):
        out_path = self.file_name(stem)
        frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=bed,
                           covar=cov_fn,
                           mixing=0,
                           leave_out_one_chrom=False,
                           GB_goal=gb_goal,
                           output_file_name=out_path)
        self.compare_files(frame, "one")
def test_file_cache(self):
    """Call ``single_snp`` twice with the same ``cache_file``.

    The first call supplies ``G0``/``G1``; the second passes ``None`` for
    both.  Both frames are handed to ``self.compare_files`` under "G1".
    """
    logging.info("TestSingleSnp test_file_cache")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G1")
    npz_path = self.file_name("cache_file") + ".npz"
    # Drop any cache file left behind by an earlier run.
    if os.path.exists(npz_path):
        os.remove(npz_path)

    # Keyword arguments common to both calls.
    shared = dict(pheno=phen_fn, covar=cov_fn, mixing=.5,
                  output_file_name=out_path, cache_file=npz_path)

    with_kernels = single_snp(test_snps=bed[:, :10], G0=bed[:, 10:100],
                              G1=bed[:, 100:200], **shared)
    self.compare_files(with_kernels, "G1")

    without_kernels = single_snp(test_snps=bed[:, :10], G0=None, G1=None,
                                 **shared)
    self.compare_files(without_kernels, "G1")
def test_match_cpp(self):
    r'''
    match
    FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

    Runs ``single_snp`` with ``h2=.5`` and with ``log_delta=0`` for both
    orderings of (SNP kernel, identity kernel) and requires every p-value to
    be within a relative difference of .035 of the stored reference file.
    '''
    # The docstring is a raw string: it quotes a Windows command line, and in
    # a normal string literal "\b" would turn into a backspace while "\D",
    # "\w", "\c" and "\." are invalid escape sequences.
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    # SNPs used to build the similarity kernel.
    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp11253_m0_.2m1_.2",
        "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15",
        "snp63750_m0_.28m1_.28", "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37", "snp15002_m0_.11m1_.11",
        "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2",
        "snp60001_m0_.39m1_.39", "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13",
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    # SNPs that are actually tested.
    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp86250_m0_.33m1_.33",
        "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15",
        "snp3753_m0_.23m1_.23", "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25",
    ]
    test_idx = snps.sid_to_index(test_sid)

    # The reference table does not depend on the loop variables, so it is
    # read once up front instead of once per frame.
    referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
    reference = pd.read_table(referenceOutfile, sep="\t")  # We've manually removed all comments and blank lines from this file

    kernel_orderings = [
        (snps[:, sim_idx], KernelIdentity(snps.iid)),
        (KernelIdentity(snps.iid), snps[:, sim_idx]),
    ]
    for G0, G1 in kernel_orderings:
        frame_h2 = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                              covar=covar, h2=.5, leave_out_one_chrom=False, count_A1=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                                     covar=covar, log_delta=0, leave_out_one_chrom=False,
                                     count_A1=False)
        for frame in [frame_h2, frame_log_delta]:
            assert len(frame) == len(reference)
            for _, row in reference.iterrows():
                sid = row.SNP
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(sid, None, row.Pvalue, pvalue, reldiff)
def execute_fs(test_snps, pheno, G0, covar):
    """
    run feature selection

    Writes the phenotype and the covariates to temporary space-separated
    text files, runs the feature-selection distributable on ``G0`` locally,
    and then runs ``single_snp`` (without and with covariates) using only
    the selected SNPs as ``G0``.

    :param test_snps: SNPs to test; passed straight to ``single_snp``.
    :param pheno: mapping whose ``"vals"`` entry holds the phenotype values.
    :param G0: SNP reader providing ``iid``, ``sid`` and column indexing.
    :param covar: mapping whose ``"vals"`` entry is a 2-D covariate array.
    :return: ``(result, fs_result)`` -- ``result`` maps "fs_all" and
        "fs_all_cov" to p-value arrays sorted by Chr/ChrPos; ``fs_result``
        maps "result_uncond_all" to the raw feature-selection output.

    NOTE(review): ``DataFrame.from_items``, ``DataFrame.sort`` and
    ``as_matrix`` only exist in older pandas releases.
    """
    import os  # local import: only needed for temp-file cleanup

    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)

    try:
        out_data = pd.DataFrame({
            "id1": G0.iid[:, 0],
            "id2": G0.iid[:, 1],
            "y": pheno["vals"]
        })
        out_data.to_csv(out_fn, sep=" ", header=False, index=False)

        # write out covariates
        items = [
            ('id1', G0.iid[:, 0]),
            ('id2', G0.iid[:, 1]),
        ]
        items += [("pc_%i" % i, covar["vals"][:, i])
                  for i in range(covar["vals"].shape[1])]
        cov_df = pd.DataFrame.from_items(items)
        cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

        #TODO: fix include_all!!
        fsd = create_feature_selection_distributable(G0, out_fn, None, 0, "fs_out",
                                                     include_all=False, cov_fn=cov_fn)
        fs_result["result_uncond_all"] = Local().run(fsd)
    finally:
        # The temp files are only inputs to the feature-selection run above;
        # remove them so repeated calls do not litter the working directory.
        for tmp_fn in (out_fn, cov_fn):
            if os.path.exists(tmp_fn):
                os.remove(tmp_fn)

    best_k, best_delta, best_obj, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)
    G_fs = G0[:, fs_idx]

    result["fs_all"] = single_snp(test_snps, pheno, G0=G_fs).sort(
        ["Chr", "ChrPos"])["PValue"].as_matrix()
    result["fs_all_cov"] = single_snp(test_snps, pheno, G0=G_fs, covar=covar).sort(
        ["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
def test_leave_one_out_with_prekernels(self):
    """Run ``single_snp`` with one precomputed kernel per chromosome.

    For every chromosome a kernel is read from the SNPs on all *other*
    chromosomes, standardized with ``DiagKtoN`` and stored in a dict that
    is passed as ``K0``.  The frame is handed to ``self.compare_files``
    under the name "one_looc".
    """
    logging.info(
        "TestSingleSnpLeaveOutOneChrom test_leave_one_out_with_prekernels")
    from pysnptools.kernelstandardizer import DiagKtoN

    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn

    kernel_by_chrom = {}
    # NOTE(review): the flattened source does not show where this ``with``
    # block ends; the single_snp call is kept inside it here -- confirm.
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}):
        for chrom in np.unique(bed.pos[:, 0]):
            off_chrom = bed[:, bed.pos[:, 0] != chrom]
            # Create a kernel from the SNPs not used in testing
            raw_kernel = off_chrom.read_kernel(standardizer=Unit(),
                                               block_size=500)
            # improves the kernel numerically by making its diagonal sum to iid_count
            kernel_by_chrom[chrom] = raw_kernel.standardize(DiagKtoN())

        out_path = self.file_name("one_looc_prekernel")
        frame = single_snp(bed, phen_fn,
                           covar=cov_fn,
                           K0=kernel_by_chrom,
                           output_file_name=out_path,
                           count_A1=False)
        self.compare_files(frame, "one_looc")
def execute_dual_fs(test_snps, pheno, G0, covar):
    """
    implementation of dual-kernel feature selection

    Selects SNPs from ``G0`` in-sample (conditioned on the full kernel),
    then runs ``single_snp`` with ``G0`` as background kernel and the
    selected SNPs as foreground kernel ``G1``.

    :param test_snps: SNPs to test; passed straight to ``single_snp``.
    :param pheno: mapping whose ``"vals"`` entry holds the phenotype values.
    :param G0: SNP reader; read and standardized for the selection step.
    :param covar: mapping whose ``"vals"`` entry holds the covariates.
    :return: ``(result, fs_result)`` -- ``result["full_fs_low"]`` is the
        p-value array sorted by Chr/ChrPos; ``fs_result`` maps
        "insample_cond_full" to the raw ``run_select`` output.

    NOTE(review): ``DataFrame.sort`` and ``as_matrix`` only exist in older
    pandas releases.
    """
    result = {}
    fs_result = {}

    # extract data
    # (the standardized copy of test_snps the original computed here was
    # never used, so that extra full read has been dropped)
    G_train_unnorm = G0.read().standardize().val

    # fs conditioned on full kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(
        G_train_unnorm, G_train_unnorm, pheno["vals"], cov=covar["vals"])
    best_k, fs_idx, best_mix, _ = fs_result["insample_cond_full"]
    # Single pre-formatted string: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("best_k: {0} , best_mix: {1}".format(best_k, best_mix))

    # set up foreground kernel
    G1 = G0[:, fs_idx]
    result["full_fs_low"] = single_snp(
        test_snps, pheno, G0=G0, covar=covar, G1=G1,
        mixing=best_mix).sort(["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
def test_linreg(self):
    """Compare ``single_snp`` with an identity kernel against ``single_snp_linreg``.

    The ``single_snp`` frame is cut down to six columns before comparison.
    Both frames are handed to ``self.compare_files`` under "linreg".
    """
    logging.info("TestSingleSnp test_linreg")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("linreg")

    lmm_frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=KernelIdentity(iid=bed.iid),
                           covar=cov_fn,
                           mixing=0,
                           leave_out_one_chrom=False,
                           output_file_name=out_path)
    # Keep only these columns for the comparison.
    wanted_columns = ['sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue']
    lmm_frame = lmm_frame[wanted_columns]
    self.compare_files(lmm_frame, "linreg")

    linreg_frame = single_snp_linreg(test_snps=bed[:, :10],
                                     pheno=phen_fn,
                                     covar=cov_fn,
                                     output_file_name=out_path)
    self.compare_files(linreg_frame, "linreg")
def test_single_snp(args):
    """Run ``single_snp`` for trait1..trait3 and save one result file per trait.

    Reads ``args.phenotype_file``, ``args.sample_indices_file`` (optional),
    ``args.snp_file``, ``args.k0_file``, ``args.output_dir``,
    ``args.GB_goal`` and ``args.manhattan``.  For each trait a phenotype
    file is written, the test is run, the result frame is stored with
    ``to_hdf`` and, when ``args.manhattan`` is truthy, a Manhattan plot is
    saved as a PDF.
    """
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # Two identical columns built from the 'id' column.
    id_column = phenotypes['id'].values.astype('S')[:, np.newaxis]
    iid = np.repeat(id_column, 2, axis=1)

    if args.sample_indices_file is None:
        # Fall back to the positions of the rows typed 'training'.
        sample_indices = np.nonzero(
            (phenotypes['type'] == 'training').values)[0]
    else:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    training_rows = phenotypes[phenotypes['type'] == 'training'].copy()
    training_rows['fid'] = training_rows['id']
    training_rows['iid'] = training_rows['id']

    for trait in ('trait1', 'trait2', 'trait3'):
        pheno_path = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_path)
        trait_table = training_rows[['fid', 'iid', trait]]
        trait_table.to_csv(pheno_path, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_path)

        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps, pheno, K0=K0, count_A1=True,
                                GB_goal=args.GB_goal)

        result_path = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_path)
        results_df.to_hdf(result_path, trait)

        if not args.manhattan:
            continue
        plot_path = os.path.join(args.output_dir, 'manhattan.%s.pdf' % trait)
        logger.info('create Manhattan plot: ' + plot_path)
        plt.clf()
        plot_input = results_df.as_matrix(["Chr", "ChrPos", "PValue"])
        flutil.manhattan_plot(plot_input, pvalue_line=1e-5,
                              xaxis_unit_bp=False)
        plt.savefig(plot_path)
def execute_dual_fs(test_snps, pheno, G0, covar):
    """
    implementation of dual-kernel feature selection

    Selects SNPs from ``G0`` in-sample (conditioned on the full kernel),
    then runs ``single_snp`` with ``G0`` as background kernel and the
    selected SNPs as foreground kernel ``G1``.

    :param test_snps: SNPs to test; passed straight to ``single_snp``.
    :param pheno: mapping whose ``"vals"`` entry holds the phenotype values.
    :param G0: SNP reader; read and standardized for the selection step.
    :param covar: mapping whose ``"vals"`` entry holds the covariates.
    :return: ``(result, fs_result)`` -- ``result["full_fs_low"]`` is the
        p-value array sorted by Chr/ChrPos; ``fs_result`` maps
        "insample_cond_full" to the raw ``run_select`` output.

    NOTE(review): ``DataFrame.sort`` and ``as_matrix`` only exist in older
    pandas releases.
    """
    result = {}
    fs_result = {}

    # extract data
    # (the standardized copy of test_snps the original computed here was
    # never used, so that extra full read has been dropped)
    G_train_unnorm = G0.read().standardize().val

    # fs conditioned on full kernel
    select = FeatureSelectionInSample(max_log_k=7, order_by_lmm=True)
    fs_result["insample_cond_full"] = select.run_select(
        G_train_unnorm, G_train_unnorm, pheno["vals"], cov=covar["vals"])
    best_k, fs_idx, best_mix, _ = fs_result["insample_cond_full"]
    # Single pre-formatted string: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("best_k: {0} , best_mix: {1}".format(best_k, best_mix))

    # set up foreground kernel
    G1 = G0[:, fs_idx]
    result["full_fs_low"] = single_snp(
        test_snps, pheno, G0=G0, covar=covar, G1=G1,
        mixing=best_mix).sort(["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
def execute_lmm(test_snps, pheno, G0, covar):
    """Run ``single_snp`` with ``G0`` as kernel and collect the p-values.

    :return: ``(result, fs_result)`` -- ``result["full"]`` holds the
        "PValue" column after sorting by Chr and ChrPos; ``fs_result`` is
        always an empty dict.
    """
    scan = single_snp(test_snps, pheno, G0=G0, covar=covar)
    ordered = scan.sort(["Chr", "ChrPos"])
    result = {"full": ordered["PValue"].as_matrix()}
    fs_result = {}
    return result, fs_result
def execute_lmm(test_snps, pheno, G0, covar):
    """Run ``single_snp`` with ``G0`` as kernel and collect the p-values.

    :return: ``(result, fs_result)`` -- ``result["full"]`` holds the
        "PValue" column after sorting by Chr and ChrPos; ``fs_result`` is
        always an empty dict.
    """
    scan = single_snp(test_snps, pheno, G0=G0, covar=covar)
    ordered = scan.sort(["Chr", "ChrPos"])
    result = {"full": ordered["PValue"].as_matrix()}
    fs_result = {}
    return result, fs_result
def test_old(self):
    """Run ``single_snp`` on ``self.bed`` with covariates and no explicit kernel.

    The frame is handed to ``self.compare_files`` under the name "old".
    """
    logging.info("test_old")
    out_path = self.file_name("old")
    frame = single_snp(self.bed,
                       self.phen_fn,
                       covar=self.cov_fn,
                       output_file_name=out_path,
                       count_A1=True)
    self.compare_files(frame, "old")
def test_G0_has_reader(self):
    """Run ``single_snp`` with the Bed reader supplied as ``G0``, then as ``G1``.

    First call: reader as ``G0`` with ``mixing=0``.  Second call: reader as
    ``G1`` next to a ``KernelIdentity`` ``G0`` with ``mixing=1``.  Each
    resulting frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_G0_has_reader")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G0_has_reader")

    # Keyword arguments common to both calls.
    shared = dict(pheno=phen_fn, covar=cov_fn, leave_out_one_chrom=False,
                  output_file_name=out_path, count_A1=False)

    reader_as_g0 = single_snp(test_snps=bed[:, :10], G0=bed, mixing=0,
                              **shared)
    self.compare_files(reader_as_g0, "one")

    reader_as_g1 = single_snp(test_snps=bed[:, :10],
                              G0=KernelIdentity(bed.iid), G1=bed, mixing=1,
                              **shared)
    self.compare_files(reader_as_g1, "one")
def test_no_cov(self):
    """Run ``single_snp`` without a ``covar`` argument.

    The frame is handed to ``self.compare_files`` under the name "no_cov".
    """
    logging.info("TestSingleSnp test_no_cov")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    out_path = self.file_name("no_cov")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       mixing=0,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "no_cov")
def mapper_single_snp_2K_given_chrom(test_chr):
    """Run ``single_snp`` with two kernels on the SNPs of one chromosome.

    ``K0`` is the reader obtained from ``_K_per_chrom(G, test_chr, G.iid)``;
    ``K1`` is that reader restricted to the sids stored for this chromosome
    in ``chrom_index_to_best_sid``.  Every other name is read from the
    enclosing scope.

    :param test_chr: chromosome value matched against ``test_snps.pos[:,0]``
        and looked up in ``chrom_list``.
    :return: the value returned by ``single_snp``.
    """
    logging.info("Working on chr={0}".format(test_chr))
    on_this_chrom = test_snps.pos[:, 0] == test_chr
    snps_to_test = test_snps[:, on_this_chrom]

    background = _K_per_chrom(G, test_chr, G.iid).snpreader
    selected_sid = chrom_index_to_best_sid[chrom_list.index(test_chr)]
    foreground = background[:, background.sid_to_index(selected_sid)]

    return single_snp(test_snps=snps_to_test,
                      K0=background,
                      K1=foreground,
                      pheno=pheno,
                      covar=covar,
                      leave_out_one_chrom=False,
                      GB_goal=GB_goal,
                      force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      mixing=mixing,
                      h2=h2)
def test_gb_goal(self):
    """Run ``single_snp`` with ``GB_goal=0`` and then ``GB_goal=.12``.

    Each run writes to its own output file; both frames are handed to
    ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_gb_goal")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn

    # (output file stem, GB_goal) for each run, in the original order.
    for stem, gb_goal in (("gb_goal", 0), ("gb_goal2", .12)):
        out_path = self.file_name(stem)
        frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=bed,
                           covar=cov_fn,
                           mixing=0,
                           leave_out_one_chrom=False,
                           GB_goal=gb_goal,
                           output_file_name=out_path,
                           count_A1=False)
        self.compare_files(frame, "one")
def test_no_cov(self):
    """Run ``single_snp`` without a ``covar`` argument.

    The frame is handed to ``self.compare_files`` under the name "no_cov".
    """
    logging.info("TestSingleSnp test_no_cov")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    out_path = self.file_name("no_cov")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       mixing=0,
                       output_file_name=out_path)
    self.compare_files(frame, "no_cov")
def test_other(self):
    """Run ``single_snp`` with the Bed reader given only as ``K1``.

    The frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_other")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("other")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       K1=bed,
                       covar=cov_fn,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "one")
def test_interact(self):
    """Run ``single_snp`` with ``interact_with_snp=1``.

    The frame is handed to ``self.compare_files`` under the name "interact".
    """
    logging.info("TestSingleSnp test_interact")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("interact")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       covar=cov_fn,
                       mixing=0,
                       leave_out_one_chrom=False,
                       interact_with_snp=1,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "interact")
def test_mixid(self):
    """Run ``single_snp`` mixing a SNP ``G0`` with an identity ``K1`` at .25.

    The frame is handed to ``self.compare_files`` under the name "mixid".
    """
    logging.info("TestSingleSnp test_mixid")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("mixid")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed[:, 10:100],
                       K1=KernelIdentity(bed.iid),
                       covar=cov_fn,
                       mixing=.25,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "mixid")
def test_mixingKs(self):
    """Run ``single_snp`` with two ``SnpKernel`` inputs and ``mixing=None``.

    The frame is handed to ``self.compare_files`` under the name "mixing".
    """
    logging.info("TestSingleSnp test_mixingKs")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("mixingKs")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       K0=SnpKernel(bed[:, 10:100], Unit()),
                       K1=SnpKernel(bed[:, 100:200], Unit()),
                       covar=cov_fn,
                       mixing=None,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "mixing")
def test_interact_looc(self):
    """Run ``single_snp`` on all SNPs with ``interact_with_snp=0``.

    ``leave_out_one_chrom`` is not passed.  The frame is handed to
    ``self.compare_files`` under the name "interact_looc".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_interact_looc")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("interact_looc")

    frame = single_snp(bed, phen_fn,
                       covar=cov_fn,
                       mixing=0,
                       interact_with_snp=0,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "interact_looc")
def test_preload_files(self):
    """Run ``single_snp`` with phenotype and covariates loaded beforehand.

    ``pheno`` and ``covar`` come from ``pstpheno``; ``G0`` is given as the
    bed base name itself rather than a reader.  The frame is handed to
    ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_preload_files")
    bed_base = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    bed = Bed(bed_base, count_A1=False)
    out_path = self.file_name("preload_files")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=loaded_pheno,
                       G0=bed_base,
                       covar=loaded_covar,
                       mixing=0,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "one")
def test_noK0(self):
    """Run ``single_snp`` with only ``G1`` supplied and ``mixing=1``.

    The frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_noK0")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("noK0")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G1=bed,
                       covar=cov_fn,
                       mixing=1,
                       leave_out_one_chrom=False,
                       output_file_name=out_path)
    self.compare_files(frame, "one")
def test_two_looc(self):
    """Run ``single_snp`` on every tenth SNP with default kernel settings.

    The frame is handed to ``self.compare_files`` under the name "two_looc".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_two_looc")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("two_looc")

    every_tenth = bed[:, ::10]
    frame = single_snp(every_tenth, phen_fn,
                       covar=cov_fn,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "two_looc")
def test_none(self):
    """Run ``single_snp`` with a ``KernelIdentity`` as ``K0``.

    The frame is handed to ``self.compare_files`` under the name "none".
    """
    logging.info("TestSingleSnp test_none")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("none")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       K0=KernelIdentity(bed.iid),
                       covar=cov_fn,
                       mixing=0,
                       leave_out_one_chrom=False,
                       output_file_name=out_path)
    self.compare_files(frame, "none")
def test_one_looc(self):
    """Run ``single_snp`` on all SNPs with ``mixing=0`` and no explicit kernel.

    The frame is handed to ``self.compare_files`` under the name "one_looc".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_one_looc")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("one_looc")

    frame = single_snp(bed, phen_fn,
                       covar=cov_fn,
                       mixing=0,
                       output_file_name=out_path)
    self.compare_files(frame, "one_looc")
def test_mixing(self):
    """Run ``single_snp`` with ``G0`` and ``G1`` readers and ``mixing=None``.

    The frame is handed to ``self.compare_files`` under the name "mixing".
    """
    logging.info("TestSingleSnp test_mixing")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("mixing")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed[:, 10:100],
                       G1=bed[:, 100:200],
                       covar=cov_fn,
                       mixing=None,
                       leave_out_one_chrom=False,
                       output_file_name=out_path)
    self.compare_files(frame, "mixing")
def execute_fs(test_snps, pheno, G0, covar):
    """
    run feature selection

    Writes the phenotype and the covariates to temporary space-separated
    text files, runs the feature-selection distributable on ``G0`` locally,
    and then runs ``single_snp`` (without and with covariates) using only
    the selected SNPs as ``G0``.

    :param test_snps: SNPs to test; passed straight to ``single_snp``.
    :param pheno: mapping whose ``"vals"`` entry holds the phenotype values.
    :param G0: SNP reader providing ``iid``, ``sid`` and column indexing.
    :param covar: mapping whose ``"vals"`` entry is a 2-D covariate array.
    :return: ``(result, fs_result)`` -- ``result`` maps "fs_all" and
        "fs_all_cov" to p-value arrays sorted by Chr/ChrPos; ``fs_result``
        maps "result_uncond_all" to the raw feature-selection output.

    NOTE(review): ``DataFrame.from_items``, ``DataFrame.sort`` and
    ``as_matrix`` only exist in older pandas releases.
    """
    import os  # local import: only needed for temp-file cleanup

    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)

    try:
        out_data = pd.DataFrame({
            "id1": G0.iid[:, 0],
            "id2": G0.iid[:, 1],
            "y": pheno["vals"]
        })
        out_data.to_csv(out_fn, sep=" ", header=False, index=False)

        # write out covariates
        items = [
            ('id1', G0.iid[:, 0]),
            ('id2', G0.iid[:, 1]),
        ]
        items += [("pc_%i" % i, covar["vals"][:, i])
                  for i in range(covar["vals"].shape[1])]
        cov_df = pd.DataFrame.from_items(items)
        cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

        #TODO: fix include_all!!
        fsd = create_feature_selection_distributable(G0, out_fn, None, 0, "fs_out",
                                                     include_all=False, cov_fn=cov_fn)
        fs_result["result_uncond_all"] = Local().run(fsd)
    finally:
        # The temp files are only inputs to the feature-selection run above;
        # remove them so repeated calls do not litter the working directory.
        for tmp_fn in (out_fn, cov_fn):
            if os.path.exists(tmp_fn):
                os.remove(tmp_fn)

    best_k, best_delta, best_obj, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)
    G_fs = G0[:, fs_idx]

    result["fs_all"] = single_snp(test_snps, pheno, G0=G_fs).sort(
        ["Chr", "ChrPos"])["PValue"].as_matrix()
    result["fs_all_cov"] = single_snp(test_snps, pheno, G0=G_fs, covar=covar).sort(
        ["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
def test_unknown_sid(self):
    """Check that ``single_snp`` raises when ``sid_list`` holds an unknown sid.

    The test passes when the call raises any ``Exception`` and fails with
    an ``AssertionError`` when the call returns normally.
    """
    logging.info("TestSingleSnp test_unknown_sid")
    test_snps = Bed(self.bedbase, count_A1=False)
    pheno = self.phen_fn
    covar = self.cov_fn
    try:
        single_snp(test_snps=test_snps, G0=test_snps, pheno=pheno,
                   leave_out_one_chrom=False, mixing=0, covar=covar,
                   sid_list=['1_4', 'bogus sid', '1_9'], count_A1=False)
    except Exception:
        # Expected.  Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # counted as a pass.
        pass
    else:
        # Explicit raise rather than ``assert`` so the check survives
        # running under ``python -O``.
        raise AssertionError(
            "single_snp did not raise for an unknown sid in sid_list")
def test_two_looc(self):
    """Run ``single_snp`` on every tenth SNP with default kernel settings.

    The frame is handed to ``self.compare_files`` under the name "two_looc".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_two_looc")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("two_looc")

    every_tenth = bed[:, ::10]
    frame = single_snp(every_tenth, phen_fn,
                       covar=cov_fn,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "two_looc")
def test_G0_has_reader(self):
    """Run ``single_snp`` with the Bed reader itself supplied as ``G0``.

    The frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_G0_has_reader")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G0_has_reader")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       covar=cov_fn,
                       mixing=0,
                       output_file_name=out_path)
    self.compare_files(frame, "one")
def test_preload_files(self):
    """Run ``single_snp`` with phenotype and covariates loaded beforehand.

    ``pheno`` and ``covar`` come from ``pstpheno``; ``G0`` is given as the
    bed base name itself rather than a reader.  The frame is handed to
    ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_preload_files")
    from pysnptools.snpreader import Bed

    bed_base = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    bed = Bed(bed_base)
    out_path = self.file_name("preload_files")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=loaded_pheno,
                       G0=bed_base,
                       covar=loaded_covar,
                       mixing=0,
                       output_file_name=out_path)
    self.compare_files(frame, "one")
def test_mixing(self):
    """Run ``single_snp`` with ``G0`` and ``G1`` readers and ``mixing=None``.

    The frame is handed to ``self.compare_files`` under the name "mixing".
    """
    logging.info("TestSingleSnp test_mixing")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("mixing")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed[:, 10:100],
                       G1=bed[:, 100:200],
                       covar=cov_fn,
                       mixing=None,
                       output_file_name=out_path)
    self.compare_files(frame, "mixing")
def test_no_cov(self):
    """Run ``single_snp`` without a ``covar`` argument.

    The frame is handed to ``self.compare_files`` under the name "no_cov".
    """
    logging.info("TestSingleSnp test_no_cov")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    out_path = self.file_name("no_cov")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       mixing=0,
                       leave_out_one_chrom=False,
                       output_file_name=out_path)
    self.compare_files(frame, "no_cov")
def test_no_cov(self):
    """Run ``single_snp`` without a ``covar`` argument.

    The frame is handed to ``self.compare_files`` under the name "no_cov".
    """
    logging.info("TestSingleSnp test_no_cov")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    out_path = self.file_name("no_cov")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       G0=bed,
                       mixing=0,
                       output_file_name=out_path)
    self.compare_files(frame, "no_cov")
def mapper_gather_lots(i_fold_and_pair):
    """Run a one-kernel scan on one fold and score several top-k models.

    Builds a kernel over all of ``G_for_chrom`` standardized on the
    training rows, runs ``single_snp`` on the training rows, and -- unless
    ``i_fold == n_folds`` -- fits a ``FastLMM`` for each k in the k list
    and scores it on the test rows.  Names other than the argument are
    read from the enclosing scope.

    :param i_fold_and_pair: ``(i_fold, (train_idx, test_idx))``.
    :return: ``(k_list_in, top_snps, k_index_to_nLL)``; ``k_list_in`` is
        ``None`` when ``i_fold > 0``, ``top_snps`` is ``None`` unless this
        is the "all" fold, and ``k_index_to_nLL`` is ``None`` when
        ``i_fold == n_folds``.
    """
    i_fold, (train_idx, test_idx) = i_fold_and_pair
    logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

    G_train = G_for_chrom[train_idx, :]

    #Precompute whole x whole standardized on train
    from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
    min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
    block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
    whole_with_train = _SnpWholeWithTrain(whole=G_for_chrom,
                                          train_idx=train_idx,
                                          standardizer=Unit(),
                                          block_size=block_size)
    K_whole_unittrain = whole_with_train.read()

    assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
    K_train = K_whole_unittrain[train_idx]

    # iid intersection means we can give the whole covariate and pheno
    scan = single_snp(test_snps=G_train,
                      K0=K_train,
                      pheno=pheno,
                      covar=covar,
                      leave_out_one_chrom=False,
                      GB_goal=GB_goal,
                      force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      mixing=mixing,
                      h2=h2)

    is_all = n_folds <= 1 or i_fold == n_folds

    k_list_in = [0] + [int(k) for k in k_list if 0 < k < len(scan)]

    top_snps = list(scan.SNP[:max_k]) if is_all else None

    if i_fold == n_folds:
        k_index_to_nLL = None
    else:
        k_index_to_nLL = []
        for k in k_list_in:
            top_k = G_for_chrom[:, G_for_chrom.sid_to_index(scan.SNP[:k])]
            logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

            top_k_train = top_k[train_idx, :] if k > 0 else None
            model = FastLMM(force_full_rank=force_full_rank,
                            force_low_rank=force_low_rank,
                            GB_goal=GB_goal)
            # iid intersection means we can give the whole covariate and pheno
            model.fit(K0_train=K_train, K1_train=top_k_train, X=covar,
                      y=pheno, mixing=mixing, h2raw=h2)

            top_k_test = top_k[test_idx, :] if k > 0 else None
            K0_whole_test = K_whole_unittrain[:, test_idx]
            # iid intersection means we can give the whole covariate and pheno
            nLL = model.score(K0_whole_test=K0_whole_test,
                              K1_whole_test=top_k_test, X=covar, y=pheno)
            k_index_to_nLL.append(nLL)

    if i_fold > 0:
        k_list_in = None

    return k_list_in, top_snps, k_index_to_nLL
def mapper_gather_lots(i_fold_and_pair):
    """Run a one-kernel scan on one fold and score several top-k models.

    Builds a kernel over all of ``G_for_chrom`` standardized on the
    training rows, runs ``single_snp`` on the training rows, and -- unless
    ``i_fold == n_folds`` -- fits a ``FastLMM`` for each k in the k list
    and scores it on the test rows.  Names other than the argument are
    read from the enclosing scope.

    :param i_fold_and_pair: ``(i_fold, (train_idx, test_idx))``.
    :return: ``(k_list_in, top_snps, k_index_to_nLL)``; ``k_list_in`` is
        ``None`` when ``i_fold > 0``, ``top_snps`` is ``None`` unless this
        is the "all" fold, and ``k_index_to_nLL`` is ``None`` when
        ``i_fold == n_folds``.
    """
    i_fold, (train_idx, test_idx) = i_fold_and_pair
    logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

    G_train = G_for_chrom[train_idx, :]

    #Precompute whole x whole standardized on train
    from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
    min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
    block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
    whole_with_train = _SnpWholeWithTrain(whole=G_for_chrom,
                                          train_idx=train_idx,
                                          standardizer=Unit(),
                                          block_size=block_size)
    K_whole_unittrain = whole_with_train.read()

    assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
    K_train = K_whole_unittrain[train_idx]

    # iid intersection means we can give the whole covariate and pheno
    scan = single_snp(test_snps=G_train,
                      K0=K_train,
                      pheno=pheno,
                      covar=covar,
                      leave_out_one_chrom=False,
                      GB_goal=GB_goal,
                      force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      mixing=mixing,
                      h2=h2)

    is_all = n_folds <= 1 or i_fold == n_folds

    k_list_in = [0] + [int(k) for k in k_list if 0 < k < len(scan)]

    top_snps = list(scan.SNP[:max_k]) if is_all else None

    if i_fold == n_folds:
        k_index_to_nLL = None
    else:
        k_index_to_nLL = []
        for k in k_list_in:
            top_k = G_for_chrom[:, G_for_chrom.sid_to_index(scan.SNP[:k])]
            logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

            top_k_train = top_k[train_idx, :] if k > 0 else None
            model = FastLMM(force_full_rank=force_full_rank,
                            force_low_rank=force_low_rank,
                            GB_goal=GB_goal)
            # iid intersection means we can give the whole covariate and pheno
            model.fit(K0_train=K_train, K1_train=top_k_train, X=covar,
                      y=pheno, mixing=mixing, h2=h2)

            top_k_test = top_k[test_idx, :] if k > 0 else None
            K0_whole_test = K_whole_unittrain[:, test_idx]
            # iid intersection means we can give the whole covariate and pheno
            nLL = model.score(K0_whole_test=K0_whole_test,
                              K1_whole_test=top_k_test, X=covar, y=pheno)
            k_index_to_nLL.append(nLL)

    if i_fold > 0:
        k_list_in = None

    return k_list_in, top_snps, k_index_to_nLL
def test_file_cache(self):
    """Call ``single_snp`` twice with identical arguments and a ``cache_file``.

    Any existing cache file is removed first.  The frame from each call is
    handed to ``self.compare_files`` under the name "G1".
    """
    logging.info("TestSingleSnp test_file_cache")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G1")
    npz_path = self.file_name("cache_file") + ".npz"
    # Drop any cache file left behind by an earlier run.
    if os.path.exists(npz_path):
        os.remove(npz_path)

    # Two passes with the very same arguments.
    for _ in (0, 1):
        frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=bed[:, 10:100],
                           G1=bed[:, 100:200],
                           covar=cov_fn,
                           mixing=.5,
                           leave_out_one_chrom=False,
                           output_file_name=out_path,
                           cache_file=npz_path,
                           count_A1=False)
        self.compare_files(frame, "G1")
def test_G1(self):
    """Run ``single_snp`` with ``G0``+``G1`` under three rank-forcing settings.

    The (force_full_rank, force_low_rank) pairs tried are (False, True),
    (False, False) and (True, False).  Every frame is handed to
    ``self.compare_files`` under the name "G1".
    """
    logging.info("TestSingleSnp test_G1")
    bed = Bed(self.bedbase, count_A1=False)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("G1")

    rank_settings = [(False, True), (False, False), (True, False)]
    for full_rank, low_rank in rank_settings:
        logging.info("{0},{1}".format(full_rank, low_rank))
        frame = single_snp(test_snps=bed[:, :10],
                           pheno=phen_fn,
                           G0=bed[:, 10:100],
                           G1=bed[:, 100:200],
                           covar=cov_fn,
                           mixing=.5,
                           leave_out_one_chrom=False,
                           force_full_rank=full_rank,
                           force_low_rank=low_rank,
                           output_file_name=out_path,
                           count_A1=False)
        self.compare_files(frame, "G1")
def test_SNC(self):
    """Run ``single_snp`` on data in which SNP #2 has been made constant.

    The Bed data is read into memory and column 2 is overwritten with
    zeros before the call.  The frame is handed to ``self.compare_files``
    under the name "snc".
    """
    logging.info("TestSNC")
    bed_base = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    bed = Bed(bed_base, count_A1=False)

    snc = bed.read()
    # make SNP #2 have constant values (aka a SNC)
    snc.val[:, 2] = [0] * snc.iid_count

    out_path = self.file_name("snc")
    frame = single_snp(test_snps=snc[:, :10],
                       pheno=loaded_pheno,
                       G0=snc,
                       covar=loaded_covar,
                       mixing=0,
                       leave_out_one_chrom=False,
                       output_file_name=out_path,
                       count_A1=False)
    self.compare_files(frame, "snc")
def test_covar_by_chrom_mixing(self):
    """Run ``single_snp`` with both ``covar`` and ``covar_by_chrom``.

    ``covar`` is the covariate file re-wrapped as ``SnpData`` with the
    single sid "pheno-1"; ``covar_by_chrom`` maps chromosomes 1..5 to the
    covariate file name.  The frame is handed to ``self.compare_files``
    under the name "covar_by_chrom_mixing".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
    test_snps = Bed(self.bedbase, count_A1=False)
    pheno = self.phen_fn
    # (the original first bound covar to self.cov_fn and overwrote it on the
    # next line; that dead store is gone)
    covar = Pheno(self.cov_fn).read()
    covar = SnpData(iid=covar.iid, sid=["pheno-1"], val=covar.val)
    # range() instead of xrange() so this also runs on Python 3.
    covar_by_chrom = {chrom: self.cov_fn for chrom in range(1, 6)}
    output_file = self.file_name("covar_by_chrom_mixing")
    frame = single_snp(test_snps, pheno,
                       covar=covar,
                       covar_by_chrom=covar_by_chrom,
                       output_file_name=output_file,
                       count_A1=False)
    self.compare_files(frame, "covar_by_chrom_mixing")
def test_other(self):
    """Run ``single_snp`` with the Bed reader given only as ``K1``.

    The frame is handed to ``self.compare_files`` under the name "one".
    """
    logging.info("TestSingleSnp test_other")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("other")

    frame = single_snp(test_snps=bed[:, :10],
                       pheno=phen_fn,
                       K1=bed,
                       covar=cov_fn,
                       leave_out_one_chrom=False,
                       output_file_name=out_path)
    self.compare_files(frame, "one")
def test_interact_looc(self):
    """Run ``single_snp`` on all SNPs with ``interact_with_snp=0``.

    ``leave_out_one_chrom`` is not passed.  The frame is handed to
    ``self.compare_files`` under the name "interact_looc".
    """
    logging.info("TestSingleSnpLeaveOutOneChrom test_interact_looc")
    bed = Bed(self.bedbase)
    phen_fn = self.phen_fn
    cov_fn = self.cov_fn
    out_path = self.file_name("interact_looc")

    frame = single_snp(bed, phen_fn,
                       covar=cov_fn,
                       mixing=0,
                       interact_with_snp=0,
                       output_file_name=out_path)
    self.compare_files(frame, "interact_looc")
def test_old_one(self):
    """Run ``single_snp`` on the chromosome-3 SNPs of ``self.bed`` with ``self.bed`` as ``K0``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "old_one".
    """
    logging.info("test_old_one")
    out_fn = self.file_name("old_one")

    # Test only on chromosome 3: select SNPs whose first ``pos`` column is 3.
    on_chrom3 = self.bed.pos[:, 0] == 3
    chrom3_snps = self.bed[:, on_chrom3]

    frame = single_snp(test_snps=chrom3_snps,
                       K0=self.bed,
                       pheno=self.phen_fn,
                       covar=self.cov_fn,
                       count_A1=True,
                       output_file_name=out_fn)
    self.compare_files(frame, "old_one")
def test_none(self):
    """Run ``single_snp`` with a ``KernelIdentity`` over the Bed iids as ``K0``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "none".
    """
    logging.info("TestSingleSnp test_none")
    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("none")

    first_ten = bed[:, :10]
    identity_kernel = KernelIdentity(bed.iid)
    result = single_snp(
        test_snps=first_ten,
        pheno=pheno_fn,
        mixing=0,
        leave_out_one_chrom=False,
        K0=identity_kernel,
        covar=covar_fn,
        output_file_name=out_fn,
    )
    self.compare_files(result, "none")
def test_covar_by_chrom_mixing(self):
    """Run ``single_snp`` with both a global covariate and per-chromosome covariates.

    The global covariate is read from ``self.cov_fn`` and re-wrapped as a
    ``SnpData`` whose single column is named "pheno-1". ``covar_by_chrom``
    maps each of chromosomes 1 through 5 to the same covariate file. The
    returned frame is handed to ``self.compare_files`` under the reference
    name "covar_by_chrom_mixing".
    """
    logging.info(
        "TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
    test_snps = Bed(self.bedbase)
    pheno = self.phen_fn

    # Load the covariate values, then rebuild them with an explicit sid.
    covar = Pheno(self.cov_fn).read()
    covar = SnpData(iid=covar.iid, sid=["pheno-1"], val=covar.val)

    # ``range`` (not the Python-2-only ``xrange``) keeps this runnable on
    # both Python 2 and Python 3.
    covar_by_chrom = {chrom: self.cov_fn for chrom in range(1, 6)}

    output_file = self.file_name("covar_by_chrom_mixing")
    frame = single_snp(test_snps,
                       pheno,
                       covar=covar,
                       covar_by_chrom=covar_by_chrom,
                       output_file_name=output_file)
    self.compare_files(frame, "covar_by_chrom_mixing")
def test_G0_has_reader(self):
    """Run ``single_snp`` with the Bed reader itself as ``G0`` and ``mixing=0``.

    ``leave_out_one_chrom`` is not passed, so the callee's default applies.
    The returned frame is handed to ``self.compare_files`` under the
    reference name "one".
    """
    logging.info("TestSingleSnp test_G0_has_reader")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("G0_has_reader")

    first_ten = bed[:, :10]
    result = single_snp(
        test_snps=first_ten,
        pheno=pheno_fn,
        G0=bed,
        covar=covar_fn,
        mixing=0,
        output_file_name=out_fn,
    )
    self.compare_files(result, "one")
def test_preload_files(self):
    """Run ``single_snp`` with phenotype and covariates loaded up front via ``pstpheno``.

    ``G0`` is given as the Bed base path (``self.bedbase``) rather than a
    reader, while the test SNPs are sliced from a ``Bed`` reader. The
    returned frame is handed to ``self.compare_files`` under the reference
    name "one".
    """
    logging.info("TestSingleSnp test_preload_files")
    bed_path = self.bedbase
    pheno_data = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar_data = pstpheno.loadPhen(self.cov_fn)
    reader = Bed(bed_path)
    out_fn = self.file_name("preload_files")

    result = single_snp(
        test_snps=reader[:, :10],
        pheno=pheno_data,
        G0=bed_path,
        mixing=0,
        leave_out_one_chrom=False,
        covar=covar_data,
        output_file_name=out_fn,
    )
    self.compare_files(result, "one")
def test_noK0(self):
    """Run ``single_snp`` with no K0/G0, the full SNP set as ``G1`` and ``mixing=1``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "one".
    """
    logging.info("TestSingleSnp test_noK0")
    bed = Bed(self.bedbase, count_A1=False)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("noK0")

    first_ten = bed[:, :10]
    result = single_snp(
        test_snps=first_ten,
        pheno=pheno_fn,
        mixing=1,
        leave_out_one_chrom=False,
        G1=bed,
        covar=covar_fn,
        output_file_name=out_fn,
        count_A1=False,
    )
    self.compare_files(result, "one")
def test_G1(self):
    """Run ``single_snp`` with separate SNP slices as ``G0`` and ``G1`` and ``mixing=.5``.

    ``leave_out_one_chrom`` is not passed, so the callee's default applies.
    The returned frame is handed to ``self.compare_files`` under the
    reference name "G1".
    """
    logging.info("TestSingleSnp test_G1")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("G1")

    # Three disjoint column ranges: tested SNPs, first and second kernel SNPs.
    tested = bed[:, :10]
    g0_snps = bed[:, 10:100]
    g1_snps = bed[:, 100:200]

    result = single_snp(
        test_snps=tested,
        pheno=pheno_fn,
        G0=g0_snps,
        covar=covar_fn,
        G1=g1_snps,
        mixing=0.5,
        output_file_name=out_fn,
    )
    self.compare_files(result, "G1")
def test_mixingKs(self):
    """Run ``single_snp`` with two ``SnpKernel`` inputs and ``mixing=None``.

    ``K0`` and ``K1`` wrap different SNP slices, each with its own ``Unit``
    standardizer. The returned frame is handed to ``self.compare_files``
    under the reference name "mixing".
    """
    logging.info("TestSingleSnp test_mixingKs")
    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("mixingKs")

    tested = bed[:, :10]
    kernel0 = SnpKernel(bed[:, 10:100], Unit())
    kernel1 = SnpKernel(bed[:, 100:200], Unit())

    result = single_snp(
        test_snps=tested,
        pheno=pheno_fn,
        K0=kernel0,
        leave_out_one_chrom=False,
        covar=covar_fn,
        K1=kernel1,
        mixing=None,
        output_file_name=out_fn,
    )
    self.compare_files(result, "mixing")
def test_mixid(self):
    """Run ``single_snp`` with a SNP slice as ``G0``, a ``KernelIdentity`` as ``K1`` and ``mixing=.25``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "mixid".
    """
    logging.info("TestSingleSnp test_mixid")
    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("mixid")

    tested = bed[:, :10]
    g0_snps = bed[:, 10:100]
    identity_kernel = KernelIdentity(bed.iid)

    result = single_snp(
        test_snps=tested,
        pheno=pheno_fn,
        G0=g0_snps,
        leave_out_one_chrom=False,
        covar=covar_fn,
        K1=identity_kernel,
        mixing=0.25,
        output_file_name=out_fn,
    )
    self.compare_files(result, "mixid")
def test_interact(self):
    """Run ``single_snp`` with ``interact_with_snp=1``, the full SNP set as ``G0`` and ``mixing=0``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "interact".
    """
    logging.info("TestSingleSnp test_interact")
    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("interact")

    first_ten = bed[:, :10]
    result = single_snp(
        test_snps=first_ten,
        pheno=pheno_fn,
        mixing=0,
        leave_out_one_chrom=False,
        G0=bed,
        covar=covar_fn,
        interact_with_snp=1,
        output_file_name=out_fn,
    )
    self.compare_files(result, "interact")
def test_G1_mixing(self):
    """Run ``single_snp`` with both ``G0`` and ``G1`` given but ``mixing=0``.

    The returned frame is handed to ``self.compare_files`` under the
    reference name "one".
    """
    logging.info("TestSingleSnp test_G1_mixing")
    bed = Bed(self.bedbase)
    pheno_fn = self.phen_fn
    covar_fn = self.cov_fn
    out_fn = self.file_name("G1_mixing")

    tested = bed[:, :10]
    g1_snps = bed[:, 100:200]

    result = single_snp(
        test_snps=tested,
        pheno=pheno_fn,
        G0=bed,
        leave_out_one_chrom=False,
        covar=covar_fn,
        G1=g1_snps,
        mixing=0,
        output_file_name=out_fn,
    )
    self.compare_files(result, "one")
def epi_reml(pair_snps, pheno, covar=None, kernel_snps=None,
             output_dir='results', part_count=33, runner=None,
             override=False):
    """Run ``single_snp`` over every pair of SNP partitions, one TSV file per pair.

    ``pair_snps`` is split into ``part_count`` parts with ``split_on_sids``.
    For each unordered pair of parts ``(i, j)`` with ``i <= j`` a ``_Pairs2``
    object is built and passed to ``single_snp`` as the test SNPs, and the
    resulting frame is written to
    ``<output_dir>/result.<pair_index>.<pair_count>.tsv``.

    :param pair_snps: SNPs from which the pairs are formed.
    :param pheno: phenotype, passed through to ``single_snp``.
    :param covar: covariates, passed through to ``single_snp``.
    :param kernel_snps: SNPs used to build the kernel ``K0``; when falsy,
        ``pair_snps`` is used instead.
    :param output_dir: directory for the result files; created if missing.
    :param part_count: number of partitions of ``pair_snps``.
    :param runner: passed through to ``single_snp``.
    :param override: when true, recompute pairs whose result file already
        exists; otherwise such pairs are skipped.
    :returns: None. Results are written to disk and progress is printed.
    """
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.standardizer import Unit
    import datetime
    from fastlmm.association import single_snp

    part_list = list(split_on_sids(pair_snps, part_count))

    # Number of unordered pairs including self-pairs: n*(n+1)/2. n*n+n is
    # always even, so floor division is exact and keeps the count an int on
    # Python 3. True division would give a float and put "561.0" into the
    # progress text and the output file names.
    part_pair_count = (part_count * part_count + part_count) // 2
    part_pair_index = -1
    print("part_pair_count={0:,}".format(part_pair_count))

    # Precompute the similarity once; it is reused for every pair.
    K0 = SnpKernel(kernel_snps or pair_snps, standardizer=Unit()).read()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    start_time = datetime.datetime.now()
    for i in range(part_count):
        part_i = part_list[i]
        for j in range(i, part_count):
            part_pair_index += 1
            # A part paired with itself takes the one-argument form.
            if i == j:
                pairs = _Pairs2(part_i)
            else:
                pairs = _Pairs2(part_i, part_list[j])
            print("Looking at pair {0},{1} which is {2} of {3}".format(
                i, j, part_pair_index, part_pair_count))
            output_file = '{0}/result.{1}.{2}.tsv'.format(
                output_dir, part_pair_index, part_pair_count)

            # Existing result files act as a cache unless override is set.
            if override or not os.path.exists(output_file):
                result_df_ij = single_snp(pairs, K0=K0, pheno=pheno,
                                          covar=covar,
                                          leave_out_one_chrom=False,
                                          count_A1=True, runner=runner)
                result_df_ij.to_csv(output_file, sep="\t", index=False)
                print(result_df_ij[:1])

                # Linear extrapolation of the total run time from the pairs
                # handled so far.
                time_so_far = datetime.datetime.now() - start_time
                total_time_estimate = time_so_far * part_pair_count / (
                    part_pair_index + 1)
                print(total_time_estimate)