def test_match_cpp(self):
    '''
    match
        FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100
    '''
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
    sim_idx = snps.sid_to_index(sim_sid)
    test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
    test_idx = snps.sid_to_index(test_sid)

    for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)), (KernelIdentity(snps.iid), snps[:, sim_idx])]:
        frame_h2 = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1, covar=covar,
                              h2=.5, leave_out_one_chrom=False, count_A1=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1, covar=covar,
                                     log_delta=0, leave_out_one_chrom=False, count_A1=False)
        for frame in [frame_h2, frame_log_delta]:
            referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
            reference = pd.read_table(referenceOutfile, sep="\t")  # We've manually removed all comments and blank lines from this file
            assert len(frame) == len(reference)
            for _, row in reference.iterrows():
                sid = row.SNP
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalues differ too much {1} -- {2} vs {3}".format(sid, reldiff, row.Pvalue, pvalue)
def main(args):
    print('reading seed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices, 2]
    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        # find first typed snp after query snp
        snp_bp = data.col_property[i, 2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices) - 1

        # the two individuals carrying the minor allele at the query snp
        n1, n2 = np.where(X[:, i] == 1)[0]
        if (X[n1, typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(
            X[n1, typed_snps_indices],
            X[n2, typed_snps_indices],
            typed_i)
        typed_ir -= 1
        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]

        cM = data.col_property[ir, 1] - \
            data.col_property[il, 1]
        ibd = (np.mean(X[n1, il:ir] == X[n2, il:ir]) > 0.99)
        return cM, int(ibd)

    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):  # total=10):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
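# --- Hedged example (not from the original source): a minimal sketch of the
# command-line wiring that main() above assumes. The destination names
# (seed_snps, typed_snps, bfile, outfile) are exactly the attributes main()
# reads; the flag spellings and comments are guesses.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed-snps', dest='seed_snps')    # text file, one seed SNP id per line
    parser.add_argument('--typed-snps', dest='typed_snps')  # text file, one typed SNP id per line
    parser.add_argument('--bfile')                          # plink bed/bim/fam prefix
    parser.add_argument('--outfile')                        # tab-separated output path
    main(parser.parse_args())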
def __init__(self, args):
    if args.window_type not in ['BP', 'SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile)
    af1 = self.get_allele_frequency(bed_1, args)
    print(len(af1), "SNPs in file 1")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    if (args.from_bp is not None) and (args.to_bp is not None):
        k = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k
    snps_to_use = bed_1.sid[snps_1]
    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af = af1[bed_1_index]
    if args.afile is not None:
        a1 = pd.read_table(args.afile, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a1 = None
    self.af = af
    self.M = len(bed_1_index)
    self.windows = self.get_windows(pos, args)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.scores = self.compute(bed_1, bed_1_index, af, a1, args)
def __init__(self, args):
    if args.window_type not in ['KBP', 'SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile, count_A1=False)
    af1 = self.get_allele_frequency(bed_1, args)
    print(len(af1), "SNPs in file 1")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    if (args.from_bp is not None) and (args.to_bp is not None):
        k = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k
    snps_to_use = bed_1.sid[snps_1]
    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af = af1[bed_1_index]
    # if args.afile is not None:
    #     a1 = pd.read_table(args.afile,header=None,sep='\s*',
    #                        names=['id1','id2','theta'])
    # else:
    a1 = None
    self.af = af
    self.M = len(bed_1_index)
    self.windows = self.get_windows(pos, args)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.scores = self.compute(bed_1, bed_1_index, af, a1, args)
def __init__(self, args):
    if args.window_type not in ['KBP', 'SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile, count_A1=False)
    af1 = self.get_allele_frequency(bed_1, args)
    print(len(af1), "SNPs in file 1")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    # Omit SNPs with NA values for h2weight
    if args.h2weight:
        data = pd.read_table(args.bfile + '.h2weight', header=None,
                             names=['SNP', 'h2weight'], index_col=False)
        if (len(data['SNP']) != len(bed_1.sid)
                or (data['SNP'] == bed_1.sid).min() == False):
            raise ValueError('SNPs disagree between bed/bim/fam and h2weight files')
        h2weight = data['h2weight']
        snps_1 = snps_1 & ~h2weight.isnull().values
        print(np.sum(snps_1), "SNPs in file 1 after extracting non-NA h2weight")
        del data
    if (args.from_bp is not None) and (args.to_bp is not None):
        k = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k
    snps_to_use = bed_1.sid[snps_1]
    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af = af1[bed_1_index]
    # if args.afile is not None:
    #     a1 = pd.read_table(args.afile,header=None,sep='\s*',
    #                        names=['id1','id2','theta'])
    # else:
    a1 = None
    try:
        h2weight = h2weight[bed_1_index].values
    except NameError:
        h2weight = None
    self.af = af
    self.M = len(bed_1_index)
    self.windows = self.get_windows(pos, args)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.scores = self.compute(bed_1, bed_1_index, af, a1, h2weight, args)
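# --- Hedged example (not from the original source): the two-column, headerless
# layout that the pd.read_table call above expects for the .h2weight file --
# one row per SNP in .bim order, with NaN allowed (NaN rows are dropped by the
# ~h2weight.isnull() mask). The SNP ids and weights below are made up.
import numpy as np
import pandas as pd

sids = ['rs1', 'rs2', 'rs3']   # hypothetical SNP ids, in .bim order
weights = [0.5, np.nan, 1.2]   # per-SNP h2 weights; NaN marks a missing weight
pd.DataFrame({'SNP': sids, 'h2weight': weights}).to_csv(
    'example.h2weight', sep='\t', header=False, index=False)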
def test_match_cpp(self):
    '''
    match
        FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100
    '''
    logging.info("TestEpistasis test_match_cpp")
    from pysnptools.snpreader import Bed
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
    sim_idx = snps.sid_to_index(sim_sid)
    test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
    test_idx = snps.sid_to_index(test_sid)

    frame = epistasis(snps[:, test_idx], pheno, covar=covar, G0=snps[:, sim_idx], log_delta=0)
    sid0, sid1, pvalue_list = np.array(frame['SNP0']), np.array(frame['SNP1']), np.array(frame['PValue'])

    referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")
    import pandas as pd
    table = pd.read_table(referenceOutfile, sep="\t")  # We've manually removed all comments and blank lines from this file
    assert len(pvalue_list) == len(table)
    for row in table.iterrows():
        snp0cpp, snp1cpp, pvaluecpp, i1, i2 = row[1]
        found = False
        for i in xrange(len(pvalue_list)):
            pvaluepy = pvalue_list[i]
            snp0py = sid0[i]
            snp1py = sid1[i]
            if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                found = True
                diff = abs(pvaluecpp - pvaluepy) / pvaluecpp
                assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp, snp1cpp, pvaluecpp, pvaluepy, diff)
                break
        assert found
class _Epistasis(object):  # implements IDistributable

    def __init__(self, test_snps, pheno, G0, G1=None, mixing=0.0, covar=None, sid_list_0=None, sid_list_1=None,
                 log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None):
        self.test_snps = test_snps
        self.pheno = pheno
        self.output_file_or_none = output_file
        self.cache_file = cache_file
        self.covar = covar
        self.sid_list_0 = sid_list_0
        self.sid_list_1 = sid_list_1
        self.G0 = G0
        self.G1_or_none = G1
        self.mixing = mixing
        self.external_log_delta = log_delta
        self.min_log_delta = min_log_delta
        self.max_log_delta = max_log_delta
        self._ran_once = False
        self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1={5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format(
            self.__class__.__name__, self.test_snps, self.pheno, self.covar, self.sid_list_0, self.sid_list_1,
            self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta,
            self.max_log_delta, output_file, cache_file)
        self.block_size = 1000

    def set_sid_sets(self):
        sid_set_0 = set(self.sid_list_0)
        self.intersect = sid_set_0.intersection(self.sid_list_1)
        self.just_sid_0 = sid_set_0.difference(self.intersect)
        self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1)
        self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + \
            len(self.intersect)*len(self.just_sid_1) + len(self.intersect)*(len(self.intersect)-1)//2
        self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply(
            [self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none])  # should put G0 and G1 first

    def _run_once(self):
        if self._ran_once:
            return
        self._ran_once = None
        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)
        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)
        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno, vectorize=True)  #!! what about missing=-9?
        if self.covar is not None and isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar)  #!! what about missing=-9?
        if self.G1_or_none is not None and isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)
        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid
        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid
        self.set_sid_sets()
        #!!Should fix up to add only if no constant columns - will need to add a test case for this
        if self.covar is None:
            self.covar = np.ones((self.test_snps.iid_count, 1))
        else:
            self.covar = np.hstack((self.covar['vals'], np.ones((self.test_snps.iid_count, 1))))
        self.n_cov = self.covar.shape[1]
        if self.output_file_or_none is None:
            self.__tempdirectory = ".working"
        else:
            self.__tempdirectory = self.output_file_or_none + ".working"
        self._ran_once = True

    # start of IDistributable interface--------------------------------------

    @property
    def work_count(self):
        self._run_once()
        block_count = self.div_ceil(self._pair_count, self.block_size)
        return block_count

    def work_sequence(self):
        self._run_once()
        return self.work_sequence_range(0, self.work_count)

    def work_sequence_range(self, start, end):
        self._run_once()
        lmm = self.lmm_from_cache_file()
        lmm.sety(self.pheno['vals'])
        for sid0_list, sid1_list in self.pair_block_sequence_range(start, end):
            yield lambda lmm=lmm, sid0_list=sid0_list, sid1_list=sid1_list: self.do_work(lmm, sid0_list, sid1_list)  # the 'lmm=lmm,...' is needed to get around a strangeness in Python

    def reduce(self, result_sequence):  # doesn't need "run_once()"
        frame = pd.concat(result_sequence)
        frame.sort("PValue", inplace=True)
        frame.index = np.arange(len(frame))
        if self.output_file_or_none is not None:
            frame.to_csv(self.output_file_or_none, sep="\t", index=False)
        return frame

        #!!Find a place to output info like this near the end of the run
        #logging.info("PhenotypeName\t{0}".format(pheno['header']))
        #logging.info("SampleSize\t{0}".format(test_snps.iid_count))
        #logging.info("SNPCount\t{0}".format(test_snps.sid_count))
        #logging.info("Runtime\t{0}".format(time.time()-t0))

    @property
    def tempdirectory(self):
        self._run_once()
        return self.__tempdirectory

    # optional override -- the str name of the instance is used by the cluster as the job name
    def __str__(self):  # Doesn't need run_once
        return self._str

    def copyinputs(self, copier):
        self._run_once()
        if isinstance(self.test_snps, str):
            copier.input(self.test_snps + ".bed")
            copier.input(self.test_snps + ".bim")
            copier.input(self.test_snps + ".fam")
        else:
            copier.input(self.test_snps)
        copier.input(self.pheno)
        copier.input(self.covar)
        if isinstance(self.G0, str):
            copier.input(self.G0 + ".bed")
            copier.input(self.G0 + ".bim")
            copier.input(self.G0 + ".fam")
        else:
            copier.input(self.G0)
        copier.input(self.G1_or_none)
        copier.input(self.cache_file)

    def copyoutputs(self, copier):  # Doesn't need run_once
        copier.output(self.output_file_or_none)

    # end of IDistributable interface---------------------------------------

    @staticmethod
    def div_ceil(num, den):  #!!move to utils?
        return -(-num//den)  # The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats.

    def pair_block_sequence_range(self, block_start, block_end):
        self._run_once()
        assert 0 <= block_start and block_start <= block_end and block_end <= self.work_count, "real assert"

        block_index = block_start
        start = block_index * self.pair_count // self.work_count
        next_start = (block_index+1) * self.pair_count // self.work_count
        size_goal = next_start - start
        end = block_end * self.pair_count // self.work_count

        sid0_list = []
        sid1_list = []
        for sid0, sid1 in self.pair_sequence_range(start, end):
            sid0_list.append(sid0)
            sid1_list.append(sid1)
            if len(sid0_list) == size_goal:
                yield sid0_list, sid1_list
                block_index += 1
                if block_index == block_end:
                    return
                sid0_list = []
                sid1_list = []
                start = next_start
                next_start = (block_index+1) * self.pair_count // self.work_count
                size_goal = next_start - start
        assert len(sid0_list) == 0, "real assert"

    # If start == end, then returns without yielding anything
    def pair_sequence_range(self, start, end):
        self._run_once()
        assert 0 <= start and start <= end and end <= self._pair_count, "real assert"

        i = start
        for sid0, sid1 in self.pair_sequence_with_start(start):
            yield sid0, sid1
            i = i + 1
            if i == end:
                break
        assert i == end, "Not enough items found. Didn't get to the end"

    def pair_sequence_with_start(self, start):
        self._run_once()

        skip_ref = [start]
        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"

    def combo_distinct(self, distinct__list0, distinct__list1, skip_ref):
        row_count = len(distinct__list0)
        col_count = len(distinct__list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >= 0, "real assert"

        for row_index in xrange(row_start, row_count):
            sid0 = distinct__list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in xrange(col_start, col_count):
                sid1 = distinct__list1[col_index]
                yield sid0, sid1

    def combo_same(self, list, skip_ref):
        count = len(list)
        full_size = count * (count + 1) // 2
        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = int((-1 + 2*count - np.sqrt(1 - 4*count + 4*count**2 - 8*skip_ref[0]))/2)
        skip_ref[0] = skip_ref[0] - (count*row_start - (row_start*(1 + row_start))//2)
        assert skip_ref[0] >= 0, "real assert"

        for row_index in xrange(row_start, count):
            sid0 = list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in xrange(col_start + 1 + row_index, count):
                sid1 = list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1

    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()

        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file):  # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            np.savez(self.cache_file, lmm.U, lmm.S)  # using np.savez instead of pickle because it seems to be faster to read and write

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()

            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            # log delta is used here. Might be better to use findH2, but if so we would need to normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta)  #!!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))

    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1', 'PValue', 'NullLogLike', 'AltLogLike')
        )
        #!!Is this the only way to set types in a dataframe?
        dataframe['Chr0'] = dataframe['Chr0'].astype(np.float)
        dataframe['GenDist0'] = dataframe['GenDist0'].astype(np.float)
        dataframe['ChrPos0'] = dataframe['ChrPos0'].astype(np.float)
        dataframe['Chr1'] = dataframe['Chr1'].astype(np.float)
        dataframe['GenDist1'] = dataframe['GenDist1'].astype(np.float)
        dataframe['ChrPos1'] = dataframe['ChrPos1'].astype(np.float)
        dataframe['PValue'] = dataframe['PValue'].astype(np.float)
        dataframe['NullLogLike'] = dataframe['NullLogLike'].astype(np.float)
        dataframe['AltLogLike'] = dataframe['AltLogLike'].astype(np.float)

        # This is some of the code for a different way that reads and dot-products 50% more, but does less copying. Seems about the same speed
        #sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        #sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        #sid_index_union_dict = {}
        #sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        #sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        #snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        #snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:, sid_union_index_list].read().standardize()
        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        products = snps_read.val[:, sid0_index_list] * snps_read.val[:, sid1_index_list]  # in the products matrix, each column i is the elementwise product of sid i in each list
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if (k < N):
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index])  # index to product
            index_list = index_list + len(sid_union_index_list)  # Shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index, sid1_index]), index_list))  # index to sid0 and sid1
            index_list = index_list + self.covar.shape[1]  # Shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]), index_list))  # indexes of the covar
            index_list_less_product = index_list[:-1]  # index to everything but the product

            # Null -- the two additive SNPs
            lmm.X = X[:, index_list_less_product]
            lmm.UX = UX[:, index_list_less_product]
            if (k < N):
                lmm.UUX = UUX[:, index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            # Alt -- now with the product feature
            lmm.X = X[:, index_list]
            lmm.UX = UX[:, index_list]
            if (k < N):
                lmm.UUX = UUX[:, index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0, sid1, ll_null, ll_alt, pvalue))
            dataframe.iloc[pair_index] = [
                sid0, snps_read.pos[sid0_index, 0], snps_read.pos[sid0_index, 1], snps_read.pos[sid0_index, 2],
                sid1, snps_read.pos[sid1_index, 0], snps_read.pos[sid1_index, 1], snps_read.pos[sid1_index, 2],
                pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count, self.do_pair_time - start))
        return dataframe
def __init__(self, args):
    if args.window_type not in ['BP', 'SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile1)
    bed_2 = Bed(args.bfile2)
    af1 = self.get_allele_frequency(bed_1, args)
    af2 = self.get_allele_frequency(bed_2, args)
    print(len(af1), "SNPs in file 1")
    print(len(af2), "SNPs in file 2")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    snps_2 = (af2 > args.maf) & (af2 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    print(np.sum(snps_2), "SNPs in file 2 after MAF filter")
    if (args.from_bp is not None) and (args.to_bp is not None):
        k1 = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        k2 = (bed_2.pos[:, 2] > args.from_bp) & (bed_2.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k1
        snps_2 = snps_2 & k2
    snps_to_use = np.intersect1d(bed_1.sid[snps_1], bed_2.sid[snps_2])
    print(len(snps_to_use), "SNPs common in both populations")
    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        print(len(keep), "SNPs to extract")
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
    if not args.no_align:
        alignment, bed_1_index, bed_2_index = \
            self.align_alleles(bed_1, bed_1_index, af1, bed_2, bed_2_index, af2)
    else:
        alignment = np.ones(len(bed_1_index))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af1 = af1[bed_1_index]
    af2 = af2[bed_2_index]
    if args.afile1 is not None:
        a1 = pd.read_table(args.afile1, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a1 = None
    if args.afile2 is not None:
        a2 = pd.read_table(args.afile2, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a2 = None
    self.af1 = af1
    self.af2 = af2
    self.M = len(bed_1_index)
    self.N = (bed_1.iid_count, bed_2.iid_count)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.windows = self.get_windows(pos, args)
    self.scores1 = self.compute(bed_1, bed_1_index, af1, a1, args)
    self.scores2 = self.compute(bed_2, bed_2_index, af2, a2, args)
    self.scoresX = self.compute2(bed_1, bed_1_index, bed_2, bed_2_index,
                                 alignment, a1, a2, args)
def test_match_cpp(self):
    '''
    match
        FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100
    '''
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23", "snp48753_m0_.4m1_.4",
        "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05", "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
        "snp11253_m0_.2m1_.2", "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15", "snp63750_m0_.28m1_.28",
        "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35", "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37",
        "snp15002_m0_.11m1_.11", "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2", "snp60001_m0_.39m1_.39",
        "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2", "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13"
    ]
    sim_idx = snps.sid_to_index(sim_sid)
    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28", "snp48753_m0_.4m1_.4",
        "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05", "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
        "snp86250_m0_.33m1_.33", "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15", "snp3753_m0_.23m1_.23",
        "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39", "snp30002_m0_.25m1_.25"
    ]
    test_idx = snps.sid_to_index(test_sid)

    for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)),
                   (KernelIdentity(snps.iid), snps[:, sim_idx])]:
        frame_h2 = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                              covar=covar, h2=.5, leave_out_one_chrom=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                                     covar=covar, log_delta=0, leave_out_one_chrom=False)
        for frame in [frame_h2, frame_log_delta]:
            referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
            reference = pd.read_table(referenceOutfile, sep="\t")  # We've manually removed all comments and blank lines from this file
            assert len(frame) == len(reference)
            for _, row in reference.iterrows():
                sid = row.SNP
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalues differ too much {1} -- {2} vs {3}".format(sid, reldiff, row.Pvalue, pvalue)
def unbalance_condition_longwas_trans(data_file, id, tpoint, trait, bed_file, kin_file, var_com, condition_snp,
                                      snp_lst=None, tfix=None, fix=None, forder=3, aorder=3, porder=3,
                                      na_method='omit', prefix_outfile='unbalance_condition_longwas_trans'):
    """
    The longitudinal GWAS for unbalanced data, treating the SNP as a time-varied random effect and
    conditioning on a given SNP.

    :param data_file: the data file. The first row contains the variate names, whose first character is
        alphabetical. For class variates the first letter must be capital; for covariates (continuous
        variates) the first letter must be lowercase.
    :param id: a class variate name which indicates the individual id column in the data file.
    :param tpoint: a covariate name which indicates the time point column in the data file.
    :param trait: a variate name which indicates the analyzed trait column in the data file.
    :param bed_file: the prefix for the plink binary file.
    :param kin_file: the file for the genomic relationship matrix. This file can be produced by the
        gmat.gmatrix.agmat function using agmat(bed_file, inv=True, small_val=0.001, out_fmt='id_id_val').
    :param var_com: the variance parameters estimated by the gmat.longwas.unbalance.unbalance_varcom function.
    :param condition_snp: the conditional SNP.
    :param snp_lst: the snp list to test. Default is None (test all SNPs).
    :param tfix: a class variate name for the time-varied fixed effect. Default is None. Only one
        time-varied fixed effect can be included in the current version.
    :param fix: expression for the time-independent fixed effect. Default is None.
        An example: fix = "Sex + age + Season".
    :param forder: the order of Legendre polynomials for the time-varied fixed effect. Default is 3.
    :param aorder: the order of Legendre polynomials for the additive genetic effect. Default is 3.
    :param porder: the order of Legendre polynomials for the permanent environment effect. Default is 3.
    :param na_method: the method to deal with missing values. Default is 'omit'. 'omit' deletes rows
        with missing values; 'include' fills missing values with the adjacent values.
    :param prefix_outfile: the prefix for the output file. Default is 'unbalance_condition_longwas_trans'.
    :return: a pandas data frame for the test result.
    """
    logging.info('################################')
    logging.info('###Prepare the related matrix###')
    logging.info('################################')
    if var_com.shape[0] != aorder * (aorder + 1) / 2 + aorder + 1 + porder * (porder + 1) / 2 + porder + 1 + 1:
        logging.info('ERROR: Variances do not match the data, please check')
        exit()
    logging.info('***Read the data file***')
    logging.info('Data file: ' + data_file)
    data_df = pd.read_csv(data_file, sep=r'\s+', header=0)
    logging.info('NA method: ' + na_method)
    if na_method == 'omit':
        data_df = data_df.dropna()
    elif na_method == 'include':
        data_df = data_df.fillna(method='ffill')
        data_df = data_df.fillna(method='bfill')
    else:
        logging.info('na_method does not exist: ' + na_method)
        exit()
    col_names = data_df.columns
    logging.info('The column names of data file: ' + ' '.join(list(col_names)))
    logging.info('Note: Variates beginning with a capital letter are converted into factors.')
    class_vec = []
    for val in col_names:
        if not val[0].isalpha():
            logging.info("The first character of column names must be alphabetic!")
            exit()
        if val[0] == val.capitalize()[0]:
            class_vec.append(val)
            data_df[val] = data_df[val].astype('str')
        else:
            try:
                data_df[val] = data_df[val].astype('float')
            except Exception as e:
                logging.info(e)
                logging.info(val + " may contain string, please check!")
                exit()
    logging.info('Individual column: ' + id)
    if id not in col_names:
        logging.info(id + ' is not in the data file, please check!')
        exit()
    if id not in class_vec:
        logging.info('The initial letter of {} should be capital'.format(id))
        exit()
    id_order = []
    id_arr = list(data_df[id])
    id_order.append(id_arr[0])
    for i in range(1, len(id_arr)):
        if id_arr[i] != id_arr[i - 1]:
            id_order.append(id_arr[i])
    id_in_data = set(data_df[id])
    if len(id_in_data) - len(id_order) != 0:
        logging.info('The data is not sorted by individual ID!')
        exit()
    logging.info('Time points column: ' + tpoint)
    if tpoint not in col_names:
        logging.info(tpoint + ' is not in the data file, please check!')
        exit()
    if tpoint in class_vec:
        logging.info('The initial letter of {} should be lowercase'.format(tpoint))
        exit()
    logging.info('Trait column: ' + trait)
    if trait not in col_names:
        logging.info(trait + ' is not in the data file, please check!')
        exit()
    if trait in class_vec:
        logging.info('The initial letter of {} should be lowercase'.format(trait))
        exit()
    logging.info('Code factor variables of the data file: ' + ' '.join(list(class_vec)))
    code_val = {}
    code_dct = dct_2D()
    for val in class_vec:
        code_val[val] = 0
        temp = []
        for i in range(data_df.shape[0]):
            if data_df[val][i] not in code_dct[val]:
                code_val[val] += 1
                code_dct[val][data_df[val][i]] = str(code_val[val])
            temp.append(code_dct[val][data_df[val][i]])
        data_df[val] = np.array(temp)
    for val in class_vec:
        data_df[val] = data_df[val].astype('int')
    logging.info('***Build the design matrix for fixed effect***')
    logging.info('Time dependent fixed effect: ' + str(tfix))
    leg_fix = leg(data_df[tpoint], forder)
    if tfix is None:
        xmat_t = np.concatenate(leg_fix, axis=1)
        xmat_t = csr_matrix(xmat_t)
    else:
        if tfix not in class_vec:
            logging.info(tfix + ' is not the class variate')
            exit()
        row = np.array(range(data_df.shape[0]))
        col = np.array(data_df[tfix]) - 1
        val = np.array([1.0] * data_df.shape[0])
        tfix_mat = csr_matrix((val, (row, col)))
        xmat_t = []
        for i in range(len(leg_fix)):
            xmat_t.append(tfix_mat.multiply(leg_fix[i]))
        xmat_t = hstack(xmat_t)
        del row, col, val
        gc.collect()
    logging.info('Time independent fix effect: ' + str(fix))
    xmat_nt = None
    if fix is not None:
        try:
            fix_exp = ''
            vec = fix.split('+')
            for i in vec:
                val = i.strip()
                if val in class_vec:
                    fix_exp += 'C(' + val + ')'
                else:
                    fix_exp += val
            xmat_nt = dmatrix(fix_exp, data_df)
            logging.info('The expression for fixed effect: ' + fix_exp)
        except Exception as e:
            logging.info(str(e) + ': Check the fix effect expression.')
            exit()
        xmat_nt = csr_matrix(xmat_nt[:, 1:])
    xmat = hstack([xmat_t, xmat_nt])
    xmat = xmat.toarray()
    max_id = max(data_df[id]) + 1
    tmin = min(data_df[tpoint])
    tmax = max(data_df[tpoint])
    # legendre polynomials for time dependent fixed SNP effects, saved for each individual
    leg_lst = []
    for i in range(1, max_id):
        leg_lst.append(leg_mt(data_df[data_df[id] == i][tpoint], tmax, tmin, forder))
    tpoint_vec = sorted(set(data_df[tpoint]))
    leg_tpoint_mat = leg_mt(np.array(tpoint_vec), tmax, tmin, aorder)
    leg_tpoint_accum = np.sum(leg_tpoint_mat, axis=0)
    logging.info('***Read the kinship matrix***')
    logging.info('Kinship file: ' + kin_file)
    with open(kin_file) as fin:
        row = []
        col = []
        kin = []
        id_in_kin = {}
        for line in fin:
            arr = line.split()
            id_in_kin[arr[0]] = 1
            id_in_kin[arr[1]] = 1
            if arr[0] not in code_dct[id]:
                logging.info(arr[0] + ' is not in the kinship inversion file!')
                exit()
            if arr[1] not in code_dct[id]:
                logging.info(arr[1] + ' is not in the kinship inversion file!')
                exit()
            row.append(int(code_dct[id][arr[0]]))
            col.append(int(code_dct[id][arr[1]]))
            kin.append(float(arr[2]))
    id_not_in_kin = list(set(code_dct[id].keys()) - set(id_in_kin.keys()))
    if len(id_not_in_kin) != 0:
        logging.info('The ID: {} in the data file is not in the kinship file!'.format(' '.join(id_not_in_kin)))
        exit()
    kin = csr_matrix((np.array(kin), (np.array(row) - 1, np.array(col) - 1))).toarray()
    kin = np.add(kin, kin.T)
    kin[np.diag_indices_from(kin)] = 0.5 * np.diag(kin)
    del row, col
    gc.collect()
    logging.info('***Build the design matrix for random effect***')
    logging.info('Legendre order for additive effects: ' + str(aorder))
    leg_add = leg(data_df[tpoint], aorder)
    row = np.array(range(data_df.shape[0]))
    col = np.array(data_df[id]) - 1
    val = np.array([1.0] * data_df.shape[0])
    add_mat = csr_matrix((val, (row, col)), shape=(data_df.shape[0], kin.shape[0]))
    zmat_add = []
    for i in range(len(leg_add)):
        zmat_add.append(add_mat.multiply(leg_add[i]))
    logging.info('Legendre order for permanent environmental effect: ' + str(porder))
    leg_per = leg(data_df[tpoint], porder)
    per_mat = csr_matrix((val, (row, col)))
    zmat_per = []
    for i in range(len(leg_per)):
        zmat_per.append(per_mat.multiply(leg_per[i]))
    del row, col, val
    gc.collect()
    zmat = [zmat_add, zmat_per]
    y = data_df[trait].values.reshape(data_df.shape[0], 1)
    # kin_inv = [kin_inv, sparse.eye(max(data_df[id]), format="csr")]
    logging.info('***Prepare the merged Z matrix***')
    eff_ind = [[0, xmat.shape[1]]]  # the index for all effects [start, end]
    zmat_con_lst = []  # combined random matrix
    for i in range(len(zmat)):
        temp = [eff_ind[i][-1]]
        zmat_con_lst.append(hstack(zmat[i]))
        for j in range(len(zmat[i])):
            temp.append(temp[-1] + zmat[i][j].shape[1])
        eff_ind.append(temp)
    logging.info('***Calculate the phenotypic (co)variance and P = V(-1) - V(-1)X[X(T)V(-1)X](-1)X(T)V(-1)***')
    add_cov = var_com.loc[var_com.loc[:, 'vari'] == 1, :]
    row = np.array(add_cov['varij']) - 1
    col = np.array(add_cov['varik']) - 1
    val = add_cov['var_val']
    add_cov = csr_matrix((val, (row, col))).toarray()
    add_cov = add_cov + np.tril(add_cov, k=-1).T
    per_cov = var_com.loc[var_com.loc[:, 'vari'] == 2, :]
    row = np.array(per_cov['varij']) - 1
    col = np.array(per_cov['varik']) - 1
    val = per_cov['var_val']
    per_cov = csr_matrix((val, (row, col))).toarray()
    per_cov = per_cov + np.tril(per_cov, k=-1).T
    res_var = np.array(var_com['var_val'])[-1]
    vmat = zmat_con_lst[0].dot((zmat_con_lst[0].dot(np.kron(add_cov, kin))).T)
    one_id = sparse.eye(zmat_con_lst[1].shape[1] // per_cov.shape[0])
    vmat = vmat + zmat_con_lst[1].dot((zmat_con_lst[1].dot(sparse.kron(per_cov, one_id))).T)
    vmat_diag = np.diag(vmat) + res_var
    np.fill_diagonal(vmat, vmat_diag)
    vmat = linalg.inv(vmat)
    fam_df = pd.read_csv(bed_file + '.fam', sep=r'\s+', header=None)
    id_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    id_order_index = []
    for i in id_order:
        id_order_index.append(id_geno.index(i))
    snp_on_disk = Bed(bed_file, count_A1=False)
    condition_snp_index = snp_on_disk.sid_to_index([condition_snp])[0]
    condition_snp_val = snp_on_disk[:, condition_snp_index].read().val
    condition_snp_val = condition_snp_val[id_order_index, 0]
    snp_condition = list(map(lambda x, y: x * y, leg_lst, list(condition_snp_val)))
    snp_condition = np.concatenate(snp_condition, axis=0)
    xmat = np.concatenate((xmat, snp_condition), axis=1)
    vxmat = np.dot(vmat, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = linalg.inv(xvxmat)
    pmat = vmat - reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    logging.info('***Read the snp data***')
    # snp_mat = read_plink(bed_file)
    snp_on_disk = Bed(bed_file, count_A1=False)
    num_id = snp_on_disk.iid_count
    num_snp = snp_on_disk.sid_count
    logging.info("There are {:d} individuals and {:d} SNPs.".format(num_id, num_snp))
    """
    fam_df = pd.read_csv(bed_file + '.fam', sep='\s+', header=None)
    id_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    id_order_index = []
    for i in id_order:
        id_order_index.append(id_geno.index(i))
    """
    if snp_lst is None:
        snp_lst = range(num_snp)
    snp_lst = list(snp_lst)
    if min(snp_lst) < 0 or max(snp_lst) >= num_snp:
        logging.info('The value in the snp list should be >= {} and < {}'.format(0, num_snp))
        exit()
    snp_mat = snp_on_disk[:, snp_lst].read().val
    if np.any(np.isnan(snp_mat)):
        logging.info('Missing genotypes are imputed with random genotypes.')
    snp_mat = snp_mat[id_order_index, :]
    # snp_mat = snp_mat[:, snp_lst]
    logging.info('#####################################################################')
    logging.info('###Start the random regression longitudinal GWAS for unbalance data###')
    logging.info('#####################################################################')
    qpmat = zmat_con_lst[0].T.dot(pmat)
    qpqmat = zmat_con_lst[0].T.dot(qpmat.T)
    qpymat = np.dot(qpmat, y)
    chi_df = add_cov.shape[1]
    eff_vec = []
    chi_vec = []
    p_vec = []
    p_min_vec = []
    p_accum_vec = []
    for i in tqdm(range(snp_mat.shape[1])):
        snpi = np.kron(add_cov, snp_mat[:, i:(i + 1)].T)
        snpi_eff = np.dot(snpi, qpymat)
        snpi_var = reduce(np.dot, [snpi, qpqmat, snpi.T])
        chi_val = np.sum(reduce(np.dot, [snpi_eff.T, linalg.inv(snpi_var), snpi_eff]))
        p_val = chi2.sf(chi_val, chi_df)
        eff_vec.append(snpi_eff[:, -1])
        chi_vec.append(chi_val)
        p_vec.append(p_val)
        p_tpoint_vec = []
        for k in range(leg_tpoint_mat.shape[0]):
            eff_tpoint = np.sum(np.dot(leg_tpoint_mat[k, :], snpi_eff))
            eff_var_tpoint = np.sum(np.dot(leg_tpoint_mat[k, :], np.dot(snpi_var, leg_tpoint_mat[k, :])))
            chi_tpoint = eff_tpoint * eff_tpoint / eff_var_tpoint
            p_tpoint = chi2.sf(chi_tpoint, 1)
            p_tpoint_vec.append(p_tpoint)
        p_min_vec.append(min(p_tpoint_vec))
        eff_accum = np.sum(np.dot(leg_tpoint_accum, snpi_eff))
        eff_var_accum = np.sum(np.dot(leg_tpoint_accum, np.dot(snpi_var, leg_tpoint_accum)))
        chi_accum = eff_accum * eff_accum / eff_var_accum
        p_accum = chi2.sf(chi_accum, 1)
        p_accum_vec.append(p_accum)
    logging.info('Finish association analysis')
    logging.info('***Output***')
    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    res_df = snp_info.iloc[snp_lst, [0, 1, 3, 4, 5]]
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'order'] = snp_lst
    res_df = res_df.iloc[:, [5, 0, 1, 2, 3, 4]]
    eff_vec = np.array(eff_vec)
    for i in range(eff_vec.shape[1]):
        col_ind = 'eff' + str(i)
        res_df.loc[:, col_ind] = eff_vec[:, i]
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec
    res_df.loc[:, 'p_min'] = p_min_vec
    res_df.loc[:, 'p_accum'] = p_accum_vec
    out_file = prefix_outfile + '.res'
    res_df.to_csv(out_file, sep=' ', index=False)
    return res_df
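# --- Hedged example (not from the original source): a sketch of how the
# function above might be invoked. All file names are placeholders; var_com
# would normally come from the unbalance variance-component step referenced
# in the docstring, with the vari/varij/varik/var_val columns used above.
import pandas as pd

var_com = pd.read_csv('var_com.csv')  # hypothetical path to the estimated variance components
res_df = unbalance_condition_longwas_trans(
    data_file='pheno_long.txt',               # whitespace-separated, sorted by individual ID
    id='ID', tpoint='tpoint', trait='trait',  # capitalized 'ID' marks a class variate
    bed_file='geno',                          # plink prefix: geno.bed/.bim/.fam
    kin_file='geno.giv',                      # id id value triples, as parsed above
    var_com=var_com,
    condition_snp='rs123',                    # must be present in geno.bim
    fix='Sex + age')
print(res_df[['snp_ID', 'p_val', 'p_min', 'p_accum']].head())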
class _Epistasis(object) : #implements IDistributable def __init__(self,test_snps,pheno,G0, G1=None, mixing=0.0, covar=None,sid_list_0=None,sid_list_1=None, log_delta=None, min_log_delta=-5, max_log_delta=10, output_file=None, cache_file=None): self._ran_once = False self.test_snps = test_snps self.pheno = pheno self.output_file_or_none = output_file self.cache_file = cache_file self.covar = covar self.sid_list_0 = sid_list_0 self.sid_list_1 = sid_list_1 self.G0=G0 self.G1_or_none=G1 self.mixing=mixing self.external_log_delta=log_delta self.min_log_delta = min_log_delta self.max_log_delta = max_log_delta self._str = "{0}({1},{2},G0={6},G1={7},mixing={8},covar={3},output_file={12},sid_list_0={4},sid_list_1{5},log_delta={9},min_log_delta={10},max_log_delta={11},cache_file={13})".format( self.__class__.__name__, self.test_snps,self.pheno,self.covar,self.sid_list_0,self.sid_list_1, self.G0, self.G1_or_none, self.mixing, self.external_log_delta, self.min_log_delta, self.max_log_delta, output_file, cache_file) self.block_size = 1000 def set_sid_sets(self): sid_set_0 = set(self.sid_list_0) self.intersect = sid_set_0.intersection(self.sid_list_1) self.just_sid_0 = sid_set_0.difference(self.intersect) self.just_sid_1 = self.intersect.symmetric_difference(self.sid_list_1) self._pair_count = len(self.just_sid_0)*len(self.intersect) + len(self.just_sid_0)*len(self.just_sid_1) + len(self.intersect)*len(self.just_sid_1) + len(self.intersect) * (len(self.intersect)-1)//2 self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none = pstutil.intersect_apply([self.test_snps, self.pheno, self.covar, self.G0, self.G1_or_none]) #should put G0 and G1 first def _run_once(self): if self._ran_once: return self._ran_once = None if isinstance(self.test_snps, str): self.test_snps = Bed(self.test_snps) if isinstance(self.G0, str): self.G0 = Bed(self.G0) if isinstance(self.pheno, str): self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True,missing='NaN') if self.covar is not None and isinstance(self.covar, str): self.covar = pstpheno.loadPhen(self.covar,missing='NaN') if self.G1_or_none is not None and isinstance(self.G1_or_none, str): self.G1_or_none = Bed(self.G1_or_none) if self.sid_list_0 is None: self.sid_list_0 = self.test_snps.sid if self.sid_list_1 is None: self.sid_list_1 = self.test_snps.sid self.set_sid_sets() #!!Should fix up to add only of no constant columns - will need to add a test case for this if self.covar is None: self.covar = np.ones((self.test_snps.iid_count, 1)) else: self.covar = np.hstack((self.covar['vals'],np.ones((self.test_snps.iid_count, 1)))) self.n_cov = self.covar.shape[1] if self.output_file_or_none is None: self.__tempdirectory = ".working" else: self.__tempdirectory = self.output_file_or_none + ".working" self._ran_once = True #start of IDistributable interface-------------------------------------- @property def work_count(self): self._run_once() block_count = self.div_ceil(self._pair_count, self.block_size) return block_count def work_sequence(self): self._run_once() return self.work_sequence_range(0,self.work_count) def work_sequence_range(self, start, end): self._run_once() lmm = self.lmm_from_cache_file() lmm.sety(self.pheno['vals']) for sid0_list, sid1_list in self.pair_block_sequence_range(start,end): yield lambda lmm=lmm,sid0_list=sid0_list,sid1_list=sid1_list : self.do_work(lmm,sid0_list,sid1_list) # the 'lmm=lmm,...' 
is need to get around a strangeness in Python def reduce(self, result_sequence): #doesn't need "run_once()" frame = pd.concat(result_sequence) frame.sort_values(by="PValue", inplace=True) frame.index = np.arange(len(frame)) if self.output_file_or_none is not None: frame.to_csv(self.output_file_or_none, sep="\t", index=False) return frame #!!Find a place to output info like this near the end of the run #logging.info("PhenotypeName\t{0}".format(pheno['header'])) #logging.info("SampleSize\t{0}".format(test_snps.iid_count)) #logging.info("SNPCount\t{0}".format(test_snps.sid_count)) #logging.info("Runtime\t{0}".format(time.time()-t0)) @property def tempdirectory(self): self._run_once() return self.__tempdirectory #optional override -- the str name of the instance is used by the cluster as the job name def __str__(self): #Doesn't need run_once return self._str def copyinputs(self, copier): self._run_once() if isinstance(self.test_snps, str): copier.input(self.test_snps + ".bed") copier.input(self.test_snps + ".bim") copier.input(self.test_snps + ".fam") else: copier.input(self.test_snps) copier.input(self.pheno) copier.input(self.covar) if isinstance(self.G0, str): copier.input(self.G0 + ".bed") copier.input(self.G0 + ".bim") copier.input(self.G0 + ".fam") else: copier.input(self.G0) copier.input(self.G1_or_none) copier.input(self.cache_file) def copyoutputs(self,copier): #Doesn't need run_once copier.output(self.output_file_or_none) #end of IDistributable interface--------------------------------------- @staticmethod def div_ceil(num, den): #!!move to utils? return -(-num//den) #The -/- trick makes it do ceiling instead of floor. "//" will do integer division even in the future and on floats. def pair_block_sequence_range(self,block_start,block_end): self._run_once() assert 0 <= block_start and block_start <= block_end and block_end <= self.work_count, "real assert" block_index = block_start start = block_index * self.pair_count // self.work_count next_start = (block_index+1) * self.pair_count // self.work_count size_goal = next_start - start end = block_end * self.pair_count // self.work_count sid0_list = [] sid1_list = [] for sid0, sid1 in self.pair_sequence_range(start,end): sid0_list.append(sid0) sid1_list.append(sid1) if len(sid0_list) == size_goal: yield sid0_list, sid1_list block_index += 1 if block_index == block_end: return sid0_list = [] sid1_list = [] start = next_start next_start = (block_index+1) * self.pair_count // self.work_count size_goal = next_start - start assert len(sid0_list) == 0, "real assert" #If start == end, then returns without yielding anything def pair_sequence_range(self,start,end): self._run_once() assert 0 <= start and start <= end and end <= self._pair_count, "real assert" i = start for sid0, sid1 in self.pair_sequence_with_start(start): yield sid0, sid1 i = i + 1 if i == end: break assert i == end, "Not enough items found. 
Didn't get to the end"

    def pair_sequence_with_start(self, start):
        self._run_once()
        skip_ref = [start]

        just_sid_0_list = list(self.just_sid_0)
        just_sid_1_list = list(self.just_sid_1)
        intersect_list = list(self.intersect)

        for sid0, sid1 in self.combo_distinct(just_sid_0_list, intersect_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(just_sid_0_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_distinct(intersect_list, just_sid_1_list, skip_ref):
            yield sid0, sid1
        for sid0, sid1 in self.combo_same(intersect_list, skip_ref):
            yield sid0, sid1
        assert skip_ref[0] == 0, "real assert"

    def combo_distinct(self, distinct__list0, distinct__list1, skip_ref):
        row_count = len(distinct__list0)
        col_count = len(distinct__list1)

        if skip_ref[0] >= row_count * col_count:
            skip_ref[0] = skip_ref[0] - row_count * col_count
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = skip_ref[0] // col_count
        skip_ref[0] = skip_ref[0] - row_start * col_count
        assert skip_ref[0] >= 0, "real assert"

        for row_index in range(row_start, row_count):
            sid0 = distinct__list0[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start, col_count):
                sid1 = distinct__list1[col_index]
                yield sid0, sid1

    def combo_same(self, sid_list, skip_ref):
        count = len(sid_list)
        full_size = count * (count + 1) // 2

        if skip_ref[0] >= full_size:
            skip_ref[0] = skip_ref[0] - full_size
            assert skip_ref[0] >= 0, "real assert"
            return

        row_start = int((-1 + 2 * count - np.sqrt(1 - 4 * count + 4 * count ** 2 - 8 * skip_ref[0])) / 2)
        skip_ref[0] = skip_ref[0] - (count * row_start - (row_start * (1 + row_start)) // 2)
        assert skip_ref[0] >= 0, "real assert"

        for row_index in range(row_start, count):
            sid0 = sid_list[row_index]
            if row_index == row_start:
                col_start = skip_ref[0]
                skip_ref[0] = 0
            else:
                col_start = 0
            for col_index in range(col_start + 1 + row_index, count):
                sid1 = sid_list[col_index]
                assert sid0 is not sid1, "real assert"
                yield sid0, sid1

    @property
    def pair_count(self):
        self._run_once()
        return self._pair_count

    def lmm_from_cache_file(self):
        logging.info("Loading precomputation from {0}".format(self.cache_file))
        lmm = LMM()
        with np.load(self.cache_file) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
        return lmm

    def fill_in_cache_file(self):
        self._run_once()
        logging.info("filling in the cache_file and log_delta, as needed")

        if self.G1_or_none is None:
            self.G1val_or_none = None
        else:
            self.G1val_or_none = self.G1_or_none.read().val

        # S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
        if self.cache_file is None:
            self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
            if os.path.exists(self.cache_file):
                # A cache file already sitting in the temp directory must be removed because it might be out-of-date
                os.remove(self.cache_file)

        lmm = None
        if not os.path.exists(self.cache_file):
            logging.info("Precomputing eigen")
            lmm = LMM()
            G0_standardized = self.G0.read().standardize()
            lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
            logging.info("Saving precomputation to {0}".format(self.cache_file))
            util.create_directory_if_necessary(self.cache_file)
            # using np.savez instead of pickle because it seems to be faster to read and write
            np.savez(self.cache_file, lmm.U, lmm.S)

        if self.external_log_delta is None:
            if lmm is None:
                lmm = self.lmm_from_cache_file()
            logging.info("searching for internal delta")
            lmm.setX(self.covar)
            lmm.sety(self.pheno['vals'])
            # log delta is used here. Might be better to use findH2, but if so we would need to
            # normalize G so that its K's diagonal would sum to iid_count
            result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count,
                                        min_log_delta=self.min_log_delta,
                                        max_log_delta=self.max_log_delta)
            # !!what about findA2H2? minH2=0.00001
            self.external_log_delta = result['log_delta']

        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))

    do_pair_count = 0
    do_pair_time = time.time()

    def do_work(self, lmm, sid0_list, sid1_list):
        dataframe = pd.DataFrame(
            index=np.arange(len(sid0_list)),
            columns=('SNP0', 'Chr0', 'GenDist0', 'ChrPos0', 'SNP1', 'Chr1', 'GenDist1', 'ChrPos1',
                     'PValue', 'NullLogLike', 'AltLogLike')
            )
        # !!Is this the only way to set types in a dataframe?
        for col in ('Chr0', 'GenDist0', 'ChrPos0', 'Chr1', 'GenDist1', 'ChrPos1',
                    'PValue', 'NullLogLike', 'AltLogLike'):
            dataframe[col] = dataframe[col].astype(float)

        # This is some of the code for a different approach that reads and dot-products 50% more,
        # but does less copying. It seems to be about the same speed.
        # sid0_index_list = self.test_snps.sid_to_index(sid0_list)
        # sid1_index_list = self.test_snps.sid_to_index(sid1_list)
        # sid_index_union_dict = {}
        # sid0_index_index_list = self.create_index_index(sid_index_union_dict, sid0_index_list)
        # sid1_index_index_list = self.create_index_index(sid_index_union_dict, sid1_index_list)
        # snps0_read = self.test_snps[:,sid0_index_list].read().standardize()
        # snps1_read = self.test_snps[:,sid1_index_list].read().standardize()

        sid_union = set(sid0_list).union(sid1_list)
        sid_union_index_list = sorted(self.test_snps.sid_to_index(sid_union))
        snps_read = self.test_snps[:, sid_union_index_list].read().standardize()

        sid0_index_list = snps_read.sid_to_index(sid0_list)
        sid1_index_list = snps_read.sid_to_index(sid1_list)

        # In the products matrix, each column i is the elementwise product of sid i in each list
        products = snps_read.val[:, sid0_index_list] * snps_read.val[:, sid1_index_list]
        X = np.hstack((self.covar, snps_read.val, products))
        UX = lmm.U.T.dot(X)
        k = lmm.S.shape[0]
        N = X.shape[0]
        if k < N:
            UUX = X - lmm.U.dot(UX)
        else:
            UUX = None

        for pair_index, sid0 in enumerate(sid0_list):
            sid1 = sid1_list[pair_index]
            sid0_index = sid0_index_list[pair_index]
            sid1_index = sid1_index_list[pair_index]

            index_list = np.array([pair_index])                                  # index to the product
            index_list = index_list + len(sid_union_index_list)                  # shift by the number of snps in the union
            index_list = np.hstack((np.array([sid0_index, sid1_index]), index_list))  # indexes of sid0 and sid1
            index_list = index_list + self.covar.shape[1]                        # shift by the number of values in the covar
            index_list = np.hstack((np.arange(self.covar.shape[1]), index_list))  # indexes of the covar
            index_list_less_product = index_list[:-1]                            # everything but the product

            # Null -- the two additive SNPs
            lmm.X = X[:, index_list_less_product]
            lmm.UX = UX[:, index_list_less_product]
            if k < N:
                lmm.UUX = UUX[:, index_list_less_product]
            else:
                lmm.UUX = None
            res_null = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_null = -res_null["nLL"]

            # Alt -- now with the product feature
            lmm.X = X[:, index_list]
            lmm.UX = UX[:, index_list]
            if k < N:
                lmm.UUX = UUX[:, index_list]
            else:
                lmm.UUX = None
            res_alt = lmm.nLLeval(delta=self.internal_delta, REML=False)
            ll_alt = -res_alt["nLL"]

            test_statistic = ll_alt - ll_null
            degrees_of_freedom = 1
            pvalue = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)
            logging.debug("<{0},{1}>, null={2}, alt={3}, pvalue={4}".format(sid0, sid1, ll_null, ll_alt, pvalue))
            dataframe.iloc[pair_index] = [
                sid0, snps_read.pos[sid0_index, 0], snps_read.pos[sid0_index, 1], snps_read.pos[sid0_index, 2],
                sid1, snps_read.pos[sid1_index, 0], snps_read.pos[sid1_index, 1], snps_read.pos[sid1_index, 2],
                pvalue, ll_null, ll_alt]

            self.do_pair_count += 1
            if self.do_pair_count % 100 == 0:
                start = self.do_pair_time
                self.do_pair_time = time.time()
                logging.info("do_pair_count={0}, time={1}".format(self.do_pair_count, self.do_pair_time - start))

        return dataframe
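
# A minimal sketch (toy numbers, not output from the code above) of the 1-df
# likelihood-ratio test that do_work applies to each SNP pair: the alternative
# model adds the SNP0*SNP1 product column, and twice the log-likelihood gain
# is referred to a chi-squared distribution with one degree of freedom.
from scipy import stats

ll_null = -1234.5                          # hypothetical additive-model log-likelihood
ll_alt = -1230.1                           # hypothetical log-likelihood with the product feature
test_statistic = 2.0 * (ll_alt - ll_null)  # ~ chi2(1) under the null
pvalue = stats.chi2.sf(test_statistic, 1)  # upper tail; ~0.003 for these numbers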
def __init__(self, args):
    if args.window_type not in ['KBP', 'SNP']:
        raise ValueError('Window type not supported')

    # Open files
    bed_1 = Bed(args.bfile1, count_A1=False)
    bed_2 = Bed(args.bfile2, count_A1=False)

    # Get indel locations, if any
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    bim_2 = pd.read_table(bed_2.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    is_indel_1 = np.array([(len(str(a1)) > 1) | (len(str(a2)) > 1)
                           for a1, a2 in bim_1[['a1', 'a2']].values])
    is_indel_2 = np.array([(len(str(a1)) > 1) | (len(str(a2)) > 1)
                           for a1, a2 in bim_2[['a1', 'a2']].values])

    # Make sure two SNPs don't have the same position
    is_duplicated_bp_1 = bim_1.pos_bp.duplicated()
    is_duplicated_bp_2 = bim_2.pos_bp.duplicated()
    # Make sure two SNPs don't have the same ID
    is_duplicated_id_1 = bim_1.id.duplicated()
    is_duplicated_id_2 = bim_2.id.duplicated()

    # Get allele frequencies
    af1 = self.get_allele_frequency(bed_1, args)
    af2 = self.get_allele_frequency(bed_2, args)
    print(len(af1), "Variants in file 1")
    print(len(af2), "Variants in file 2")

    # Get good SNPs
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf) & (~is_indel_1) & (~is_duplicated_bp_1) & (~is_duplicated_id_1)
    snps_2 = (af2 > args.maf) & (af2 < 1 - args.maf) & (~is_indel_2) & (~is_duplicated_bp_2) & (~is_duplicated_id_2)
    print(np.sum(snps_1), "SNPs in file 1 after MAF and indel filter")
    print(np.sum(snps_2), "SNPs in file 2 after MAF and indel filter")

    if (args.from_bp is not None) and (args.to_bp is not None):
        k1 = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        k2 = (bed_2.pos[:, 2] > args.from_bp) & (bed_2.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k1
        snps_2 = snps_2 & k2
    snps_to_use = np.intersect1d(bed_1.sid[snps_1.values], bed_2.sid[snps_2.values])
    print(len(snps_to_use), "SNPs common in both populations")

    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        print(len(keep), "SNPs to extract")
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")

    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
    if not args.no_align:
        alignment, bed_1_index, bed_2_index = \
            self.align_alleles(bed_1, bed_1_index, af1, bed_2, bed_2_index, af2)
    else:
        alignment = np.ones(len(bed_1_index))

    pos = bed_1.pos[bed_1_index]
    af1 = af1[bed_1_index]
    af2 = af2[bed_2_index]
    if args.afile1 is not None:
        a1 = pd.read_table(args.afile1, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a1 = None
    if args.afile2 is not None:
        a2 = pd.read_table(args.afile2, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a2 = None

    self.af1 = af1
    self.af2 = af2
    self.M = len(bed_1_index)
    self.N = (bed_1.iid_count, bed_2.iid_count)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].iloc[bed_1_index]
    self.A2 = bim_1['a2'].iloc[bed_1_index]
    self.windows = self.get_windows(pos, args)
    self.scores1 = self.compute(bed_1, bed_1_index, af1, a1, args)
    self.scores2 = self.compute(bed_2, bed_2_index, af2, a2, args)
    self.scoresX = self.compute2(bed_1, bed_1_index, bed_2, bed_2_index, alignment, a1, a2, args)
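
# get_allele_frequency is called above but not shown in this excerpt. A hedged
# sketch of the usual computation (the real helper may chunk the read or treat
# missing data differently): with 0/1/2 allele dosages, and missing genotypes
# read back as NaN, the per-SNP allele frequency is the column mean over two.
import numpy as np
from pysnptools.snpreader import Bed

def get_allele_frequency_sketch(bed):
    # e.g. af = get_allele_frequency_sketch(Bed('mydata', count_A1=False))
    X = bed.read().val                  # iid x sid dosage matrix, missing -> NaN
    return np.nanmean(X, axis=0) / 2.0  # allele frequency per SNP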
class fit(object):
    def __init__(self, args):
        self.bed = Bed(args.bfile)
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile, header=None)
            self.cov = sm.add_constant(ju._reorder(cov, self.bed.iid))
            self.ncov = self.cov.shape[1]  # covariates + constant
        else:
            self.cov = np.ones((self.N, 1))
            self.ncov = 1  # constant only
        if args.phenofile is not None:
            Y = pd.read_table(args.phenofile, header=None, na_values='-9')
        else:
            try:
                Y = pd.read_table(args.bfile + '.pheno', header=None, na_values='-9')
            except IOError:
                print("Phenotype file not found.")
                sys.exit(1)
        self.Y = ju._reorder(Y, self.bed.iid)

        af = ju.get_allele_frequency(self.bed, args)
        snps = (af > args.maf) & (af < 1 - args.maf)
        if (args.from_bp is not None) and (args.to_bp is not None):
            k = (self.bed.pos[:, 2] > args.from_bp) & (self.bed.pos[:, 2] < args.to_bp)
            snps = snps & k
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            keep = np.array([l.strip() for l in open(args.extract, 'r')])
            snps_to_use = np.intersect1d(snps_to_use, keep)
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
        pos = self.bed.pos[self.bed_index]
        bim = pd.read_table(self.bed.filename + '.bim', header=None,
                            names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
        self.af = af[self.bed_index]
        self.M = len(self.bed_index)
        self.windows = ju.get_windows(pos, self.M, args.window_size, args.window_type)
        self.pos = pos[:, 2]
        self.chr = pos[:, 0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.logistic = False
        self.chimin = stats.chi2.ppf(1 - args.minp, 2)

        # Fit the null model: logistic if the phenotype is 0/1, else linear
        if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1):
            self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0)
            self.logistic = True
        else:
            self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0)
        if self.ncov > 1:
            self.cov = sm.add_constant(self.null.fittedvalues)
        self.marg_res, self.joint_res = self.compute(args)

    def compute(self, args):
        t = time()
        marg_res = []
        joint_res = []
        Z = []
        windex = 0
        li, ri = self.windows[windex]
        nstr = np.max((args.SNPs_to_read, ri - li))
        offset = li
        G = self.bed[:, self.bed_index[li:(li + nstr)]].read().val
        G = ju._impute_missing(G)  # replace missing with the mean
        self.compute_marg(marg_res, Z, G, li, args)
        A = ju._norm_data(G)
        while ri < offset + nstr:
            st = li - offset
            fi = ri - offset
            # All correlations of SNP j with the SNPs in its window
            R = np.dot(np.atleast_2d(A[:, st] / self.N), A[:, (st + 1):fi]).flatten()
            Zl = Z[li]
            Zr = np.array(Z[(li + 1):ri])
            # Use the marginal Z-scores and R to compute the expected joint chi2s
            ChiP = (1 / (1 - R ** 2)) * (Zl ** 2 + Zr ** 2 - 2 * R * Zl * Zr)
            ChiP[R ** 2 < args.r2min] = -1
            self.compute_joint(joint_res, G, ChiP, offset, li, ri, args)
            windex += 1
            li, ri = self.windows[windex]
        for i in xrange(offset + nstr, self.M, nstr):
            sys.stdout.flush()
            sys.stdout.write("SNP: %d, %f\r" % (i, time() - t))
            Gn = self.bed[:, self.bed_index[i:(i + nstr)]].read().val
            Gn = ju._impute_missing(Gn)
            An = ju._norm_data(Gn)
            self.compute_marg(marg_res, Z, Gn, i, args)
            G = np.hstack((G, Gn))
            A = np.hstack((A, An))
            if G.shape[1] > args.SNPs_to_store:
                G = G[:, nstr:]
                A = A[:, nstr:]
                offset += nstr
            while ri < i + nstr:
                st = li - offset
                fi = ri - offset
                # All correlations of SNP j with the SNPs in its window
                R = np.dot(np.atleast_2d(A[:, st] / self.N), A[:, (st + 1):fi]).flatten()
                Zl = Z[li]
                Zr = np.array(Z[(li + 1):ri])
                ChiP = (1 / (1 - R ** 2)) * (Zl ** 2 + Zr ** 2 - 2 * R * Zl * Zr)
                ChiP[R ** 2 < args.r2min] = -1
                self.compute_joint(joint_res, G, ChiP, offset, li, ri, args)
                try:
                    windex += 1
                    li, ri = self.windows[windex]
                except IndexError:
                    break
        marg_res = pd.DataFrame(marg_res)
        joint_res = pd.DataFrame(joint_res)
        return marg_res, joint_res

    def compute_joint(self, joint_res, G, ChiP, offset, li, ri, args):
        st = li - offset
        fi = ri - offset
        snp1 = G[:, st]
        for i, snp2 in enumerate(G[:, (st + 1):fi].T):
            if ChiP[i] > self.chimin:
                X = np.hstack((self.cov, snp1.reshape((len(snp1), 1)),
                               snp2.reshape((len(snp2), 1))))
                if self.logistic:
                    joint = sm.Logit(self.Y, X).fit(disp=0)
                else:
                    joint = sm.OLS(self.Y, X).fit(disp=0)
                joint_b1 = joint.params[-2]
                joint_b2 = joint.params[-1]
                joint_or1 = np.exp(joint_b1)
                joint_or2 = np.exp(joint_b2)
                joint_se1 = joint.bse[-2]
                joint_se2 = joint.bse[-1]
                joint_p1 = joint.pvalues[-2]
                joint_p2 = joint.pvalues[-1]
                joint_t1 = joint.tvalues[-2]
                joint_t2 = joint.tvalues[-1]
                joint_Chi2 = 2 * (joint.llf - self.null.llf)
                pv = stats.chi2.sf(joint_Chi2, 2)
                # snp2 is the (i+1)-th SNP after snp1, i.e. global index li+1+i
                joint_res.append([self.chr[li], self.id[li], self.id[li + 1 + i],
                                  self.pos[li], self.pos[li + 1 + i], joint_b1, joint_se1,
                                  joint_or1, joint_t1, joint_p1, joint_b2, joint_se2,
                                  joint_or2, joint_t2, joint_p2, joint_Chi2, pv])
            else:
                continue

    def compute_marg(self, marg_res, Z, G, offset, args):
        for i, snp in enumerate(G.T):
            X = np.hstack((self.cov, snp.reshape((len(snp), 1))))
            if self.logistic:
                marg = sm.Logit(self.Y, X).fit(disp=0)
            else:
                marg = sm.OLS(self.Y, X).fit(disp=0)
            marg_b = marg.params[-1]
            marg_or = np.exp(marg_b)
            marg_se = marg.bse[-1]
            marg_p = marg.pvalues[-1]
            marg_t = marg.tvalues[-1]
            marg_Chi2 = 2 * (marg.llf - self.null.llf)
            pv = stats.chi2.sf(marg_Chi2, 1)
            marg_res.append([self.chr[offset + i], self.id[offset + i],
                             self.pos[offset + i], marg_b, marg_se, marg_or, marg_t,
                             marg_p, marg_Chi2, pv])
            Z.append(marg_b / marg_se)
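
# The ChiP expression used in compute() is the closed form of the 2-df quadratic
# form for two marginal Z-scores whose correlation is R: with Sigma = [[1, R],
# [R, 1]], z' Sigma^{-1} z = (Zl^2 + Zr^2 - 2*R*Zl*Zr) / (1 - R^2). A quick
# self-contained check with toy values:
import numpy as np

Zl, Zr, R = 2.5, -1.0, 0.6
Sigma = np.array([[1.0, R], [R, 1.0]])
z = np.array([Zl, Zr])
quad_form = z.dot(np.linalg.inv(Sigma)).dot(z)           # matrix version
closed = (Zl ** 2 + Zr ** 2 - 2 * R * Zl * Zr) / (1 - R ** 2)  # closed form
assert np.isclose(quad_form, closed)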
def __init__(self, args):
    if args.window_type not in ['BP', 'SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile1)
    bed_2 = Bed(args.bfile2)
    af1 = self.get_allele_frequency(bed_1, args)
    af2 = self.get_allele_frequency(bed_2, args)
    print(len(af1), "SNPs in file 1")
    print(len(af2), "SNPs in file 2")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    snps_2 = (af2 > args.maf) & (af2 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    print(np.sum(snps_2), "SNPs in file 2 after MAF filter")
    if (args.from_bp is not None) and (args.to_bp is not None):
        k1 = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        k2 = (bed_2.pos[:, 2] > args.from_bp) & (bed_2.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k1
        snps_2 = snps_2 & k2
    snps_to_use = np.intersect1d(bed_1.sid[snps_1], bed_2.sid[snps_2])
    print(len(snps_to_use), "SNPs common in both populations")
    if args.extract is not None:
        keep = np.array([l.strip() for l in open(args.extract, 'r')])
        print(len(keep), "SNPs to extract")
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    bed_2_index = np.sort(bed_2.sid_to_index(snps_to_use))
    if not args.no_align:
        alignment, bed_1_index, bed_2_index = \
            self.align_alleles(bed_1, bed_1_index, af1, bed_2, bed_2_index, af2)
    else:
        alignment = np.ones(len(bed_1_index))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(bed_1.filename + '.bim', header=None,
                          names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af1 = af1[bed_1_index]
    af2 = af2[bed_2_index]
    if args.afile1 is not None:
        a1 = pd.read_table(args.afile1, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a1 = None
    if args.afile2 is not None:
        a2 = pd.read_table(args.afile2, header=None, sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a2 = None
    self.af1 = af1
    self.af2 = af2
    self.M = len(bed_1_index)
    self.N = (bed_1.iid_count, bed_2.iid_count)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.windows = self.get_windows(pos, args)
    self.scores1 = self.compute(bed_1, bed_1_index, af1, a1, args)
    self.scores2 = self.compute(bed_2, bed_2_index, af2, a2, args)
    self.scoresX = self.compute2(bed_1, bed_1_index, bed_2, bed_2_index, alignment, a1, a2, args)
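
# get_windows is used above but not shown in this excerpt. A hedged,
# hypothetical sketch of the kind of (left, right) half-open windows it might
# build: a fixed number of SNPs for 'SNP' windows, or all SNPs within
# window_size base pairs downstream for 'BP' windows. The real helper may
# differ (e.g. symmetric windows, or different boundary handling).
def get_windows_sketch(pos_bp, M, window_size, window_type):
    windows = []
    for li in range(M):
        if window_type == 'SNP':
            ri = min(li + window_size, M)
        else:  # 'BP'
            ri = li + 1
            while ri < M and pos_bp[ri] - pos_bp[li] <= window_size:
                ri += 1
        windows.append((li, ri))
    return windows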
class sample(object):
    def __init__(self, args):
        self.bed = Bed(args.bfile)
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile, header=None)
            self.cov = sm.add_constant(ju._reorder(cov, self.bed.iid))
            self.ncov = self.cov.shape[1]  # covariates + constant
        else:
            self.cov = np.ones((self.N, 1))
            self.ncov = 1  # constant only

        af = ju.get_allele_frequency(self.bed, args)
        snps = (af > args.maf) & (af < 1 - args.maf)
        if (args.from_bp is not None) and (args.to_bp is not None):
            k = (self.bed.pos[:, 2] > args.from_bp) & (self.bed.pos[:, 2] < args.to_bp)
            snps = snps & k
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            keep = np.array([l.strip() for l in open(args.extract, 'r')])
            snps_to_use = np.intersect1d(snps_to_use, keep)
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
        pos = self.bed.pos[self.bed_index]
        bim = pd.read_table(self.bed.filename + '.bim', header=None,
                            names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
        self.af = af[self.bed_index]
        self.M = len(self.bed_index)
        self.windows = ju.get_windows(pos, self.M, args.window_size, args.window_type)
        self.sample_windows = ju.get_windows(pos, self.M, args.sample_window_size,
                                             args.sample_window_type)
        self.pos = pos[:, 2]
        self.chr = pos[:, 0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.numSamples = args.numSamples
        self.JMaxStats, self.ZMaxStats = self.sample(args)
        self.JMinP = stats.chi2.sf(self.JMaxStats, 2)
        self.ZMinP = stats.chi2.sf(self.ZMaxStats ** 2, 1)
        self.minP = np.minimum(self.JMinP, self.ZMinP)

    def sample(self, args):
        t = time()
        nz = 0
        ZMaxStats = np.zeros((self.numSamples, 1))
        JMaxStats = np.zeros((self.numSamples, 1))
        windex = 0
        sli, sri = self.sample_windows[windex]
        tli, tri = self.windows[windex]
        nstr = np.max((args.SNPs_to_read, sri - sli))
        offset = sli
        G = self.bed[:, self.bed_index[sli:(sli + nstr)]].read().val
        G = ju._impute_missing(G)
        A = ju._norm_data(G)

        # Sample Z-scores and do joint tests of the first window
        R = np.dot(A[:, sli:sri].T / self.N, A[:, sli:sri])
        Z = np.random.multivariate_normal(np.zeros((R.shape[0])), R, args.numSamples)
        nz += R.shape[0]
        zli, zri = sli, sri  # position of Z relative to the full genotype
        gli, gri = zli, zri  # position of Z relative to the genotype in memory
        Rp = R[(tli + 1):tri, 0]
        to_test = Rp ** 2 > args.r2min
        Rp = Rp[to_test]
        Zl = np.atleast_2d(Z[:, 0]).T
        Zr = np.array(Z[:, 1:(tri - tli)])[:, to_test]
        ChiP = (1 / (1 - Rp ** 2)) * (Zl ** 2 + Zr ** 2 - 2 * Rp * Zl * Zr)
        ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats, abs(Z))).max(1)).T
        # ZMaxStats = np.maximum(ZMaxStats,abs(Z.max(1)))
        # JMaxStats = np.maximum(JMaxStats,ChiP.max(1))
        JMaxStats = np.atleast_2d(np.hstack((JMaxStats, ChiP)).max(1)).T

        # Slide through the genotype in memory
        while True:
            windex += 1
            sli, sri = self.sample_windows[windex]
            tli, tri = self.windows[windex]
            if sri >= offset + nstr:
                break
            tst, tfi, sst, sfi = np.array([tli, tri, sli, sri]) - offset
            # print sli, sri, zli, zri, gli, gri, Z.shape[1]
            if zli < sli:
                # drop zli..sli and update indices
                Z = Z[:, (sli - zli):]
                zli, gli = sli, sst
            if zri < sri:
                # marginally sample everything from zri..sri
                S = A[:, gli:gri]   # G that overlaps Z
                Sn = A[:, gri:sri]  # G about to have Z-scores sampled
                r12 = S.T.dot(Sn) / self.N
                r11 = Sn.T.dot(Sn) / self.N
                Zn = self.sample_func(Z, S, Sn, r11, r12, args)
                Z = np.hstack((Z, Zn))
                nz += (sri - gri)
                zri, gri = sri, sfi
                ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats, abs(Zn))).max(1)).T
                # ZMaxStats = np.maximum(ZMaxStats,abs(Zn.flatten()))
            # All correlations of SNP tli with the SNPs in its window.
            # Surely these are already computed and some cleverness could be used
            # to re-use them, but it's a fast calculation anyway.
            if sri - sli > 1:
                R = np.dot(np.atleast_2d(A[:, tst] / self.N), A[:, (tst + 1):tfi]).flatten()
                to_test = R ** 2 > args.r2min
                R = R[to_test]
                Zl = np.atleast_2d(Z[:, 0]).T
                Zr = np.array(Z[:, 1:(sri - sli)])[:, to_test]
                ChiP = (1 / (1 - R ** 2)) * (Zl ** 2 + Zr ** 2 - 2 * R * Zl * Zr)
                JMaxStats = np.atleast_2d(np.hstack((JMaxStats, ChiP)).max(1)).T
                # JMaxStats = np.maximum(JMaxStats,ChiP.max(1))

        for i in xrange(offset + nstr, self.M, nstr):
            sys.stdout.flush()
            sys.stdout.write("SNP: %d, %f\r" % (i, time() - t))
            Gn = self.bed[:, self.bed_index[i:(i + nstr)]].read().val
            Gn = ju._impute_missing(Gn)
            An = ju._norm_data(Gn)
            G = np.hstack((G, Gn))
            A = np.hstack((A, An))
            if G.shape[1] > args.SNPs_to_store:
                G = G[:, nstr:]
                A = A[:, nstr:]
                offset += nstr
                gli -= nstr
                gri -= nstr
            while sri < i + nstr:
                tst, tfi, sst, sfi = np.array([tli, tri, sli, sri]) - offset
                if zli < sli:
                    # drop zli..sli and update indices
                    Z = Z[:, (sli - zli):]
                    zli, gli = sli, sst
                if zri < sri:
                    # marginally sample everything from zri..sri
                    S = A[:, gli:gri]   # G that overlaps Z
                    Sn = A[:, gri:sfi]  # G about to have Z-scores sampled
                    r12 = S.T.dot(Sn) / self.N
                    r11 = Sn.T.dot(Sn) / self.N
                    Zn = self.sample_func(Z, S, Sn, r11, r12, args)
                    Z = np.hstack((Z, Zn))
                    nz += (sfi - gri)
                    zri, gri = sri, sfi
                    ZMaxStats = np.atleast_2d(np.hstack((ZMaxStats, abs(Zn))).max(1)).T
                    # ZMaxStats = np.maximum(ZMaxStats,abs(Zn.flatten()))
                if sri - sli > 1:
                    R = np.dot(np.atleast_2d(A[:, tst] / self.N), A[:, (tst + 1):tfi]).flatten()
                    to_test = R ** 2 > args.r2min
                    R = R[to_test]
                    Zl = np.atleast_2d(Z[:, 0]).T
                    Zr = np.array(Z[:, 1:(sri - sli)])[:, to_test]
                    ChiP = (1 / (1 - R ** 2)) * (Zl ** 2 + Zr ** 2 - 2 * R * Zl * Zr)
                    # JMaxStats = np.maximum(JMaxStats,ChiP.max(1))
                    JMaxStats = np.atleast_2d(np.hstack((JMaxStats, ChiP)).max(1)).T
                try:
                    windex += 1
                    sli, sri = self.sample_windows[windex]
                    tli, tri = self.windows[windex]
                except IndexError:
                    break
        # print "HERE:", nz
        return JMaxStats.flatten(), ZMaxStats.flatten()

    def sample_func(self, Z, S, Sn, r11, r12, args):
        S22IS12 = sp.linalg.lstsq(S, Sn, cond=1e-8)[0]
        muC = Z.dot(S22IS12)
        SigC = r11 - r12.T.dot(S22IS12)
        Zn = np.random.multivariate_normal(np.zeros((SigC.shape[0])),
                                           SigC, size=args.numSamples)
        Zn += muC
        return Zn
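
# sample_func draws new Z-scores from the conditional multivariate normal: if
# (Z, Zn) are jointly N(0, [[S11, S12], [S21, S22]]), then
# Zn | Z = z ~ N(S21 S11^{-1} z, S22 - S21 S11^{-1} S12).
# sample_func obtains S11^{-1} S12 via a least-squares solve on the genotype
# columns rather than forming S11 explicitly; below is the textbook form with
# toy numbers for comparison.
import numpy as np

S11 = np.array([[1.0, 0.3], [0.3, 1.0]])  # correlations among already-sampled scores
S12 = np.array([[0.5], [0.2]])            # correlations with the new SNP
S22 = np.array([[1.0]])
z = np.array([1.2, -0.4])                 # previously sampled Z-scores

B = np.linalg.solve(S11, S12)             # S11^{-1} S12
mu = z.dot(B)                             # conditional mean, shape (1,)
Sig = S22 - S12.T.dot(B)                  # conditional covariance, shape (1, 1)
zn = np.random.multivariate_normal(mu, Sig)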