def preprocess(self, s):
    """Submit acor.py ``conv`` jobs for every convolved-annotation file that is missing.

    One job is submitted through ``bsub.submit`` per (annotation, chromosome)
    pair whose ``conv_filename`` does not exist yet. Annotations come from the
    comma-separated ``self.params.annot_chr``; chromosomes from ``s.chromosomes``.
    """
    print('Acor is preprocessing', s.name, 'with refpanel=', self.params.refpanel)
    print(self.params)

    stems = self.params.annot_chr.split(',')
    annotations = [pa.Annotation(paths.annotations + stem) for stem in stems]

    for annotation in annotations:
        print('preprocessing', annotation.filestem())
        for chrom in s.chromosomes:
            target = annotation.conv_filename(chrom, full=self.params.fullconv)
            if os.path.exists(target):
                # Already computed for this chromosome; nothing to submit.
                continue

            conv_command = [
                'python', '-u', paths.code + 'acor/acor.py',
                '--annot-chr', annotation.stem_chr,
                '--bfile-chr', self.refpanel.bfile_chr,
            ]
            if self.params.fullconv:
                conv_command.append('-fullconv')
            conv_command.extend(['conv', '--chroms', str(chrom)])
            print(' '.join(conv_command))

            # The job's output file sits next to the annotation; 'full' is
            # inserted into the suffix when full convolution is requested.
            suffix = 'full' if self.params.fullconv else ''
            outfilepath = annotation.filestem(chrom) + '.' + suffix + 'convbsub_out'

            job_label = self.params.annot_chr.replace('/', '_') + ',conv,chr=' + str(chrom)
            bsub.submit(conv_command, outfilepath, jobname=job_label)
def required_files(self, s):
    """Return the conv filenames needed for ``s``, one per (chromosome, annotation).

    The list is ordered chromosome-major: for each chromosome in
    ``s.chromosomes``, every annotation named in ``self.params.annot_chr``.
    """
    stems = self.params.annot_chr.split(',')
    annotations = [pa.Annotation(paths.annotations + stem) for stem in stems]

    needed = []
    for chrom in s.chromosomes:
        for annotation in annotations:
            needed.append(
                annotation.conv_filename(chrom, full=self.params.fullconv))
    return needed
def run(self, s, beta_num):
    """Run ``acor.py cor`` on one set of summary statistics and return one result row.

    The command is assembled from ``self.params`` and ``self.refpanel``, run
    with ``subprocess.call``, and the ``.results`` file it writes is read back.
    The row matching ``self.params.coeff`` is selected by position among the
    concatenated annotation names.

    Args:
        s: object providing ``name``, ``chromosomes`` and ``sumstats_filename``.
        beta_num: index passed to ``s.sumstats_filename`` and ``self.result_filename``.

    Returns:
        A one-row ``pandas.DataFrame`` with columns ``ESTIMATE``, ``STDERR``, ``P``.

    Raises:
        ValueError: if ``self.params.print`` is not 'all', 'num' or 'denom'.
    """
    print('Acor is running', s.name, 'on beta', beta_num,
          'with refpanel=', self.params.refpanel)
    print(self.params)

    # Validate before launching the external job: previously an unknown value
    # only surfaced afterwards, as an UnboundLocalError on `estimate`.
    estimate_columns = {'all': 'MU_EST', 'num': 'TOP', 'denom': 'BOTTOM'}
    print_mode = self.params.print
    if print_mode not in estimate_columns:
        raise ValueError(
            'unknown print option {!r}; expected one of {}'.format(
                print_mode, sorted(estimate_columns)))

    annots = [
        pa.Annotation(paths.annotations + aname)
        for aname in self.params.annot_chr.split(',')
    ]

    cmd = [
        'python', '-u', paths.code + 'acor/acor.py',
        '--annot-chr', ' '.join([a.stem_chr for a in annots]),
        '--bfile-chr', self.refpanel.bfile_chr,
    ]
    if self.params.fullconv:
        cmd.append('-fullconv')
    cmd += [
        'cor',
        '--ldscores-chr', self.refpanel.bfile_chr,
        '--sumstats', s.sumstats_filename(beta_num),
        '--out', self.result_filename(s, beta_num),
        self.params.kind,
    ]
    if self.params.regvar:
        cmd.append('-reg-var')
    if not self.params.weights:
        cmd.append('-noweights')
    if self.params.biascorrect:
        cmd.append('-biascorrect')
    if self.params.maf_thresh > 0:
        cmd += ['--maf-thresh', str(self.params.maf_thresh)]
    cmd += ['--chroms'] + [str(c) for c in s.chromosomes]

    print(' '.join(cmd))
    subprocess.call(cmd)

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    acorresults = pd.read_csv(self.result_filename(s, beta_num) + '.results',
                              sep=r'\s+',
                              header=0)

    # Annotation names are read from the last chromosome in s.chromosomes.
    all_names = np.concatenate([a.names(s.chromosomes[-1]) for a in annots])
    rowindex = np.where(all_names == self.params.coeff)[0][0]

    estimate = acorresults[estimate_columns[print_mode]][rowindex]
    stderr = acorresults['MU_STDERR'][rowindex]
    pval = acorresults['MU_P'][rowindex]

    return pd.DataFrame(columns=['ESTIMATE', 'STDERR', 'P'],
                        data=[[estimate, stderr, pval]])
def run(self, s, beta_num):
    """Run ``ldsc.py --h2`` on one set of summary statistics and return one result row.

    The command is assembled from ``self.params`` and ``self.refpanel``, run
    with ``subprocess.call``, and the ``.results`` file it writes is read back.
    The row matching ``self.params.coeff`` is selected by position among the
    concatenated annotation names.

    Args:
        s: object providing ``name`` and ``sumstats_filename``.
        beta_num: index passed to ``s.sumstats_filename`` and ``self.result_filename``.

    Returns:
        A one-row ``pandas.DataFrame`` with columns ``ESTIMATE``, ``STDERR``, ``P``.
    """
    print('LDSC is running', s.name, 'on beta', beta_num,
          'with refpanel=', self.params.refpanel)
    print(self.params)

    annots = [
        pa.Annotation(paths.annotations + aname)
        for aname in self.params.annot_chr.split(',')
    ]

    extra_args = ['--no-intercept'] if self.params.constrain_intercept else []

    cmd = [
        'python', '-u', paths.foreign + 'ldsc/ldsc.py',
        '--h2', s.sumstats_filename(beta_num),
        '--ref-ld-chr', ','.join([a.stem_chr for a in annots]),
        '--w-ld-chr', self.refpanel.bfile_chr,
        '--overlap-annot',
        '--print-coefficients',
        '--chisq-max', '999999',
        '--frqfile-chr', self.refpanel.bfile_chr,
        '--out', self.result_filename(s, beta_num),
    ] + extra_args

    print(' '.join(cmd))
    subprocess.call(cmd)

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    ldscresults = pd.read_csv(self.result_filename(s, beta_num) + '.results',
                              sep=r'\s+',
                              header=0)

    # Annotation names are read from chromosome 22.
    all_names = np.concatenate([a.names(22) for a in annots])
    rowindex = np.where(all_names == self.params.coeff)[0][0]

    estimate = ldscresults['Prop._h2'][rowindex]
    stderr = ldscresults['Prop._h2_std_error'][rowindex]
    pval = ldscresults['Enrichment_p'][rowindex]

    return pd.DataFrame(columns=['ESTIMATE', 'STDERR', 'P'],
                        data=[[estimate, stderr, pval]])
def preprocess(self, s):
    """Submit ``ldsc.py --l2`` jobs for every LD-score file that is missing.

    One job is submitted through ``bsub.submit`` per (annotation, chromosome)
    pair, for chromosomes 1 through 22, whose ``ldscores_filename`` does not
    exist yet. Annotations come from the comma-separated
    ``self.params.annot_chr``.
    """
    print('LDSC is preprocessing', s.name, 'with refpanel=', self.params.refpanel)
    print(self.params)

    for stem in self.params.annot_chr.split(','):
        annotation = pa.Annotation(paths.annotations + stem)
        for chrom in range(1, 23):
            if os.path.exists(annotation.ldscores_filename(chrom)):
                # LD scores already present for this chromosome.
                continue

            ldscores_command = [
                'python', '-u', paths.foreign + 'ldsc/ldsc.py',
                '--l2',
                '--ld-wind-cm', str(self.params.ld_window),
                '--bfile', self.refpanel.bfile(chrom),
                '--annot', annotation.annot_filename(chrom),
                '--out', annotation.filestem(chrom),
            ]
            print(' '.join(ldscores_command))

            outfilepath = annotation.filestem(chrom) + '.ldscoresbsub_out'
            job_label = self.params.annot_chr.replace('/', '_') + ',ldcores,chr=' + str(chrom)
            bsub.submit(ldscores_command, outfilepath, jobname=job_label)
def write(folder, args, name, background_names, mux, muy, z, corr_thresh=0.8):
    """Save scatter plots of windows where the focal annotation tracks the GWAS signal.

    SNP metadata, summary statistics and annotation columns are re-read for all
    of ``args.chroms``. The background annotations are residualized out of both
    the focal annotation column (``name``) and the summary statistic. Fixed-size
    SNP windows containing at least one genome-wide significant SNP are then
    scanned. A window is plotted to a PDF under ``folder`` when its squared
    correlation between the two residuals is at least ``corr_thresh`` and the
    correlation has the same sign as ``z``.

    Returns early, writing nothing, when there are no genome-wide significant
    SNPs or no window fits inside the SNP table.

    Args:
        folder: directory prefix for the output PDFs.
        args: namespace providing ``bfile_reg_chr``, ``sannot_chr``,
            ``background_sannot_chr``, ``chroms`` and ``pss_chr``.
        name: column name of the focal annotation.
        background_names: column names of the background annotations.
        mux: coefficients applied to the background columns to predict the
            focal annotation.
        muy: coefficients applied to the background columns to predict ``ahat``.
        z: only the sign is used, to choose the direction of correlation kept.
        corr_thresh: minimum squared correlation for a window to be plotted.

    Raises:
        IndexError: if no annotation in ``args.sannot_chr`` has a column ``name``.
    """
    print('STORYTELLING for ', name, 'z=', z)
    refpanel = gd.Dataset(args.bfile_reg_chr)
    # Build each candidate annotation once, then keep the first that has `name`.
    candidates = [ga.Annotation(a) for a in args.sannot_chr]
    annot = [cand for cand in candidates
             if name in cand.names(22, RV=True)][0]
    backgroundannots = [ga.Annotation(a) for a in args.background_sannot_chr]
    print('focal annotation columns:', annot.names(22, True))
    print('background annotations:', background_names)

    # get refpanel snp metadata
    print('re-reading snps')
    snps = pd.concat([refpanel.bim_df(c) for c in args.chroms], axis=0)

    # read sumstats
    print('re-reading sumstats')
    ss = pd.concat([
        pd.read_csv(args.pss_chr + str(c) + '.pss.gz', sep='\t',
                    usecols=['N', 'Winv_ahat_I'])
        for c in args.chroms])
    snps['ahat'] = ss['Winv_ahat_I']
    snps['N'] = ss['N']
    del ss

    # read annotations
    print('re-reading background annotations')
    for a in backgroundannots:
        # keep only the columns whose name contains '.R'
        mynames = [n for n in a.names(22, RV=True) if '.R' in n]
        snps = pd.concat(
            [snps,
             pd.concat([a.RV_df(c)[mynames] for c in args.chroms], axis=0)],
            axis=1)
    print('reading focal annotation')
    snps = pd.concat(
        [snps,
         pd.concat([annot.RV_df(c)[name] for c in args.chroms], axis=0)],
        axis=1)

    print('residualizing background out of focal')
    A = snps[background_names]
    snps['chi2'] = snps.N * snps.ahat**2
    snps['Rv'] = snps[name]
    snps['ahat_resid'] = snps.ahat - A.values.dot(muy)
    snps['Rv_resid'] = snps.Rv - A.values.dot(mux)
    snps['typed'] = snps.ahat_resid.notnull()
    snps = snps[snps.typed].reset_index(drop=True)
    snps['significant'] = snps.chi2 > 29.716785
    print(snps.significant.sum(), 'genome-wide significant SNPs')

    significant_idx = np.where(snps.significant)[0]
    if len(significant_idx) == 0:
        # np.concatenate below raises on an empty list; nothing to plot anyway.
        return

    print('searching for good windows')
    # get endpoints of windows
    stride = 20
    windowsize_in_strides = 5
    windowsize = stride * windowsize_in_strides
    # find all starting points of windows containing GWAS-sig SNPs
    starts = np.concatenate([
        [(i // stride) * stride - k * stride
         for k in range(0, windowsize_in_strides)]
        for i in significant_idx])
    starts = np.array(sorted(set(starts)))
    # compute corresponding endpoints
    ends = starts + windowsize
    # drop any windows that extend past either end of the SNP table
    inside = (starts >= 0) & (ends < len(snps))
    starts = starts[inside]
    ends = ends[inside]
    print(len(starts), 'windows with GWAS hits found')
    if len(starts) == 0:
        # An empty 2-D array cannot be given four column names below.
        return

    # compute correlations
    rows = []
    for i, j in zip(starts, ends):
        window = snps.iloc[i:j]
        rows.append([
            i, j,
            np.max(window.chi2),
            np.corrcoef(window.Rv_resid, window.ahat_resid)[0, 1],
        ])
    numbers = pd.DataFrame(np.array(rows),
                           columns=['start', 'end', 'maxchi2', 'corr'])

    # keep only cases with strong correlations in the right direction
    numbers = numbers[numbers['corr']**2 >= corr_thresh]
    numbers = numbers[np.sign(numbers['corr']) == np.sign(z)]
    print(len(numbers),
          'windows with GWAS hits and squared correlation with Rv >=',
          corr_thresh, 'in the right direction')

    for i, j in zip(numbers.start, numbers.end):
        # start/end became floats when packed into the numeric array above
        i = int(i)
        j = int(j)
        print('saving', i, j)
        c = snps.iloc[i].CHR
        start = snps.iloc[i].BP
        end = snps.iloc[j].BP
        window = snps.iloc[i:j]

        plt.figure()
        plt.scatter(window.Rv_resid,
                    window.ahat_resid * np.sqrt(window.N))
        plt.title('chr{}:{}-{}'.format(c, start, end))
        plt.xlabel(r'residual $Rv$')
        plt.ylabel(r'residual $Z$')

        filename = '{}/chr{}:{}-{}.pdf'.format(folder, c, start, end)
        fs.makedir_for_file(filename)
        plt.savefig(filename)
        plt.close()
def required_files(self, s):
    """Return the ``.testprocess`` filename for each chromosome in ``s.chromosomes``.

    Filenames are built from the file stem of the annotation named by
    ``self.params.sannot_chr``.
    """
    annotation = pa.Annotation(paths.annotations + self.params.sannot_chr)
    needed = []
    for chrom in s.chromosomes:
        needed.append(annotation.filestem(chrom) + '.testprocess')
    return needed
def main(args):
    """Compute and write per-chromosome RV files and summary info for each annotation.

    For every annotation in ``args.sannot_chr`` and chromosome in
    ``args.chroms``:

    * load reference-panel SNP metadata and merge in the annotation columns;
    * flag the SNPs listed in ``args.print_snps``;
    * optionally rescale the annotation by a power of ``2*MAF*(1-MAF)``
      (skipped when ``args.alpha == -1``);
    * write basic per-annotation statistics to ``annot.info_filename(c)``;
    * fill the ``<name>.R`` columns LD block by LD block (blocks overlapping
      the MHC range on chr6 are removed first);
    * write the flagged SNPs to the gzip file ``annot.RV_filename(c)``.

    Args:
        args: namespace providing ``bfile_chr``, ``sannot_chr``, ``ld_blocks``,
            ``print_snps``, ``chroms`` and ``alpha``.
    """
    print('initializing...')
    import gc
    import gzip
    import time
    import numpy as np
    import pandas as pd
    import gprim.annotation as ga
    import gprim.dataset as gd
    import ypy.memo as memo

    # basic initialization
    mhc_bp = [25684587, 35455756]
    refpanel = gd.Dataset(args.bfile_chr)
    annots = [ga.Annotation(annot) for annot in args.sannot_chr]

    # read in ld blocks, remove MHC, read SNPs to print
    # (sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True)
    ldblocks = pd.read_csv(args.ld_blocks,
                           sep=r'\s+',
                           header=None,
                           names=['chr', 'start', 'end'])
    mhcblocks = (ldblocks.chr == 'chr6') & \
        (ldblocks.end > mhc_bp[0]) & \
        (ldblocks.start < mhc_bp[1])
    ldblocks = ldblocks[~mhcblocks]
    print(len(ldblocks), 'loci after removing MHC')
    print_snps = pd.read_csv(args.print_snps, header=None, names=['SNP'])
    print_snps['printsnp'] = True
    print(len(print_snps), 'print snps')

    # process annotations
    for annot in annots:
        t0 = time.time()
        for c in args.chroms:
            print(time.time() - t0, ': loading chr', c, 'of', args.chroms)

            # get refpanel snp metadata for this chromosome
            snps = refpanel.bim_df(c)
            snps = ga.smart_merge(snps, refpanel.frq_df(c)[['SNP', 'MAF']])
            print(len(snps), 'snps in refpanel', len(snps.columns),
                  'columns, including metadata')

            # read in annotation
            print('reading annot', annot.filestem())
            names = annot.names(c)  # names of annotations
            namesR = [n + '.R' for n in names]  # names of results
            a = annot.sannot_df(c)
            if 'SNP' in a.columns:
                print(
                    'not a thinannot => doing full reconciliation of snps and allele coding'
                )
                snps = ga.reconciled_to(snps, a, names, missing_val=0)
            else:
                print(
                    'detected thinannot, so assuming that annotation is synched to refpanel'
                )
                snps = pd.concat([snps, a[names]], axis=1)

            # add information on which snps to print
            print('merging in print_snps')
            snps = pd.merge(snps, print_snps, how='left', on='SNP')
            # Assign the result back explicitly: the previous chained in-place
            # fillna and the discarded astype(bool) did not reliably leave a
            # boolean column, which the masks below require.
            snps['printsnp'] = snps['printsnp'].fillna(False).astype(bool)

            # put on per-normalized-genotype scale
            if args.alpha != -1:
                print('scaling by maf according to alpha=', args.alpha)
                maf = snps.MAF.values
                snps[names] = snps[names].values * \
                    np.power(2 * maf * (1 - maf),
                             (1. + args.alpha) / 2)[:, None]

            # make room for RV and make sure annotation values are treated as floats
            snps = pd.concat([
                snps,
                pd.DataFrame(np.zeros(snps[names].shape), columns=namesR)
            ], axis=1)
            snps[names] = snps[names].astype(float)

            # compute simple statistics about annotation
            print('computing basic statistics and writing')
            info = pd.DataFrame(columns=[
                'M', 'M_5_50', 'sqnorm', 'sqnorm_5_50', 'supp', 'supp_5_50'
            ])
            info['name'] = names
            info.set_index('name', inplace=True)
            info['M'] = len(snps)
            info['sqnorm'] = np.linalg.norm(snps[names], axis=0)**2
            info['supp'] = np.linalg.norm(snps[names], ord=0, axis=0)
            # mask of SNPs with MAF >= 0.05
            M_5_50 = (snps.MAF >= 0.05).values
            info['M_5_50'] = M_5_50.sum()
            info['sqnorm_5_50'] = np.linalg.norm(snps.loc[M_5_50, names],
                                                 axis=0)**2
            info['supp_5_50'] = np.linalg.norm(snps.loc[M_5_50, names],
                                               ord=0, axis=0)
            info.to_csv(annot.info_filename(c), sep='\t')

            # process ldblocks one by one
            for ldblock, X, meta, ind in refpanel.block_data(ldblocks, c,
                                                            meta=snps):
                if meta.printsnp.sum() == 0:
                    print('no print-snps in this block')
                    continue
                print(meta.printsnp.sum(), 'print-snps')
                if (meta[names] == 0).values.all():
                    print('annotations are all 0 in this block')
                    snps.loc[ind, namesR] = 0
                else:
                    mask = meta.printsnp.values
                    V = meta[names].values
                    XV = X.dot(V)
                    # only the rows for print-snps are filled in
                    snps.loc[ind[mask], namesR] = \
                        X[:, mask].T.dot(XV[:, -len(names):]) / X.shape[0]

            # write
            print('writing output')
            with gzip.open(annot.RV_filename(c), 'w') as f:
                snps.loc[snps.printsnp,
                         ['SNP', 'A1', 'A2'] + names + namesR].to_csv(
                             f, index=False, sep='\t')

            # free per-chromosome data before loading the next chromosome
            del snps
            memo.reset()
            gc.collect()

    print('done')
def annotation(self):
    """Return the ``pa.Annotation`` located at ``paths.annotations + self.params.annot_chr``."""
    location = paths.annotations + self.params.annot_chr
    return pa.Annotation(location)
def required_files(self, s):
    """Return the LD-score filenames needed, one per (chromosome, annotation).

    Chromosomes 1 through 22 are always used, regardless of ``s``; the list is
    ordered chromosome-major over the annotations named in
    ``self.params.annot_chr``.
    """
    stems = self.params.annot_chr.split(',')
    annotations = [pa.Annotation(paths.annotations + stem) for stem in stems]

    needed = []
    for chrom in range(1, 23):
        for annotation in annotations:
            needed.append(annotation.ldscores_filename(chrom))
    return needed