def main(args):
    """Draw effect sizes and write noiseless phenotypes for one beta replicate.

    Seeds numpy's RNG with ``args.beta_num``, draws effect sizes from the
    simulation's architecture, accumulates ``X.dot(beta)`` over the dataset's
    SNP slices, then rescales both the phenotypes and the effect sizes by
    ``std(Y) / sqrt(h2g)`` so that the phenotype variance equals ``h2g``.
    Both arrays are pickled with protocol 2.

    ``args`` must provide ``sim_name`` and ``beta_num``.
    """
    np.random.seed(args.beta_num)
    sim = SumstatSimulation(args.sim_name)
    arch = Architecture(sim.architecture)
    d = Dataset(sim.dataset)

    # sample the beta
    beta = arch.draw_effect_sizes(sim.dataset, sim.h2g)[:, 0]

    # compute noiseless phenotypes slice by slice
    Y = np.zeros(d.N)
    t0 = time()
    for s in d.slices():
        # X will be N x M
        print(int(time() - t0), ": getting genotypes from file. SNPs", s)
        X = d.get_standardized_genotypes(s)
        print("computing phenotypes. SNPs", s)
        Y += X.dot(beta[s[0] : s[1]])
        # release the genotype slice before the next one is loaded
        del X

    # normalize the Y and the beta to the desired heritability
    normalization = np.std(Y) / np.sqrt(sim.h2g)
    if normalization == 0:
        normalization = 1  # just in case we have some 0s...
    Y /= normalization
    beta /= normalization

    # write the betas and the noiseless phenotypes; the handles are closed
    # explicitly so the data is flushed even on interpreters without
    # reference counting.
    # NOTE(review): assumes beta_file / noiseless_Y_file return ordinary open
    # file objects (context managers) - confirm against SumstatSimulation.
    with sim.beta_file(args.beta_num, "wb") as f:
        pickle.dump(beta, f, 2)
    with sim.noiseless_Y_file(args.beta_num, "wb") as f:
        pickle.dump(Y, f, 2)
def preprocess(self):
    """Create annotation files for the region of interest and submit LD score jobs.

    When a baseline model is requested and baseline preprocessing has not
    already been declared in progress, it is declared and the baseline model
    is built first. Then, for every chromosome of the reference panel, an
    annotation file is written; afterwards one ``ldsc.py --l2`` job per
    chromosome is submitted through ``bsub``.
    """
    needs_baseline = (self.params.baseline
                      and not self.baseline_preprocessing_in_progress())
    if needs_baseline:
        print('baseline model not found. creating...')
        self.declare_baseline_preprocessing_in_progress()
        self.create_baseline_model()

    print('submitting ld score jobs for annotation of interest')
    gs = GenomicSubset(self.params.region)

    # create the annotation file
    for chrnum in self.refpanel.chromosomes():
        d = Dataset(self.params.refpanel, chrnum=chrnum)
        chrom_subset = SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))
        SnpSubset.print_subsets(
            self.annotation_filename(chrnum),
            [chrom_subset],
            [self.params.region],
            add_other=True)

    # create the ldscores file
    for chrnum in self.refpanel.chromosomes():
        d = Dataset(self.params.refpanel, chrnum=chrnum)
        ldscores_command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py']
        ldscores_command += ['--l2']
        ldscores_command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        ldscores_command += ['--bfile', d.genotypes_bedfile.filename]
        ldscores_command += ['--annot', self.annotation_filename(chrnum)]
        ldscores_command += ['--out', self.annotation_l2_filestem(chrnum)]
        print(' '.join(ldscores_command))

        outfilepath = self.annotation_l2_filestem(chrnum) + '.bsub_out'
        jobname = self.preprocessing_foldername() + ',chr=' + str(chrnum)
        bsub.submit(ldscores_command, outfilepath, jobname=jobname)
def main(args):
    """Draw effect sizes and write noiseless phenotypes for one beta replicate.

    Seeds numpy's RNG with ``args.beta_num``, draws effect sizes from the
    simulation's architecture, accumulates ``X.dot(beta)`` over the dataset's
    SNP slices, then rescales both the phenotypes and the effect sizes by
    ``std(Y) / sqrt(h2g)`` so that the phenotype variance equals ``h2g``.
    Both arrays are pickled with protocol 2.

    ``args`` must provide ``sim_name`` and ``beta_num``.
    """
    np.random.seed(args.beta_num)
    sim = SumstatSimulation(args.sim_name)
    arch = Architecture(sim.architecture)
    d = Dataset(sim.dataset)

    # sample the beta
    beta = arch.draw_effect_sizes(sim.dataset, sim.h2g)[:, 0]

    # compute noiseless phenotypes slice by slice
    Y = np.zeros(d.N)
    t0 = time()
    for s in d.slices():
        # X will be N x M
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)
        print('computing phenotypes. SNPs', s)
        Y += X.dot(beta[s[0]:s[1]])
        # release the genotype slice before the next one is loaded
        del X

    # normalize the Y and the beta to the desired heritability
    normalization = np.std(Y) / np.sqrt(sim.h2g)
    if normalization == 0:
        normalization = 1  # just in case we have some 0s...
    Y /= normalization
    beta /= normalization

    # write the betas and the noiseless phenotypes; the handles are closed
    # explicitly so the data is flushed even on interpreters without
    # reference counting.
    # NOTE(review): assumes beta_file / noiseless_Y_file return ordinary open
    # file objects (context managers) - confirm against SumstatSimulation.
    with sim.beta_file(args.beta_num, 'wb') as f:
        pickle.dump(beta, f, 2)
    with sim.noiseless_Y_file(args.beta_num, 'wb') as f:
        pickle.dump(Y, f, 2)
def create_baseline_model(self):
    """Write baseline annotation files and submit baseline LD score jobs.

    For every chromosome of the reference panel, one annotation file covering
    all regions in ``LDSC.baseline_model_regions`` is written; afterwards one
    ``ldsc.py --l2`` job per chromosome is submitted through ``bsub``.
    """
    gss = []
    for region in LDSC.baseline_model_regions:
        gss.append(GenomicSubset(region))

    # create the annotation file
    for chrnum in self.refpanel.chromosomes():
        print('creating baseline annot file for chr', chrnum)
        d = Dataset(self.params.refpanel, chrnum=chrnum)
        sss = []
        for gs in gss:
            sss.append(SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum)))
        SnpSubset.print_subsets(
            self.baseline_filename(chrnum), sss, LDSC.baseline_model_regions)

    # create the ldscores file
    for chrnum in self.refpanel.chromosomes():
        d = Dataset(self.params.refpanel, chrnum=chrnum)
        ldscores_command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py']
        ldscores_command += ['--l2']
        ldscores_command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        ldscores_command += ['--bfile', d.genotypes_bedfile.filename]
        ldscores_command += ['--annot', self.baseline_filename(chrnum)]
        ldscores_command += ['--out', self.baseline_l2_filestem(chrnum)]
        print(' '.join(ldscores_command))

        outfilepath = self.baseline_l2_filestem(chrnum) + '.bsub_out'
        jobname = 'baseline,chr=' + str(chrnum)
        bsub.submit(ldscores_command, outfilepath, jobname=jobname)
def main(args):
    """Simulate noisy phenotypes and marginal summary statistics for one sample.

    Loads the noiseless phenotypes of beta replicate ``args.beta_num``, draws
    ``sim.sample_size`` individuals (``np.random.choice`` default, i.e. with
    replacement), adds Gaussian noise of variance ``1 - h2g``, optionally
    projects covariates out of both phenotypes and genotypes, and computes
    ``alphahat = X^T Y / sample_size`` slice by slice. The chosen individuals,
    the noisy phenotypes and the summary statistics are pickled (protocol 2).

    ``args`` must provide ``sim_name``, ``beta_num`` and ``sample_num``.
    """
    np.random.seed(args.beta_num + args.sample_num * 10000)
    sim = SumstatSimulation(args.sim_name)
    d = Dataset(sim.dataset)
    pretty.print_namespace(sim)
    print()

    # read in noiseless phenotypes
    # NOTE(review): pickle must only be used on trusted files; this one is
    # presumably produced by the companion beta-generation step - confirm.
    # NOTE(review): assumes the *_file helpers return ordinary open file
    # objects (context managers) - confirm against SumstatSimulation.
    with sim.noiseless_Y_file(args.beta_num) as f:
        Y = pickle.load(f)

    # choose individuals and create ensemble of Ys
    indices = np.random.choice(Y.shape[0], size=(sim.sample_size,))
    Y = Y[indices]

    # compute how much noise to add
    sigma2e = 1 - sim.h2g
    print('adding noise. sigma2e =', sigma2e)
    Y += np.sqrt(sigma2e) * np.random.randn(*Y.shape)

    # the covariates of the sampled individuals are the same for every slice,
    # so index them once
    covariates = None
    if sim.condition_on_covariates:
        covariates = d.covariates[indices]
        print('projecting covariates out of Y')
        Y = d.project_out_covariates(Y, covariates=covariates)

    alphahat = np.zeros(d.M)
    t0 = time()
    # An explicit loop is required here: on Python 3 ``map`` is lazy, so
    # ``map(f, d.slices())`` without consuming the result would never run
    # the per-slice computation and alphahat would stay all zeros.
    for s in d.slices():
        # X will be N x M
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)[indices]
        if sim.condition_on_covariates:
            print(int(time() - t0), ': projecting out covariates')
            X = d.project_out_covariates(X, covariates=covariates)
        print(int(time() - t0), ': computing sumstats. SNPs', s)
        alphahat[s[0]:s[1]] = X.T.dot(Y) / sim.sample_size
        # release the genotype slice before the next one is loaded
        del X

    # write output
    with sim.individuals_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(indices, f, 2)
    with sim.noisy_Y_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(Y, f, 2)
    with sim.sumstats_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(alphahat, f, 2)
def main(args):
    """Restrict an annotation to SNPs with reference-panel MAF of at most 1%.

    Reads ``<annot_stem>.<chrnum>.annot.gz``, merges in the MAF column of the
    reference panel's ``.frq`` file, sets the annotation's last column to 0
    wherever ``MAF > 0.01``, and writes the original columns to
    ``<annot_stem>.maf1p.<chrnum>.annot.gz``.

    ``args`` must provide ``refpanel``, ``chrnum`` and ``annot_stem``.
    """
    refpanel = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    result_filename = '{}.maf1p.{}.annot.gz'.format(args.annot_stem,
                                                    args.chrnum)

    print('reading annot')
    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t',
                        header=0)
    # the annotation of interest is the last column; remember the original
    # column set now so that the merged-in MAF column is not written out
    name = annot.columns[-1]
    names = annot.columns.values

    print('reading frq')
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True
    refpanel_frq = pd.read_csv(refpanel.genotypes_bedfile.filename + '.frq',
                               sep=r'\s+', header=0)
    refpanel_frq = refpanel_frq[['SNP', 'MAF']]

    print('merging')
    annot = annot.merge(refpanel_frq, how='left', on=['SNP'])

    print('before filtering, |A| =', np.sum(annot[name]))
    # .loc replaces DataFrame.ix, which was removed in pandas 1.0; SNPs with
    # no MAF after the left merge compare False and are left untouched
    annot.loc[annot['MAF'] > 0.01, name] = 0
    print('after filtering, |A| =', np.sum(annot[name]))

    # write output
    print('writing output to', result_filename)
    with gzip.open(result_filename, 'wt') as f:
        annot[names].to_csv(f, index=False, sep='\t')
def main(args):
    """Simulate noisy phenotypes and marginal summary statistics for one sample.

    Loads the noiseless phenotypes of beta replicate ``args.beta_num``, draws
    ``sim.sample_size`` individuals (``np.random.choice`` default, i.e. with
    replacement), adds Gaussian noise of variance ``1 - h2g``, optionally
    projects covariates out of both phenotypes and genotypes, and computes
    ``alphahat = X^T Y / sample_size`` slice by slice. The chosen individuals,
    the noisy phenotypes and the summary statistics are pickled (protocol 2).

    ``args`` must provide ``sim_name``, ``beta_num`` and ``sample_num``.
    """
    np.random.seed(args.beta_num + args.sample_num * 10000)
    sim = SumstatSimulation(args.sim_name)
    d = Dataset(sim.dataset)
    pretty.print_namespace(sim)
    print()

    # read in noiseless phenotypes
    # NOTE(review): pickle must only be used on trusted files; this one is
    # presumably produced by the companion beta-generation step - confirm.
    # NOTE(review): assumes the *_file helpers return ordinary open file
    # objects (context managers) - confirm against SumstatSimulation.
    with sim.noiseless_Y_file(args.beta_num) as f:
        Y = pickle.load(f)

    # choose individuals and create ensemble of Ys
    indices = np.random.choice(Y.shape[0], size=(sim.sample_size,))
    Y = Y[indices]

    # compute how much noise to add
    sigma2e = 1 - sim.h2g
    print('adding noise. sigma2e =', sigma2e)
    Y += np.sqrt(sigma2e) * np.random.randn(*Y.shape)

    # the covariates of the sampled individuals are the same for every slice,
    # so index them once
    covariates = None
    if sim.condition_on_covariates:
        covariates = d.covariates[indices]
        print('projecting covariates out of Y')
        Y = d.project_out_covariates(Y, covariates=covariates)

    alphahat = np.zeros(d.M)
    t0 = time()
    # An explicit loop is required here: on Python 3 ``map`` is lazy, so
    # ``map(f, d.slices())`` without consuming the result would never run
    # the per-slice computation and alphahat would stay all zeros.
    for s in d.slices():
        # X will be N x M
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)[indices]
        if sim.condition_on_covariates:
            print(int(time() - t0), ': projecting out covariates')
            X = d.project_out_covariates(X, covariates=covariates)
        print(int(time() - t0), ': computing sumstats. SNPs', s)
        alphahat[s[0]:s[1]] = X.T.dot(Y) / sim.sample_size
        # release the genotype slice before the next one is loaded
        del X

    # write output
    with sim.individuals_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(indices, f, 2)
    with sim.noisy_Y_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(Y, f, 2)
    with sim.sumstats_file(args.beta_num, args.sample_num, 'wb') as f:
        pickle.dump(alphahat, f, 2)
def main(args):
    """Write an annotation marking reference-panel SNPs with MAF of at most 1%.

    Merges the reference panel's ``.bim`` and ``.frq`` files on ``SNP``, adds
    a 0/1 column named after the last path component of ``args.annot_stem``
    (1 where ``MAF <= 0.01``), and writes the columns
    ``CHR, BP, SNP, CM, <name>`` to ``<annot_stem>.<chrnum>.annot.gz``.

    ``args`` must provide ``refpanel``, ``chrnum`` and ``annot_stem``.
    """
    result_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    name = args.annot_stem.split('/')[-1]

    print('reading refpanel bim')
    refpanel = Dataset(args.refpanel + '.' + str(args.chrnum))
    refpanel_bim = pd.read_csv(
        refpanel.genotypes_bedfile.filename + '.bim',
        sep='\t',
        names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
    print('\tthere are', len(refpanel_bim), 'SNPs')

    print('reading frq')
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True
    refpanel_frq = pd.read_csv(refpanel.genotypes_bedfile.filename + '.frq',
                               sep=r'\s+', header=0)
    refpanel_frq = refpanel_frq[['SNP', 'MAF']]

    print('merging')
    annot = refpanel_bim.merge(refpanel_frq, how='left', on=['SNP'])
    annot[name] = 0

    print('before filtering, |A| =', np.sum(annot[name]))
    to_include = np.flatnonzero((annot['MAF'] <= 0.01).values)
    # Disabled experiment: subsample to a fixed number of SNPs per chromosome.
    # sizes = [27285, 26794, 23253, 18865, 20449, 17280, 15776, 13198,
    #          14079, 15070, 14247, 12971, 9039, 11219, 8688, 11506,
    #          10187, 7603, 7804, 6765, 3747, 4021]
    # to_include = to_include[np.random.choice(len(to_include), replace=False,
    #                                          size=sizes[args.chrnum-1])]
    # .loc replaces DataFrame.ix, which was removed in pandas 1.0. The merge
    # on a column yields a fresh 0..n-1 index, so the positions returned by
    # flatnonzero are also valid row labels.
    annot.loc[to_include, name] = 1
    print('after filtering, |A| =', np.sum(annot[name]))

    annot.rename(columns={'cM': 'CM'}, inplace=True)
    names = ['CHR', 'BP', 'SNP', 'CM', name]

    # write output
    print('writing output')
    with gzip.open(result_filename, 'wt') as f:
        annot[names].to_csv(f, index=False, sep='\t')
def submit(args):
    """Submit ``real/preprocess.py main`` as the job array ``preprocess[1-22]``.

    ``args`` is accepted for interface compatibility but not used.
    """
    d = Dataset('UK10Khg19.22')
    # '%I' and '$LSB_JOBINDEX' are passed through verbatim; presumably the
    # scheduler substitutes the array index for them - confirm against bsub.
    outfilepath = d.auxfiles_path + '../' + '%I/.preprocess.out'
    command = ['python', '-u', paths.code + 'real/preprocess.py']
    command.extend(['main', '--chrom', '$LSB_JOBINDEX'])
    bsub.submit(command,
                outfilepath,
                jobname='preprocess[1-22]',
                memory_GB=16)
def get_refpanel(args):
    """Load the per-chromosome reference panel and its ``.bim`` table.

    Returns a ``(refpanel, refpanel_bim)`` pair, where ``refpanel_bim`` is the
    tab-separated ``.bim`` file read into a DataFrame with an extra
    ``refpanelINDEX`` column holding each row's 0-based position.

    ``args`` must provide ``refpanel`` and ``chrnum``.
    """
    print('reading refpanel and refpanel bim')
    dataset_name = args.refpanel + '.' + str(args.chrnum)
    refpanel = Dataset(dataset_name)

    bim_path = refpanel.genotypes_bedfile.filename + '.bim'
    bim_columns = ['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2']
    refpanel_bim = pd.read_csv(bim_path, sep='\t', names=bim_columns)

    num_snps = len(refpanel_bim)
    refpanel_bim['refpanelINDEX'] = np.arange(num_snps)
    print('\trefpanel contains', len(refpanel_bim), 'SNPs')
    return refpanel, refpanel_bim
def main(args):
    """Convolve an annotation with the reference panel's LD matrix.

    With ``v`` the last column of ``<annot_stem>.<chrnum>.annot.gz`` and ``X``
    the genotype matrix returned by ``get_standardized_genotypes``, computes
    ``Rv = X^T X v / N`` in two passes over the SNP slices, plus the scalar
    ``v^T R v``. ``Rv`` is written as an extra ``<name>.CONV`` column to
    ``<annot_stem>.<chrnum>.cannot.gz`` and the scalar to
    ``<annot_stem>.<chrnum>.cannot.norm``.

    ``args`` must provide ``refpanel``, ``chrnum`` and ``annot_stem``.
    """
    d = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    cannot_filename = '{}.{}.cannot.gz'.format(args.annot_stem, args.chrnum)
    cannot_norm_filename = '{}.{}.cannot.norm'.format(args.annot_stem,
                                                      args.chrnum)

    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t',
                        header=0)
    name = annot.columns[-1]
    # plain column selection replaces DataFrame.ix, removed in pandas 1.0
    v = annot[name].values

    # TODO: use ld blocks, possibly just those that have non-trivial
    # intersection with the nonzero entries of v
    print('computing Xv')
    Xv = np.zeros(d.N)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        Xv += X.dot(v[s[0]:s[1]])

    print('computing XTXv')
    XTXv = np.zeros(d.M)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        XTXv[s[0]:s[1]] = X.T.dot(Xv)

    print('computing V^TRv')
    Rv = XTXv / d.N
    vTRv = v.dot(Rv)

    # write output
    print('writing output')
    annot[name + '.CONV'] = Rv
    with gzip.open(cannot_filename, 'wt') as f:
        annot.to_csv(f, index=False, sep='\t')
    with open(cannot_norm_filename, 'w') as f:
        f.write(str(vTRv))
def create_annot(args):
    """Write one annotation file per autosome for the regions in a bed file.

    The annotation is named after the bed file (with a trailing ``.bed``
    removed) and the outputs ``<name>.<chrnum>.annot.gz`` are written next to
    the bed file, for chromosomes 22 down to 1.

    ``args`` must provide ``bedfile`` and ``refpanel``.
    """
    # Split into directory (with trailing '/') and file name.
    dirname, sep, filename = args.bedfile.rpartition('/')
    # A bare file name has no directory part; joining an empty directory with
    # '/' would point at the filesystem root, so use the current directory.
    path = dirname + '/' if sep else './'
    name = filename[:-4] if filename.endswith('.bed') else filename

    gs = GenomicSubset(name, path=path)
    for chrnum in range(22, 0, -1):
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets(
            '{}{}.{}.annot.gz'.format(path, name, chrnum), sss, [name])
def run(self, beta_num, sim):
    """Run the ``Hsq`` regression on every summary-statistics replicate.

    For each array yielded by ``sim.sumstats_files(beta_num)``, forms
    ``N * alphahat**2``, fits ``ldsc.ldscore.regressions.Hsq`` (with
    ``intercept=1`` when ``self.params.constrain_intercept`` is set), and
    records the coefficient estimate dotted with the overlap vector together
    with the matching quadratic form in the coefficient covariance.

    Returns the estimates and variances concatenated along axis 1.
    """
    print('loading data set and region info')
    d = Dataset(sim.dataset)
    gs = GenomicSubset(self.params.region)
    ss = SnpSubset(d, bedtool=gs.bedtool)

    print('loading ld score info')
    ref_ldscores, w_ld, M_annot = self.ld_score_info()
    N = np.ones((d.M, 1)) * d.N
    shape_report = ('ref_ldscores shape:{}\nw_ld shape:{}\nN shape:{}\n'
                    'M_annot shape:{}')
    print(shape_report.format(ref_ldscores.shape, w_ld.shape, N.shape,
                              M_annot.shape))

    overlaps = self.overlap_vector()
    print('num snps overlapping with each category:', overlaps)

    results, variances = [], []
    for alphahat in sim.sumstats_files(beta_num):
        chisq = (d.N * alphahat**2).reshape((d.M, 1))
        # only pass ``intercept`` when it is constrained, otherwise Hsq keeps
        # its own default
        extra = {'intercept': 1} if self.params.constrain_intercept else {}
        hsqhat = ldsc.ldscore.regressions.Hsq(
            chisq, ref_ldscores, w_ld, N, M_annot, **extra)

        results.append(hsqhat.coef.dot(overlaps))
        variances.append(overlaps.dot(hsqhat.coef_cov).dot(overlaps))
        print('intercept:', hsqhat.intercept)
        print(len(results), results[-1], variances[-1])

    estimates_col = np.array([results]).T
    variances_col = np.array([variances]).T
    return np.concatenate([estimates_col, variances_col], axis=1)
def main(args):
    """Convolve an annotation with the reference panel's LD matrix.

    With ``v`` the last column of ``<annot_stem>.<chrnum>.annot.gz`` and ``X``
    the genotype matrix returned by ``get_standardized_genotypes``, computes
    ``Rv = X^T X v / N`` in two passes over the SNP slices, plus the scalar
    ``v^T R v``. ``Rv`` is written as an extra ``<name>.CONV`` column to
    ``<annot_stem>.<chrnum>.cannot.gz`` and the scalar to
    ``<annot_stem>.<chrnum>.cannot.norm``.

    ``args`` must provide ``refpanel``, ``chrnum`` and ``annot_stem``.
    """
    d = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    cannot_filename = '{}.{}.cannot.gz'.format(args.annot_stem, args.chrnum)
    cannot_norm_filename = '{}.{}.cannot.norm'.format(args.annot_stem,
                                                      args.chrnum)

    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t',
                        header=0)
    name = annot.columns[-1]
    # plain column selection replaces DataFrame.ix, removed in pandas 1.0
    v = annot[name].values

    # TODO: use ld blocks, possibly just those that have non-trivial
    # intersection with the nonzero entries of v
    print('computing Xv')
    Xv = np.zeros(d.N)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        Xv += X.dot(v[s[0]:s[1]])

    print('computing XTXv')
    XTXv = np.zeros(d.M)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        XTXv[s[0]:s[1]] = X.T.dot(Xv)

    print('computing V^TRv')
    Rv = XTXv / d.N
    vTRv = v.dot(Rv)

    # write output
    print('writing output')
    annot[name + '.CONV'] = Rv
    with gzip.open(cannot_filename, 'wt') as f:
        annot.to_csv(f, index=False, sep='\t')
    with open(cannot_norm_filename, 'w') as f:
        f.write(str(vTRv))
self.add_ridge(Lambda) self.covcsr /= (1 + Lambda) if __name__ == '__main__': from primitives import Dataset, GenomicSubset, SnpSubset import copy from time import time import argparse np.random.seed(0) parser = argparse.ArgumentParser() parser.add_argument('--M', type=int, required=True, help='the number of SNPs to use') parser.add_argument('-check_dense', action='store_true', default=False) args = parser.parse_args() d = Dataset('GERA', forced_M=args.M) indivs = d.random_indivs(200) t0 = time() R = LdMatrix(d, indivs, 200) R.add_ridge(0.05) print('computing R took', time() - t0) print('shape of R is:', R.covcsr.shape) # tiny = GenomicSubset('tiny') # tiny_irs = SnpSubset(d, bedtool=tiny.bedtool).irs tiny_irs = IntRangeSet('300:350') RA = LdMatrix(d, indivs, 200, snpset_irs=tiny_irs, output=False) b = np.random.randn(d.M) # check inverse computation
if __name__ == '__main__':
    # ---- command-line interface ----
    parser = argparse.ArgumentParser()
    parser.add_argument('--refpanel', type=str, required=True)
    parser.add_argument(
        '--ldblocks',
        type=str,
        required=False,
        default='pickrell_ldblocks.hg19.eur.bed')
    parser.add_argument('--region', type=str, required=True)
    parser.add_argument('--sumstats_path', type=str, required=True)
    args = parser.parse_args()

    # ---- inputs: reference panel, region of interest, LD blocks ----
    print('loading reference panel')
    refpanel = Dataset(args.refpanel)

    print('loading region')
    A = GenomicSubset(args.region)

    print('loading ld blocks')
    blocks = BedTool(paths.reference + args.ldblocks)

    # keep only the LD blocks that intersect the region
    print('finding ld blocks that overlap with A')
    relevant_blocks = blocks.intersect(A.bedtool, wa=True).saveas()
    print('found', len(relevant_blocks), 'blocks that overlap with A')

    # ---- SNP table of the reference panel ----
    print('reading refpanel bim')
    refpanel_bim = pd.read_csv(
        refpanel.genotypes_bedfile.filename + '.bim',
        sep='\t',
        names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
if __name__ == '__main__':
    # Ad-hoc driver: build LD matrices on a GERA subset for manual checks.
    from primitives import Dataset, GenomicSubset, SnpSubset
    import copy
    from time import time
    import argparse

    # fixed seed so repeated runs draw the same individuals and vector b
    np.random.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--M',
        type=int,
        required=True,
        help='the number of SNPs to use')
    parser.add_argument('-check_dense', action='store_true', default=False)
    args = parser.parse_args()

    d = Dataset('GERA', forced_M=args.M)
    indivs = d.random_indivs(200)

    # time the construction of the full LD matrix
    t0 = time()
    R = LdMatrix(d, indivs, 200)
    R.add_ridge(0.05)
    print('computing R took', time() - t0)
    print('shape of R is:', R.covcsr.shape)

    # tiny = GenomicSubset('tiny')
    # tiny_irs = SnpSubset(d, bedtool=tiny.bedtool).irs
    tiny_irs = IntRangeSet('300:350')
    RA = LdMatrix(d, indivs, 200, snpset_irs=tiny_irs, output=False)
    b = np.random.randn(d.M)

    # check inverse computation
def refpanel(self):
    """Return the reference-panel ``Dataset``, constructing it on first use.

    The instance is cached in ``self.__refpanel`` so later calls reuse it.
    """
    # fast path: already constructed
    if self.__refpanel is not None:
        return self.__refpanel
    self.__refpanel = Dataset(self.params.refpanel)
    return self.__refpanel
print(time.time()-t0, 'variance is {} + {} + {} = {}'.format(variance1, variance2, variance3, variance)) print('zscore:', point_estimate / np.sqrt(variance)) return point_estimate, variance, R, RA if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--refpanel', type=str, required=True) parser.add_argument('--ldblocks', type=str, required=False, default='pickrell_ldblocks.hg19.eur.bed') parser.add_argument('--region', type=str, required=True) parser.add_argument('--sumstats_path', type=str, required=True) args = parser.parse_args() print('loading reference panel') refpanel = Dataset(args.refpanel) print('loading region') A = GenomicSubset(args.region) print('loading ld blocks') blocks = BedTool(paths.reference + args.ldblocks) print('finding ld blocks that overlap with A') relevant_blocks = blocks.intersect(A.bedtool, wa=True).saveas() print('found', len(relevant_blocks), 'blocks that overlap with A') print('reading refpanel bim') refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim', sep='\t', names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])