def create_baseline_model(self):
    """Build the baseline-model annotation files, then submit LD-score jobs.

    Works in two passes over ``self.refpanel.chromosomes()``:

    1. For each chromosome, restrict every region listed in
       ``LDSC.baseline_model_regions`` to that chromosome and hand the
       resulting SNP subsets to ``SnpSubset.print_subsets`` together with
       ``self.baseline_filename(chrnum)``.
    2. For each chromosome, assemble an ``ldsc.py --l2`` command line that
       points at that annotation file and pass it to ``bsub.submit``.

    All annotation files are produced before the first job is submitted.
    """
    baseline_regions = []
    for region_name in LDSC.baseline_model_regions:
        baseline_regions.append(GenomicSubset(region_name))

    # Pass 1: create the annotation file for every chromosome.
    for chrom in self.refpanel.chromosomes():
        print('creating baseline annot file for chr', chrom)
        panel = Dataset(self.params.refpanel, chrnum=chrom)
        chrom_subsets = []
        for region in baseline_regions:
            chrom_bedtool = region.restricted_to_chrom_bedtool(chrom)
            chrom_subsets.append(SnpSubset(panel, chrom_bedtool))
        SnpSubset.print_subsets(
            self.baseline_filename(chrom),
            chrom_subsets,
            LDSC.baseline_model_regions)

    # Pass 2: create the ldscores file for every chromosome.
    for chrom in self.refpanel.chromosomes():
        panel = Dataset(self.params.refpanel, chrnum=chrom)
        command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py', '--l2']
        command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        command += ['--bfile', panel.genotypes_bedfile.filename]
        command += ['--annot', self.baseline_filename(chrom)]
        command += ['--out', self.baseline_l2_filestem(chrom)]
        print(' '.join(command))
        # The job's output goes next to the LD-score file stem.
        job_output = self.baseline_l2_filestem(chrom) + '.bsub_out'
        job_name = 'baseline,chr=' + str(chrom)
        bsub.submit(command, job_output, jobname=job_name)
def preprocess(self):
    """Prepare annotation files and submit LD-score jobs for the region.

    If the baseline model is requested (``self.params.baseline``) and its
    preprocessing has not already been declared in progress, that is
    declared and ``self.create_baseline_model()`` is run first.

    Then, for ``self.params.region``:

    1. every chromosome's SNP subset is passed to
       ``SnpSubset.print_subsets`` (with ``add_other=True``) along with
       ``self.annotation_filename(chrnum)``;
    2. an ``ldsc.py --l2`` command per chromosome is passed to
       ``bsub.submit``.
    """
    if self.params.baseline:
        if not self.baseline_preprocessing_in_progress():
            print('baseline model not found. creating...')
            self.declare_baseline_preprocessing_in_progress()
            self.create_baseline_model()

    print('submitting ld score jobs for annotation of interest')
    region = GenomicSubset(self.params.region)

    # Pass 1: create the annotation file for every chromosome.
    for chrom in self.refpanel.chromosomes():
        panel = Dataset(self.params.refpanel, chrnum=chrom)
        chrom_bedtool = region.restricted_to_chrom_bedtool(chrom)
        region_snps = SnpSubset(panel, chrom_bedtool)
        SnpSubset.print_subsets(
            self.annotation_filename(chrom),
            [region_snps],
            [self.params.region],
            add_other=True)

    # Pass 2: create the ldscores file for every chromosome.
    for chrom in self.refpanel.chromosomes():
        panel = Dataset(self.params.refpanel, chrnum=chrom)
        command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py', '--l2']
        command += ['--ld-wind-cm', str(self.params.ld_window / 1000.)]
        command += ['--bfile', panel.genotypes_bedfile.filename]
        command += ['--annot', self.annotation_filename(chrom)]
        command += ['--out', self.annotation_l2_filestem(chrom)]
        print(' '.join(command))
        # The job's output goes next to the LD-score file stem.
        job_output = self.annotation_l2_filestem(chrom) + '.bsub_out'
        job_name = self.preprocessing_foldername() + ',chr=' + str(chrom)
        bsub.submit(command, job_output, jobname=job_name)
def preprocess(self):
    """Compute and pickle the LD matrix ``R`` and its restriction ``RA``.

    Steps:

    1. Build the SNP subset ``A`` for ``self.params.region`` and expand it
       by ``self.params.ld_window / 1000.0`` to obtain ``W``.
    2. Compute ``R`` with ``BlockDiag.ld_matrix`` over ``W.irs.ranges()``
       and pickle it to ``self.R_file(mode="wb")``.
    3. Derive ``RA = R.zero_outside_irs(A.irs)`` and pickle it to
       ``self.RA_file(mode="wb")``.

    Both output handles are closed deterministically through ``with``
    blocks rather than being left for the garbage collector.
    """
    # Select the non-interactive Agg backend before anything plots.
    matplotlib.use("Agg")

    gs = GenomicSubset(self.params.region)
    A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
    W = A.expanded_by(self.params.ld_window / 1000.0)

    # NOTE(review): 300 with band_units="SNPs" is presumably the band width
    # of the LD matrix in SNPs -- confirm against BlockDiag.ld_matrix.
    R = BlockDiag.ld_matrix(self.refpanel, W.irs.ranges(), 300, band_units="SNPs")

    # Pickle protocol 2 is kept as in the original call.
    # NOTE(review): assumes R_file / RA_file return file objects usable as
    # context managers (true for built-in open()) -- confirm.
    with self.R_file(mode="wb") as r_out:
        pickle.dump(R, r_out, 2)
    # R.plot(A.irs, filename=self.R_plotfilename())

    RA = R.zero_outside_irs(A.irs)
    with self.RA_file(mode="wb") as ra_out:
        pickle.dump(RA, ra_out, 2)
def create_annot(args):
    """Create one ``.annot.gz`` file per autosome for a BED file.

    Args:
        args: object with two string attributes:
            ``bedfile`` -- path to the BED file, using ``/`` separators;
            ``refpanel`` -- reference-panel prefix, to which
            ``'.' + str(chrnum)`` is appended for each chromosome.

    The annotation name is the BED file's base name with a trailing
    ``.bed`` removed. Output files are named
    ``<dir>/<name>.<chrnum>.annot.gz`` and are placed in the BED file's
    directory. Chromosomes are processed from 22 down to 1.
    """
    # Split on the last '/'. A bare filename has no separator; the old
    # join-based code then produced '/' (the filesystem root), so fall
    # back to the current directory instead.
    directory, separator, filename = args.bedfile.rpartition('/')
    path = directory + '/' if separator else './'

    name = filename[:-4] if filename.endswith('.bed') else filename

    # NOTE(review): assumes GenomicSubset joins `path` and `name` by plain
    # concatenation, as the output filename below does -- confirm.
    gs = GenomicSubset(name, path=path)
    for chrnum in range(22, 0, -1):
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets(
            '{}{}.{}.annot.gz'.format(path, name, chrnum),
            sss,
            [name])
def compute_statistic(self, alphahat, R, RA, N, Nref, memoize=False):
    """Return ``scaling * (betahat . RA . betahat - bias)``.

    ``betahat`` is ``self.Radjreginv.dot(alphahat)``, where
    ``self.Radjreginv`` is the inverse of a ridge-regularised (and, per
    ``self.params.Radjust``, optionally adjusted) copy of ``R``.

    Args:
        alphahat: vector multiplied by the regularised inverse.
        R: matrix object providing ``add_ridge``, ``dot`` and
            ``adjusted_before_inversion``.
        RA: matrix object used in the quadratic form; replaced by a
            ridge-regularised copy when ``self.params.RAreg`` is truthy.
        N: divisor of the first bias term (``tr / N``).
        Nref: passed to ``adjusted_before_inversion`` when Radjust is
            "before", or to ``inv(Nadjust_after=...)`` when it is "after".
        memoize: when true and ``self`` already has a ``bias`` attribute,
            the inverse / bias / scaling computation is skipped and the
            values stored on ``self`` are reused.

    Side effects:
        Sets ``self.Radjreginv``, ``self.scaling`` and ``self.bias``
        whenever the memoised section runs.

    NOTE(review): this block reached review with its indentation lost. The
    nesting below is reconstructed from the data flow (``tr`` only exists
    inside the memoised section, so ``self.bias = ...`` must be in it).
    Whether the two trailing ``print`` calls belong inside or outside the
    memoised section cannot be determined from the text -- confirm.
    """
    # NOTE(review): `Rajd` looks like a typo for `Radj`; it is never read.
    # Every branch below assigns both Radj and Nadjust_after anyway.
    Rajd = Nadjust_after = None
    if self.params.Radjust == "after":
        # Adjustment is deferred to the inversion step (see inv() below).
        Nadjust_after = Nref
        Radj = R
    elif self.params.Radjust == "before":
        Nadjust_after = None
        Radj = R.adjusted_before_inversion(Nref)
    else:
        # Any other setting: no adjustment at all.
        Nadjust_after = None
        Radj = R

    if self.params.RAreg:
        print("regularizing RA")
        RA = RA.add_ridge(self.params.Lambda, renormalize=True)
        gs = GenomicSubset(self.params.region)
        A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
        # NOTE(review): the return value is discarded here. This only has
        # an effect if zero_outside_irs mutates RA in place -- confirm.
        RA.zero_outside_irs(A.irs)

    # Recompute unless memoisation is requested and a bias is already cached.
    if not memoize or not hasattr(self, "bias"):
        print("adding lambda")
        Radjreg = Radj.add_ridge(self.params.Lambda, renormalize=True)
        print("computing inverse")
        self.Radjreginv = Radjreg.inv(Nadjust_after=Nadjust_after)
        print("done.computing bias...")
        A = SnpSubset(self.refpanel, bedtool=GenomicSubset(self.params.region).bedtool)
        # W is only referenced by the commented-out bias formula below.
        W = self.window(A)
        if not self.params.avgunbiased:
            tr = self.Radjreginv.dot(RA).trace()
            self.scaling = 1
        else:
            tr = RA.dot(self.Radjreginv).dot(R).dot(self.Radjreginv).trace()
            Q = R.dot(self.Radjreginv).dot(RA).dot(self.Radjreginv).dot(R)
            # NOTE(review): return value discarded, as with RA above; the
            # trace below is only restricted to A if this mutates Q.
            Q.zero_outside_irs(A.irs)
            self.scaling = A.num_snps() / Q.trace()
        # Earlier bias formula, kept for reference:
        # self.bias = tr / N + \
        #     float(self.refpanel.M-len(W.irs))/self.refpanel.M * \
        #     self.params.sigma2g * tr / self.params.pop_size
        self.bias = tr / N + self.params.sigma2g * tr / self.params.pop_size
        print("\nbias =", self.bias)
        print("scaling =", self.scaling)

    betahat = self.Radjreginv.dot(alphahat)
    return self.scaling * (betahat.dot(RA.dot(betahat)) - self.bias)
def init(self):
    """Load the pickled matrices and derive the variance matrices.

    Populates, in this order:

    * ``self.Rri``, ``self.R``, ``self.RA`` -- unpickled from
      ``self.Rri_file()``, ``self.R_file()`` and ``self.RA_file()``;
    * ``self.A`` -- the SNP subset of ``self.refpanel`` for
      ``self.params.region``;
    * ``self.ZR`` -- unpickled from ``self.biasmatrix_file()``;
    * ``self.Q``, ``self.Z``, ``self.QZ``, ``self.QZR`` -- the four values
      returned by ``self.get_variance_matrices()``.

    Each input handle is closed as soon as its object has been loaded,
    instead of being left open until garbage collection.

    NOTE(review): ``pickle.load`` can execute arbitrary code. These files
    are presumably written by this pipeline's own preprocessing step; do
    not point the ``*_file`` helpers at untrusted data.
    NOTE(review): assumes the ``*_file`` helpers return file objects
    usable as context managers (true for built-in open()) -- confirm.
    """
    with self.Rri_file() as rri_in:
        self.Rri = pickle.load(rri_in)
    with self.R_file() as r_in:
        self.R = pickle.load(r_in)
    with self.RA_file() as ra_in:
        self.RA = pickle.load(ra_in)

    self.A = SnpSubset(self.refpanel, GenomicSubset(self.params.region).bedtool)

    with self.biasmatrix_file() as zr_in:
        self.ZR = pickle.load(zr_in)

    self.Q, self.Z, self.QZ, self.QZR = self.get_variance_matrices()