Example #1
0
    def preprocess(self):
        """Write region annotation files and submit one LD-score job per chromosome.

        If ``self.params.baseline`` is set and baseline preprocessing has not
        been flagged as in progress, the flag is set and the baseline model is
        created first. Afterwards, for every chromosome of the reference
        panel, an annotation file for ``self.params.region`` is written; only
        when all annotation files have been written are the ``ldsc.py --l2``
        jobs submitted through ``bsub``.
        """
        baseline_missing = (
            self.params.baseline
            and not self.baseline_preprocessing_in_progress())
        if baseline_missing:
            print('baseline model not found. creating...')
            self.declare_baseline_preprocessing_in_progress()
            self.create_baseline_model()

        print('submitting ld score jobs for annotation of interest')
        region_subset = GenomicSubset(self.params.region)

        # Pass 1: one annotation file per chromosome.
        for chrnum in self.refpanel.chromosomes():
            dataset = Dataset(self.params.refpanel, chrnum=chrnum)
            chrom_bedtool = region_subset.restricted_to_chrom_bedtool(chrnum)
            snp_subset = SnpSubset(dataset, chrom_bedtool)
            SnpSubset.print_subsets(
                self.annotation_filename(chrnum),
                [snp_subset],
                [self.params.region],
                add_other=True)

        # Pass 2: one ldsc job per chromosome, built flag by flag.
        for chrnum in self.refpanel.chromosomes():
            dataset = Dataset(self.params.refpanel, chrnum=chrnum)
            ldscores_command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py']
            ldscores_command += ['--l2']
            ldscores_command += [
                '--ld-wind-cm', str(self.params.ld_window / 1000.)]
            ldscores_command += ['--bfile', dataset.genotypes_bedfile.filename]
            ldscores_command += ['--annot', self.annotation_filename(chrnum)]
            ldscores_command += ['--out', self.annotation_l2_filestem(chrnum)]
            print(' '.join(ldscores_command))

            # bsub's own output goes next to the LD-score output files.
            outfilepath = self.annotation_l2_filestem(chrnum) + '.bsub_out'
            jobname = self.preprocessing_foldername() + ',chr=' + str(chrnum)
            bsub.submit(ldscores_command, outfilepath, jobname=jobname)
Example #2
0
    def preprocess(self):
        """Prepare annotation files for the region and queue LD-score jobs.

        When a baseline model is requested (``self.params.baseline``) and its
        preprocessing is not yet flagged as in progress, the flag is raised
        and the baseline model is built before anything else. Then every
        chromosome of the reference panel gets an annotation file, and after
        that loop has finished one ``ldsc.py --l2`` job per chromosome is
        handed to ``bsub.submit``.
        """
        if self.params.baseline:
            if not self.baseline_preprocessing_in_progress():
                print('baseline model not found. creating...')
                self.declare_baseline_preprocessing_in_progress()
                self.create_baseline_model()

        print('submitting ld score jobs for annotation of interest')
        region = GenomicSubset(self.params.region)

        # create the annotation file
        for chrnum in self.refpanel.chromosomes():
            panel = Dataset(self.params.refpanel, chrnum=chrnum)
            region_snps = SnpSubset(
                panel, region.restricted_to_chrom_bedtool(chrnum))
            annot_path = self.annotation_filename(chrnum)
            SnpSubset.print_subsets(
                annot_path, [region_snps], [self.params.region],
                add_other=True)

        # create the ldscores file
        for chrnum in self.refpanel.chromosomes():
            panel = Dataset(self.params.refpanel, chrnum=chrnum)
            ldscores_command = [
                'python', '-u', paths.foreign + 'ldsc/ldsc.py',
                '--l2',
                '--ld-wind-cm', str(self.params.ld_window / 1000.),
                '--bfile', panel.genotypes_bedfile.filename,
                '--annot', self.annotation_filename(chrnum),
                '--out', self.annotation_l2_filestem(chrnum),
            ]
            print(' '.join(ldscores_command))
            outfilepath = self.annotation_l2_filestem(chrnum) + '.bsub_out'
            job_label = self.preprocessing_foldername() + ',chr=' + str(chrnum)
            bsub.submit(ldscores_command, outfilepath, jobname=job_label)
Example #3
0
    def compute_statistic(self, alphahat, R, RA, N, Nref, memoize=False):
        """Return ``scaling * (betahat . RA . betahat - bias)``.

        ``betahat`` is ``self.Radjreginv.dot(alphahat)``, where
        ``self.Radjreginv`` is the inverse of the ridge-regularised ``R``,
        adjusted before or after inversion according to
        ``self.params.Radjust``.

        Args:
            alphahat: vector passed to ``self.Radjreginv.dot``.
            R: matrix object providing ``add_ridge``, ``dot`` and
                ``adjusted_before_inversion``.
            RA: matrix object used in the quadratic form. When
                ``self.params.RAreg`` is set it is replaced by a
                ridge-regularised version that is zeroed outside the region.
            N: divisor of the first bias term (``tr / N``).
            Nref: passed to ``R.adjusted_before_inversion`` when
                ``Radjust == 'before'``, or as ``Nadjust_after`` to ``inv``
                when ``Radjust == 'after'``.
            memoize: when true and ``self.bias`` already exists, the stored
                ``self.Radjreginv``, ``self.bias`` and ``self.scaling`` are
                reused instead of being recomputed.

        Returns:
            The scaled, bias-corrected value of the quadratic form.
        """
        # At most one adjustment is active: 'after' defers it to the
        # inversion, 'before' applies it to R up front, anything else
        # leaves R untouched.
        radjust = self.params.Radjust
        Nadjust_after = Nref if radjust == 'after' else None
        Radj = R.adjusted_before_inversion(Nref) if radjust == 'before' else R

        if self.params.RAreg:
            print('regularizing RA')
            RA = RA.add_ridge(self.params.Lambda, renormalize=True)
            gs = GenomicSubset(self.params.region)
            A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
            RA.zero_outside_irs(A.irs)

        if not memoize or not hasattr(self, 'bias'):
            print('adding lambda')
            Radjreg = Radj.add_ridge(self.params.Lambda, renormalize=True)
            print('computing inverse')
            self.Radjreginv = Radjreg.inv(Nadjust_after=Nadjust_after)

            print('done.computing bias...')
            A = SnpSubset(self.refpanel,
                          bedtool=GenomicSubset(self.params.region).bedtool)
            # NOTE(review): W is only referenced by the disabled bias formula
            # below; the call is kept in case self.window has side effects --
            # confirm and remove if it does not.
            W = self.window(A)
            if not self.params.avgunbiased:
                tr = self.Radjreginv.dot(RA).trace()
                self.scaling = 1
            else:
                tr = RA.dot(self.Radjreginv).dot(R).dot(
                    self.Radjreginv).trace()
                Q = R.dot(self.Radjreginv).dot(RA).dot(self.Radjreginv).dot(R)
                Q.zero_outside_irs(A.irs)
                self.scaling = A.num_snps() / Q.trace()
            # self.bias = tr / N + \
            #         float(self.refpanel.M-len(W.irs))/self.refpanel.M * \
            #             self.params.sigma2g * tr / self.params.pop_size
            self.bias = tr / N + \
                        self.params.sigma2g * tr / self.params.pop_size
            print('\nbias =', self.bias)
            print('scaling =', self.scaling)

        betahat = self.Radjreginv.dot(alphahat)

        return self.scaling * (betahat.dot(RA.dot(betahat)) - self.bias)
Example #4
0
def create_annot(args):
    """Write one ``.annot.gz`` file per chromosome (22 down to 1) for a BED file.

    Args:
        args: namespace providing ``bedfile`` (``/``-separated path of the
            region file) and ``refpanel`` (prefix to which ``.<chrnum>`` is
            appended to name each per-chromosome dataset).

    Output files are named ``<dir><name>.<chrnum>.annot.gz`` where ``<dir>``
    is the directory part of ``args.bedfile`` (including the trailing ``/``)
    and ``<name>`` is the file name without a trailing ``.bed``.
    """
    # rpartition yields an empty separator when there is no directory part,
    # so a bare file name maps to '' (current directory) instead of the
    # filesystem root '/' that unconditionally appending '/' produced.
    # NOTE(review): assumes GenomicSubset builds its file name as
    # path + name -- confirm it accepts an empty path.
    directory, separator, filename = args.bedfile.rpartition('/')
    path = directory + separator
    name = filename[:-4] if filename.endswith('.bed') else filename

    gs = GenomicSubset(name, path=path)
    for chrnum in range(22, 0, -1):
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets('{}{}.{}.annot.gz'.format(path, name, chrnum),
                                sss, [name])
Example #5
0
def create_annot(args):
    """Write one ``.annot.gz`` file per chromosome (22 down to 1) for a BED file.

    Args:
        args: namespace providing ``bedfile`` (``/``-separated path of the
            region file) and ``refpanel`` (prefix to which ``.<chrnum>`` is
            appended to name each per-chromosome dataset).

    Output files are named ``<dir><name>.<chrnum>.annot.gz`` where ``<dir>``
    is the directory part of ``args.bedfile`` (including the trailing ``/``)
    and ``<name>`` is the file name without a trailing ``.bed``.
    """
    # rpartition yields an empty separator when there is no directory part,
    # so a bare file name maps to '' (current directory) instead of the
    # filesystem root '/' that unconditionally appending '/' produced.
    # NOTE(review): assumes GenomicSubset builds its file name as
    # path + name -- confirm it accepts an empty path.
    directory, separator, filename = args.bedfile.rpartition('/')
    path = directory + separator
    name = filename[:-4] if filename.endswith('.bed') else filename

    gs = GenomicSubset(name, path=path)
    for chrnum in range(22, 0, -1):
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets('{}{}.{}.annot.gz'.format(path, name, chrnum),
                                sss, [name])
Example #6
0
    def create_baseline_model(self):
        """Create baseline annotation files and submit baseline LD-score jobs.

        For every chromosome of the reference panel, one annotation file
        covering all regions in ``LDSC.baseline_model_regions`` is written.
        After that loop has finished, one ``ldsc.py --l2`` job per chromosome
        is submitted through ``bsub``.
        """
        region_subsets = []
        for region in LDSC.baseline_model_regions:
            region_subsets.append(GenomicSubset(region))

        # Pass 1: annotation files, one column set per baseline region.
        for chrnum in self.refpanel.chromosomes():
            print('creating baseline annot file for chr', chrnum)
            dataset = Dataset(self.params.refpanel, chrnum=chrnum)
            snp_subsets = []
            for subset in region_subsets:
                chrom_bedtool = subset.restricted_to_chrom_bedtool(chrnum)
                snp_subsets.append(SnpSubset(dataset, chrom_bedtool))
            SnpSubset.print_subsets(
                self.baseline_filename(chrnum),
                snp_subsets,
                LDSC.baseline_model_regions)

        # Pass 2: LD-score jobs, command assembled flag by flag.
        for chrnum in self.refpanel.chromosomes():
            dataset = Dataset(self.params.refpanel, chrnum=chrnum)
            ldscores_command = ['python', '-u', paths.foreign + 'ldsc/ldsc.py']
            ldscores_command += ['--l2']
            ldscores_command += [
                '--ld-wind-cm', str(self.params.ld_window / 1000.)]
            ldscores_command += ['--bfile', dataset.genotypes_bedfile.filename]
            ldscores_command += ['--annot', self.baseline_filename(chrnum)]
            ldscores_command += ['--out', self.baseline_l2_filestem(chrnum)]
            print(' '.join(ldscores_command))

            outfilepath = self.baseline_l2_filestem(chrnum) + '.bsub_out'
            jobname = 'baseline,chr=' + str(chrnum)
            bsub.submit(ldscores_command, outfilepath, jobname=jobname)
Example #7
0
    def preprocess(self, use_filesystem=True):
        """Compute (or load) the covariance matrices and all derived matrices.

        Populates ``self.R``, ``self.Rri``, ``self.A``, ``self.RA``,
        ``self.Z``, ``self.Q``, ``self.scalings``, ``self.ZR``, ``self.QZ``
        and ``self.QZR``.

        Args:
            use_filesystem: when true, ``R`` and ``Rri`` are loaded from their
                pickle files if the corresponding "in progress" check
                succeeds, and every computed result is persisted (pickle
                protocol 2, ``set_scalings``, ``save_variance_matrices``).
                When false, ``R`` and ``Rri`` are always recomputed and
                nothing is declared or written by this method.
        """
        # NOTE(review): the file objects returned by the *_file() helpers are
        # never closed explicitly here -- confirm whether the helpers or the
        # garbage collector are relied on for that.
        # Covariance matrix R: compute unless preprocessing was already
        # declared and the filesystem may be used.
        if not self.covariance_preprocessing_in_progress(
        ) or not use_filesystem:
            print('creating covariance matrix...')
            if use_filesystem:
                self.declare_covariance_preprocessing_in_progress()
            self.R = self.compute_covariance()
            if use_filesystem:
                pickle.dump(self.R, self.R_file(mode='wb'), 2)
        else:
            print('loading covariance matrix')
            self.R = pickle.load(self.R_file())

        # Inverse covariance matrix Rri: same compute-or-load logic as R.
        if not self.invcovariance_preprocessing_in_progress(
        ) or not use_filesystem:
            print('creating inverse covariance matrix')
            if use_filesystem:
                self.declare_invcovariance_preprocessing_in_progress()
            self.Rri = self.compute_invcovariance()
            if use_filesystem:
                pickle.dump(self.Rri, self.Rri_file(mode='wb'), 2)
        else:
            print('loading inverse covariance matrix')
            self.Rri = pickle.load(self.Rri_file())

        # The timer starts only here, so the elapsed times printed below
        # exclude the R / Rri stage above.
        t0 = time.time()
        print(time.time() - t0, ': creating and saving RA')
        self.A = SnpSubset(self.refpanel,
                           GenomicSubset(self.params.region).bedtool)
        # RA is a copy of R zeroed outside the SNPs of the region (in place
        # on the copy, so self.R itself is left untouched).
        self.RA = self.R.copy()
        self.RA.zero_outside_irs(self.A.irs)
        if use_filesystem:
            pickle.dump(self.RA, self.RA_file(mode='wb'), 2)

        print(time.time() - t0, ': computing and saving scaling')
        # Z = Rri . RA . Rri and Q = R . Z . R
        self.Z = self.Rri.dot(self.RA.dot(self.Rri))
        self.Q = self.R.dot(self.Z).dot(self.R)
        QA = self.Q.copy()
        QA.zero_outside_irs(self.A.irs)
        # One scaling per range r of QA: (number of region SNPs inside r)
        # divided by the trace of QA's array for r.
        self.scalings = {
            r:
            len(self.A.irs & IntRangeSet(r)) / np.trace(QA.ranges_to_arrays[r])
            for r in QA.ranges()
        }
        print(time.time() - t0, ': scalings are', self.scalings)
        if use_filesystem:
            self.set_scalings(self.scalings)

        print(time.time() - t0, ': computing and saving bias matrix')
        # ZR = RA . Rri . R . Rri, written to the bias-matrix file.
        self.ZR = self.RA.dot(self.Rri).dot(self.R).dot(self.Rri)
        if use_filesystem:
            pickle.dump(self.ZR, self.biasmatrix_file(mode='wb'), 2)

        print(time.time() - t0, ': variance matrices')
        # QZ = Q . Z and QZR = QZ . R
        self.QZ = self.Q.dot(self.Z)
        self.QZR = self.QZ.dot(self.R)
        if use_filesystem:
            self.save_variance_matrices(self.Q, self.Z, self.QZ, self.QZR)
        print(time.time() - t0, ': done')
Example #8
0
 def init(self):
     """Load previously saved matrices and rebuild the region SNP subset.

     Unpickles ``Rri``, ``R`` and ``RA`` from their files, constructs
     ``self.A`` from ``self.params.region``, unpickles the bias matrix into
     ``self.ZR`` and finally fetches ``Q``, ``Z``, ``QZ`` and ``QZR`` from
     ``self.get_variance_matrices()``.
     """
     # Same load order as the attributes are listed: Rri, R, RA.
     pickled_attributes = (
         ('Rri', self.Rri_file),
         ('R', self.R_file),
         ('RA', self.RA_file),
     )
     for attribute, open_file in pickled_attributes:
         setattr(self, attribute, pickle.load(open_file()))

     region_bedtool = GenomicSubset(self.params.region).bedtool
     self.A = SnpSubset(self.refpanel, region_bedtool)

     self.ZR = pickle.load(self.biasmatrix_file())

     variance_matrices = self.get_variance_matrices()
     self.Q, self.Z, self.QZ, self.QZR = variance_matrices
Example #9
0
 def chunks_containing_region(self):
     """Return the chunk number of each LD block that contains region SNPs.

     Builds a ``SnpPartition`` of the reference panel from the breakpoints
     file (with ``remove_mhc=True``), stores the region's ``SnpSubset`` in
     ``self.A`` as a side effect, and maps every block index ``i`` returned
     by ``blocks.indices_containing(self.A.irs)`` to
     ``int(i / chunk_size)``.

     Returns:
         list of int, one entry per block index (duplicates are kept).
     """
     breakpoints = BedTool(paths.reference + self.params.breakpointsfile)
     blocks = SnpPartition(self.refpanel, breakpoints, remove_mhc=True)
     self.A = SnpSubset(self.refpanel,
                        GenomicSubset(self.params.region).bedtool)
     # The chunk size does not depend on the block index, so it is computed
     # once instead of once per element of the comprehension.
     chunk_size = self.chunk_size(blocks.ranges())
     return [
         int(i / chunk_size)
         for i in blocks.indices_containing(self.A.irs)
     ]
Example #10
0
 def preprocess(self):
     """Compute the LD matrix over the region's SNP ranges and pickle it.

     The matrix is built by ``BlockDiag.ld_matrix`` from the reference panel,
     the ranges of the SNPs in ``self.params.region`` and a bandwidth of
     ``self.params.ld_bandwidth / 1000.``; it is written to
     ``self.RA_file(mode='wb')`` with pickle protocol 2.
     """
     matplotlib.use('Agg')
     gs = GenomicSubset(self.params.region)
     ss = SnpSubset(self.refpanel, bedtool=gs.bedtool)
     RA = BlockDiag.ld_matrix(self.refpanel, ss.irs.ranges(),
                              self.params.ld_bandwidth / 1000.)
     # Plotting is disabled: RA.plot(ss.irs, filename=self.RA_plotfilename()).
     # The former try/except around it only wrapped `pass` and has been
     # removed; if plotting is re-enabled, guard it with a narrow except so
     # that a plotting failure cannot prevent the matrix from being saved.
     pickle.dump(RA, self.RA_file(mode='wb'), 2)
Example #11
0
 def preprocess(self):
     """Compute and pickle the LD matrix around the region and its restriction.

     The region's SNP subset is expanded by ``self.params.ld_window / 1000.``
     and an LD matrix with a band of 300 SNPs is computed over the expanded
     ranges and written to ``self.R_file``. The result of zeroing that
     matrix outside the region's SNPs is written to ``self.RA_file``. Both
     files use pickle protocol 2.
     """
     matplotlib.use('Agg')
     region = GenomicSubset(self.params.region)
     region_snps = SnpSubset(self.refpanel, bedtool=region.bedtool)
     window_snps = region_snps.expanded_by(self.params.ld_window / 1000.)

     window_ranges = window_snps.irs.ranges()
     ld = BlockDiag.ld_matrix(
         self.refpanel, window_ranges, 300, band_units='SNPs')
     pickle.dump(ld, self.R_file(mode='wb'), 2)
     # ld.plot(region_snps.irs, filename=self.R_plotfilename())

     # NOTE(review): this relies on zero_outside_irs returning the zeroed
     # matrix; other callers use it purely for its in-place effect --
     # confirm it does not return None.
     ld_in_region = ld.zero_outside_irs(region_snps.irs)
     pickle.dump(ld_in_region, self.RA_file(mode='wb'), 2)
Example #12
0
 def preprocess(self):
     """Compute and pickle the LD matrix over the window and its restriction.

     An LD matrix is computed over the SNP ranges of ``self.window(A)``,
     where ``A`` is the SNP subset of ``self.params.region``, using a
     bandwidth of 1000000 (effectively unbounded), and written to
     ``self.R_file``. The result of zeroing it outside ``A`` is written to
     ``self.RA_file``. Both files use pickle protocol 2.
     """
     matplotlib.use('Agg')
     gs = GenomicSubset(self.params.region)
     A = SnpSubset(self.refpanel, bedtool=gs.bedtool)
     W = self.window(A)
     R = BlockDiag.ld_matrix(self.refpanel, W.irs.ranges(),
                             1000000)  # bandwidth=infty
     pickle.dump(R, self.R_file(mode='wb'), 2)
     # Plotting is disabled: R.plot(A.irs, filename=self.R_plotfilename()).
     # The former try/except around it only wrapped `pass` and has been
     # removed; if plotting is re-enabled, guard it with a narrow except so
     # that a plotting failure cannot prevent RA from being saved.
     # NOTE(review): this relies on zero_outside_irs returning the zeroed
     # matrix -- confirm it does not return None.
     RA = R.zero_outside_irs(A.irs)
     pickle.dump(RA, self.RA_file(mode='wb'), 2)
Example #13
0
    def run(self, beta_num, sim):
        """Run the ``Hsq`` regression on every summary-statistics vector.

        Args:
            beta_num: forwarded to ``sim.sumstats_files``.
            sim: object providing ``dataset`` (passed to ``Dataset``) and
                ``sumstats_files(beta_num)`` (iterable of effect-estimate
                arrays).

        Returns:
            Array with one row per summary-statistics vector and two columns:
            ``hsqhat.coef.dot(overlaps)`` and
            ``overlaps.dot(hsqhat.coef_cov).dot(overlaps)``.
        """
        print('loading data set and region info')
        dataset = Dataset(sim.dataset)
        region = GenomicSubset(self.params.region)
        # NOTE(review): the SNP subset is built but never used below; it is
        # kept because its construction may have side effects -- confirm.
        region_snps = SnpSubset(dataset, bedtool=region.bedtool)

        print('loading ld score info')
        ref_ldscores, w_ld, M_annot = self.ld_score_info()
        # Column vector holding the sample size once per SNP.
        N = np.ones((dataset.M, 1)) * dataset.N

        shape_report = ('ref_ldscores shape:{}\nw_ld shape:{}\nN shape:{}\n' +
                        'M_annot shape:{}')
        print(shape_report.format(ref_ldscores.shape, w_ld.shape, N.shape,
                                  M_annot.shape))

        overlaps = self.overlap_vector()
        print('num snps overlapping with each category:', overlaps)
        results = []
        variances = []
        for alphahat in sim.sumstats_files(beta_num):
            # Scale the squared estimates by the sample size and make them a
            # column vector for the regression.
            scaled_sq = dataset.N * alphahat**2
            column = scaled_sq.reshape((dataset.M, 1))

            # The intercept is pinned to 1 only when requested.
            extra = {}
            if self.params.constrain_intercept:
                extra['intercept'] = 1
            hsqhat = ldsc.ldscore.regressions.Hsq(
                column, ref_ldscores, w_ld, N, M_annot, **extra)

            estimate = hsqhat.coef.dot(overlaps)
            variance = overlaps.dot(hsqhat.coef_cov).dot(overlaps)
            results.append(estimate)
            variances.append(variance)
            print('intercept:', hsqhat.intercept)
            print(len(results), estimate, variance)

        estimate_column = np.array([results]).T
        variance_column = np.array([variances]).T
        return np.concatenate([estimate_column, variance_column], axis=1)
Example #14
0
    # Command-line interface: reference panel, LD-block BED file (relative to
    # paths.reference), region of interest and summary-statistics path.
    parser = argparse.ArgumentParser()
    parser.add_argument('--refpanel', type=str, required=True)
    parser.add_argument('--ldblocks',
                        type=str,
                        required=False,
                        default='pickrell_ldblocks.hg19.eur.bed')
    parser.add_argument('--region', type=str, required=True)
    parser.add_argument('--sumstats_path', type=str, required=True)

    args = parser.parse_args()

    print('loading reference panel')
    refpanel = Dataset(args.refpanel)

    print('loading region')
    A = GenomicSubset(args.region)

    print('loading ld blocks')
    blocks = BedTool(paths.reference + args.ldblocks)

    print('finding ld blocks that overlap with A')
    # wa=True keeps the original LD-block intervals rather than the
    # intersected pieces.
    relevant_blocks = blocks.intersect(A.bedtool, wa=True).saveas()
    print('found', len(relevant_blocks), 'blocks that overlap with A')

    print('reading refpanel bim')
    refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim',
                               sep='\t',
                               names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
    refpanel_bim['INDEX'] = np.arange(len(refpanel_bim))
    # Indicator column for membership in region A: start at 0 everywhere and
    # flag only the SNPs inside A. Initialising the column to 1 made it
    # constant, so the assignment below had no effect.
    refpanel_bim['A'] = 0
    # .ix was removed in pandas 1.0; with the default integer index created
    # by read_csv, label-based .loc addresses the same rows.
    # NOTE(review): assumes the irs object is accepted as a list-like of
    # integer row labels -- confirm.
    refpanel_bim.loc[SnpSubset(refpanel, A.bedtool).irs, 'A'] = 1
Example #15
0

# Ad-hoc self-test: builds a small LD matrix on a subset of the 'GERA'
# dataset and prints sanity information about it.
if __name__ == '__main__':
    from primitives import Dataset, GenomicSubset, SnpSubset
    import copy
    from time import time
    import argparse
    # Fixed seed so the random draws below are reproducible.
    np.random.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument('--M', type=int, required=True, help='the number of SNPs to use')
    # NOTE(review): single-dash long option; it is not read in the portion of
    # the script visible here.
    parser.add_argument('-check_dense', action='store_true', default=False)
    args = parser.parse_args()

    # Dataset limited to args.M SNPs, 200 random individuals, and the SNP
    # subset of the region named '50' plus a copy expanded by 0.01.
    d = Dataset('GERA', forced_M=args.M)
    indivs = d.random_indivs(200)
    tiny_gs = GenomicSubset('50')
    tiny_ss = SnpSubset(d, bedtool=tiny_gs.bedtool)
    tiny_buffered_ss = tiny_ss.expanded_by(0.01)

    t0 = time()
    # NOTE(review): the bandwidth passed is 0.01 while the trailing comment
    # says 1 cM -- presumably the unit is Morgans; confirm.
    R = BlockDiag.ld_matrix(d, tiny_buffered_ss.irs.ranges(), 0.01, indivs=indivs) # 1 cM bandwidth
    R = R.add_ridge(0.05, renormalize=True)
    # NOTE(review): the last printed value is an exact == comparison between
    # the trace and the SNP count, although the message only expects them to
    # be close; the message text is also missing its closing parenthesis.
    print('trace of renormalized R should be close to M (with noise due to sample vs pop LD',
            R.trace(), tiny_buffered_ss.num_snps(),
            R.trace() == tiny_buffered_ss.num_snps())
    print('computing R took', time() - t0)
    print('shape of R is:', R.shape())

    # RA: copy of R zeroed (in place on the copy) outside the unbuffered SNPs.
    RA = R.copy()
    RA.zero_outside_irs(tiny_ss.irs)
    # Random block-diagonal vector of length d.M laid out on R's ranges.
    b = BlockDiag.from_big1darray(np.random.randn(d.M), R.ranges())