Example #1
0
    def preprocess(self, s):
        """Submit an `acor.py conv` job for every missing convolution file.

        One `pa.Annotation` is built per comma-separated entry of
        `self.params.annot_chr`. For each annotation and each chromosome in
        `s.chromosomes`, a job is handed to `bsub.submit` unless the file
        named by `conv_filename` already exists on disk.
        """
        print('Acor is preprocessing', s.name, 'with refpanel=',
              self.params.refpanel)
        print(self.params)

        annots = []
        for aname in self.params.annot_chr.split(','):
            annots.append(pa.Annotation(paths.annotations + aname))

        for annot in annots:
            print('preprocessing', annot.filestem())
            for chrom in s.chromosomes:
                target = annot.conv_filename(chrom, full=self.params.fullconv)
                if os.path.exists(target):
                    # Convolution output is already there; nothing to submit.
                    continue

                conv_command = ['python', '-u', paths.code + 'acor/acor.py']
                conv_command.extend(['--annot-chr', annot.stem_chr])
                conv_command.extend(['--bfile-chr', self.refpanel.bfile_chr])
                if self.params.fullconv:
                    conv_command.append('-fullconv')
                conv_command.extend(['conv', '--chroms', str(chrom)])
                print(' '.join(conv_command))

                # Log file of the submitted job: "<stem>.[full]convbsub_out".
                if self.params.fullconv:
                    log_suffix = 'fullconvbsub_out'
                else:
                    log_suffix = 'convbsub_out'
                outfilepath = annot.filestem(chrom) + '.' + log_suffix

                job_label = self.params.annot_chr.replace('/', '_') + \
                    ',conv,chr=' + str(chrom)
                bsub.submit(conv_command, outfilepath, jobname=job_label)
Example #2
0
 def required_files(self, s):
     """List the convolution files this estimator needs.

     The result is ordered chromosome by chromosome (over `s.chromosomes`)
     and, within a chromosome, by annotation in the order given by the
     comma-separated `self.params.annot_chr`.
     """
     annots = []
     for aname in self.params.annot_chr.split(','):
         annots.append(pa.Annotation(paths.annotations + aname))

     needed = []
     for chrom in s.chromosomes:
         for annot in annots:
             needed.append(
                 annot.conv_filename(chrom, full=self.params.fullconv))
     return needed
Example #3
0
    def run(self, s, beta_num):
        """Run `acor.py cor` on one set of summary statistics and parse it.

        Builds the command line from `self.params`, runs it with
        `subprocess.call`, then reads `<result_filename>.results` and picks
        the row whose annotation name equals `self.params.coeff`.

        Args:
            s: simulation object; `name`, `chromosomes` and
                `sumstats_filename(beta_num)` are used.
            beta_num: index passed through to `s.sumstats_filename` and
                `self.result_filename`.

        Returns:
            A one-row DataFrame with columns 'ESTIMATE', 'STDERR', 'P'.

        Raises:
            ValueError: if `self.params.print` is not 'all', 'num' or
                'denom'.
        """
        print('Acor is running', s.name, 'on beta', beta_num, 'with refpanel=',
              self.params.refpanel)
        print(self.params)

        # Validate up front: previously an unknown mode only surfaced as an
        # UnboundLocalError after the (expensive) subprocess had finished.
        estimate_columns = {'all': 'MU_EST', 'num': 'TOP', 'denom': 'BOTTOM'}
        if self.params.print not in estimate_columns:
            raise ValueError(
                "params.print must be one of {}, got {!r}".format(
                    sorted(estimate_columns), self.params.print))

        annots = [
            pa.Annotation(paths.annotations + aname)
            for aname in self.params.annot_chr.split(',')
        ]

        # NOTE(review): all annotation stems are passed as ONE space-joined
        # argv element; presumably acor.py splits it itself - confirm.
        cmd = [
            'python', '-u', paths.code + 'acor/acor.py',
            '--annot-chr', ' '.join([a.stem_chr for a in annots]),
            '--bfile-chr', self.refpanel.bfile_chr,
        ]
        if self.params.fullconv:
            cmd.append('-fullconv')
        cmd += [
            'cor',
            '--ldscores-chr', self.refpanel.bfile_chr,
            '--sumstats', s.sumstats_filename(beta_num),
            '--out', self.result_filename(s, beta_num),
            self.params.kind,
        ]
        if self.params.regvar:
            cmd.append('-reg-var')
        if not self.params.weights:
            cmd.append('-noweights')
        if self.params.biascorrect:
            cmd.append('-biascorrect')
        if self.params.maf_thresh > 0:
            cmd += ['--maf-thresh', str(self.params.maf_thresh)]
        cmd += ['--chroms'] + [str(c) for c in s.chromosomes]
        print(' '.join(cmd))
        subprocess.call(cmd)

        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True.
        acorresults = pd.read_csv(self.result_filename(s, beta_num) +
                                  '.results',
                                  sep=r'\s+',
                                  header=0)

        # Row of the requested coefficient: position of params.coeff in the
        # concatenated annotation names (taken from the last chromosome).
        rowindex = np.where(
            np.concatenate([a.names(s.chromosomes[-1])
                            for a in annots]) == self.params.coeff)[0][0]
        estimate = acorresults[estimate_columns[self.params.print]][rowindex]
        stderr = acorresults['MU_STDERR'][rowindex]
        pval = acorresults['MU_P'][rowindex]

        return pd.DataFrame(columns=['ESTIMATE', 'STDERR', 'P'],
                            data=[[estimate, stderr, pval]])
Example #4
0
    def run(self, s, beta_num):
        """Run `ldsc.py --h2` on one set of summary statistics and parse it.

        Builds the command line from `self.params`, runs it with
        `subprocess.call`, then reads `<result_filename>.results` and picks
        the row whose annotation name equals `self.params.coeff`.

        Args:
            s: simulation object; `name` and `sumstats_filename(beta_num)`
                are used.
            beta_num: index passed through to `s.sumstats_filename` and
                `self.result_filename`.

        Returns:
            A one-row DataFrame with columns 'ESTIMATE', 'STDERR', 'P',
            filled from the 'Prop._h2', 'Prop._h2_std_error' and
            'Enrichment_p' columns of the results file.
        """
        print('LDSC is running', s.name, 'on beta', beta_num, 'with refpanel=',
              self.params.refpanel)
        print(self.params)
        annots = [
            pa.Annotation(paths.annotations + aname)
            for aname in self.params.annot_chr.split(',')
        ]

        if self.params.constrain_intercept:
            extra_args = ['--no-intercept']
        else:
            extra_args = []
        cmd = [
            'python', '-u', paths.foreign + 'ldsc/ldsc.py', '--h2',
            s.sumstats_filename(beta_num), '--ref-ld-chr', ','.join(
                [a.stem_chr
                 for a in annots]), '--w-ld-chr', self.refpanel.bfile_chr,
            '--overlap-annot', '--print-coefficients', '--chisq-max', '999999',
            '--frqfile-chr', self.refpanel.bfile_chr, '--out',
            self.result_filename(s, beta_num)
        ] + extra_args
        print(' '.join(cmd))
        subprocess.call(cmd)

        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True.
        ldscresults = pd.read_csv(self.result_filename(s, beta_num) +
                                  '.results',
                                  sep=r'\s+',
                                  header=0)

        # Row of the requested coefficient: position of params.coeff in the
        # concatenated annotation names, read from chromosome 22.
        rowindex = np.where(
            np.concatenate([a.names(22)
                            for a in annots]) == self.params.coeff)[0][0]
        estimate = ldscresults['Prop._h2'][rowindex]
        stderr = ldscresults['Prop._h2_std_error'][rowindex]
        pval = ldscresults['Enrichment_p'][rowindex]

        return pd.DataFrame(columns=['ESTIMATE', 'STDERR', 'P'],
                            data=[[estimate, stderr, pval]])
Example #5
0
    def preprocess(self, s):
        """Submit an `ldsc.py --l2` job for every missing LD-score file.

        Iterates over the comma-separated annotations in
        `self.params.annot_chr` and over chromosomes 1..22; a job is handed
        to `bsub.submit` whenever the file named by `ldscores_filename`
        does not exist yet.
        """
        print('LDSC is preprocessing', s.name, 'with refpanel=',
              self.params.refpanel)
        print(self.params)

        for annot_chr in self.params.annot_chr.split(','):
            annot = pa.Annotation(paths.annotations + annot_chr)
            for chrom in range(1, 23):
                if os.path.exists(annot.ldscores_filename(chrom)):
                    # LD scores already computed for this chromosome.
                    continue

                ldscores_command = [
                    'python', '-u', paths.foreign + 'ldsc/ldsc.py', '--l2']
                ldscores_command.extend(
                    ['--ld-wind-cm', str(self.params.ld_window)])
                ldscores_command.extend(
                    ['--bfile', self.refpanel.bfile(chrom)])
                ldscores_command.extend(
                    ['--annot', annot.annot_filename(chrom)])
                ldscores_command.extend(['--out', annot.filestem(chrom)])
                print(' '.join(ldscores_command))

                outfilepath = annot.filestem(chrom) + '.ldscoresbsub_out'
                job_label = self.params.annot_chr.replace('/', '_') + \
                    ',ldcores,chr=' + str(chrom)
                bsub.submit(ldscores_command, outfilepath, jobname=job_label)
Example #6
0
def write(folder, args, name, background_names, mux, muy, z, corr_thresh=0.8):
    """Save scatter plots of residual Rv vs. residual Z for selected windows.

    SNP metadata, summary statistics and annotations are re-read for all
    chromosomes in `args.chroms`. The background annotations are regressed
    out of both the focal annotation and the effect estimates using the
    supplied coefficients, and fixed-size SNP windows containing a
    significant SNP are scanned. Each window whose squared correlation
    between the two residuals is at least `corr_thresh`, with the same sign
    as `z`, is saved as '<folder>/chr<CHR>:<start>-<end>.pdf'.

    Args:
        folder: directory prefix for the output PDFs.
        args: namespace providing `bfile_reg_chr`, `sannot_chr`,
            `background_sannot_chr`, `chroms` and `pss_chr`.
        name: column name of the focal annotation.
        background_names: columns of the assembled SNP table used as the
            background design matrix.
        mux: coefficients subtracted (via the background matrix) from Rv.
        muy: coefficients subtracted (via the background matrix) from ahat.
        z: only its sign is used, to select the correlation direction.
        corr_thresh: minimum squared correlation for a window to be kept.

    Returns:
        None. Returns early, writing nothing, when there are no significant
        SNPs or no candidate windows (these cases used to raise).

    Raises:
        IndexError: if no annotation in `args.sannot_chr` contains `name`.
    """
    print('STORYTELLING for ', name, 'z=', z)
    refpanel = gd.Dataset(args.bfile_reg_chr)
    annot = [ga.Annotation(a) for a in args.sannot_chr
            if name in ga.Annotation(a).names(22, RV=True)][0]

    backgroundannots = [ga.Annotation(a) for a in args.background_sannot_chr]
    print('focal annotation columns:', annot.names(22, True))
    print('background annotations:', background_names)

    # get refpanel snp metadata
    print('re-reading snps')
    snps = pd.concat([refpanel.bim_df(c) for c in args.chroms], axis=0)

    # read sumstats
    print('re-reading sumstats')
    ss = pd.concat([
        pd.read_csv(args.pss_chr+str(c)+'.pss.gz', sep='\t', usecols=['N','Winv_ahat_I'])
        for c in args.chroms])
    snps['ahat'] = ss['Winv_ahat_I']
    snps['N'] = ss['N']
    del ss

    # read annotations
    print('re-reading background annotations')
    for a in backgroundannots:
        mynames = [n for n in a.names(22, RV=True) if '.R' in n] #names of annotations
        snps = pd.concat([snps,
            pd.concat([a.RV_df(c)[mynames] for c in args.chroms], axis=0)
            ], axis=1)

    print('reading focal annotation')
    snps = pd.concat([snps,
        pd.concat([annot.RV_df(c)[name] for c in args.chroms], axis=0)
        ], axis=1)

    print('residualizing background out of focal')
    A = snps[background_names]
    snps['chi2'] = snps.N * snps.ahat**2
    snps['Rv'] = snps[name]
    snps['ahat_resid'] = snps.ahat - A.values.dot(muy)
    snps['Rv_resid'] = snps.Rv - A.values.dot(mux)
    # SNPs without a residual effect estimate are dropped before windowing.
    snps['typed'] = snps.ahat_resid.notnull()
    snps = snps[snps.typed].reset_index(drop=True)
    # chi2 cutoff for significance (presumably p = 5e-8 at 1 d.o.f. -
    # TODO confirm).
    snps['significant'] = snps.chi2 > 29.716785
    print(snps.significant.sum(), 'genome-wide significant SNPs')

    # np.concatenate below raises on an empty list, so stop here when
    # there is nothing to build windows around.
    if not snps.significant.any():
        print('no genome-wide significant SNPs; nothing to plot')
        return

    print('searching for good windows')
    # get endpoints of windows
    stride = 20
    windowsize_in_strides = 5
    windowsize = stride * windowsize_in_strides

    # find all starting points of windows containing GWAS-sig SNPs
    starts = np.concatenate([
                [ int(i/stride)*stride - k*stride
                    for k in range(0, windowsize_in_strides)]
                for i in np.where(snps.significant)[0]])
    starts = np.array(sorted(list(set(starts))))
    # compute corresponding endpoints
    ends = starts + windowsize

    # truncate any windows that extend past the ends of the genome
    # (each filter is applied to `starts` and `ends` with the same mask so
    # the two arrays stay aligned)
    starts = starts[ends < len(snps)]
    ends = ends[ends < len(snps)]
    ends = ends[starts >= 0]
    starts = starts[starts >= 0]

    print(len(starts), 'windows with GWAS hits found')

    # With zero windows the array below would be 1-D and empty, which
    # cannot be given four column names.
    if len(starts) == 0:
        print('no windows left after truncation; nothing to plot')
        return

    # compute correlations
    numbers = pd.DataFrame(
            np.array([[i,j,
                np.max(snps.iloc[i:j].chi2),
                np.corrcoef(snps.iloc[i:j].Rv_resid, snps.iloc[i:j].ahat_resid)[0,1]]
                for i,j in zip(starts, ends)]),
            columns=['start','end','maxchi2','corr'])

    # keep only cases with strong correlations in the right direction
    numbers = numbers[numbers['corr']**2 >= corr_thresh]
    numbers = numbers[np.sign(numbers['corr']) == np.sign(z)]
    print(len(numbers), 'windows with GWAS hits and squared correlation with Rv >=',
            corr_thresh, 'in the right direction')
    for i,j in zip(numbers.start, numbers.end):
        # start/end went through a float array above; restore ints for iloc
        i = int(i); j = int(j)
        print('saving', i,j)
        c = snps.iloc[i].CHR
        start = snps.iloc[i].BP
        end = snps.iloc[j].BP
        plt.figure()
        plt.scatter(snps.iloc[i:j].Rv_resid,
                snps.iloc[i:j].ahat_resid * np.sqrt(snps.iloc[i:j].N))
        plt.title('chr{}:{}-{}'.format(c, start, end))
        plt.xlabel(r'residual $Rv$')
        plt.ylabel(r'residual $Z$')

        filename = '{}/chr{}:{}-{}.pdf'.format(folder, c, start, end)
        fs.makedir_for_file(filename)
        plt.savefig(filename)
        plt.close()
Example #7
0
 def required_files(self, s):
     """List the '.testprocess' marker files, one per chromosome of `s`.

     Each name is the file stem of the annotation at
     `paths.annotations + self.params.sannot_chr` for that chromosome,
     followed by '.testprocess'.
     """
     annot = pa.Annotation(paths.annotations + self.params.sannot_chr)
     markers = []
     for chrom in s.chromosomes:
         markers.append(annot.filestem(chrom) + '.testprocess')
     return markers
Example #8
0
def main(args):
    """Compute R*v for each annotation at the print-SNPs and write results.

    For every annotation in `args.sannot_chr` and every chromosome in
    `args.chroms`, this
      1. loads reference-panel SNP metadata and MAFs,
      2. merges in the annotation values (optionally MAF-scaled),
      3. writes per-annotation summary statistics to `info_filename(c)`,
      4. fills the '<name>.R' columns LD block by LD block, skipping the
         blocks overlapping `mhc_bp` on chr6, and
      5. writes the print-SNP rows to the gzipped `RV_filename(c)`.

    Args:
        args: namespace providing `bfile_chr`, `sannot_chr`, `ld_blocks`,
            `print_snps`, `chroms` and `alpha`.
    """
    print('initializing...')
    import gzip
    import gc
    import time
    import numpy as np
    import pandas as pd
    import gprim.annotation as ga
    import gprim.dataset as gd
    import ypy.memo as memo

    # basic initialization
    mhc_bp = [25684587, 35455756]
    refpanel = gd.Dataset(args.bfile_chr)
    annots = [ga.Annotation(annot) for annot in args.sannot_chr]

    # read in ld blocks, remove MHC, read SNPs to print
    # (sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True)
    ldblocks = pd.read_csv(args.ld_blocks,
                           sep=r'\s+',
                           header=None,
                           names=['chr', 'start', 'end'])
    mhcblocks = (ldblocks.chr == 'chr6') & \
            (ldblocks.end > mhc_bp[0]) & \
            (ldblocks.start < mhc_bp[1])
    ldblocks = ldblocks[~mhcblocks]
    print(len(ldblocks), 'loci after removing MHC')
    print_snps = pd.read_csv(args.print_snps, header=None, names=['SNP'])
    print_snps['printsnp'] = True
    print(len(print_snps), 'print snps')

    # process annotations
    for annot in annots:
        t0 = time.time()
        for c in args.chroms:
            print(time.time() - t0, ': loading chr', c, 'of', args.chroms)
            # get refpanel snp metadata for this chromosome
            snps = refpanel.bim_df(c)
            snps = ga.smart_merge(snps, refpanel.frq_df(c)[['SNP', 'MAF']])
            print(len(snps), 'snps in refpanel', len(snps.columns),
                  'columns, including metadata')

            # read in annotation
            print('reading annot', annot.filestem())
            names = annot.names(c)  # names of annotations
            namesR = [n + '.R' for n in names]  # names of results
            a = annot.sannot_df(c)
            if 'SNP' in a.columns:
                print(
                    'not a thinannot => doing full reconciliation of snps and allele coding'
                )
                snps = ga.reconciled_to(snps, a, names, missing_val=0)
            else:
                print(
                    'detected thinannot, so assuming that annotation is synched to refpanel'
                )
                snps = pd.concat([snps, a[names]], axis=1)

            # add information on which snps to print
            print('merging in print_snps')
            snps = pd.merge(snps, print_snps, how='left', on='SNP')
            # After the left merge the column holds True for matched SNPs
            # and NaN otherwise. Assign an explicit boolean column: the old
            # chained in-place fillna plus discarded astype() could leave
            # an object-dtype mask behind.
            snps['printsnp'] = snps['printsnp'].notna()

            # put on per-normalized-genotype scale
            if args.alpha != -1:
                print('scaling by maf according to alpha=', args.alpha)
                snps[names] = snps[names].values*\
                        np.power(2*snps.MAF.values*(1-snps.MAF.values),
                                (1.+args.alpha)/2)[:,None]

            # make room for RV and make sure annotation values are treated as floats
            snps = pd.concat([
                snps,
                pd.DataFrame(np.zeros(snps[names].shape), columns=namesR)
            ],
                             axis=1)
            snps[names] = snps[names].astype(float)

            # compute simple statistics about annotation
            print('computing basic statistics and writing')
            info = pd.DataFrame(columns=[
                'M', 'M_5_50', 'sqnorm', 'sqnorm_5_50', 'supp', 'supp_5_50'
            ])
            info['name'] = names
            info.set_index('name', inplace=True)
            info['M'] = len(snps)
            info['sqnorm'] = np.linalg.norm(snps[names], axis=0)**2
            # ord=0 counts the non-zero entries of each annotation column
            info['supp'] = np.linalg.norm(snps[names], ord=0, axis=0)
            M_5_50 = (snps.MAF >= 0.05).values
            info['M_5_50'] = M_5_50.sum()
            info['sqnorm_5_50'] = np.linalg.norm(snps.loc[M_5_50, names],
                                                 axis=0)**2
            info['supp_5_50'] = np.linalg.norm(snps.loc[M_5_50, names],
                                               ord=0,
                                               axis=0)
            info.to_csv(annot.info_filename(c), sep='\t')

            # process ldblocks one by one
            for ldblock, X, meta, ind in refpanel.block_data(ldblocks,
                                                             c,
                                                             meta=snps):
                if meta.printsnp.sum() == 0:
                    print('no print-snps in this block')
                    continue
                print(meta.printsnp.sum(), 'print-snps')
                if (meta[names] == 0).values.all():
                    print('annotations are all 0 in this block')
                    snps.loc[ind, namesR] = 0
                else:
                    mask = meta.printsnp.values
                    V = meta[names].values
                    XV = X.dot(V)
                    # X^T (X V) / (number of rows of X), print-SNPs only
                    snps.loc[ind[mask], namesR] = \
                            X[:,mask].T.dot(XV[:,-len(names):]) / X.shape[0]

            # write
            print('writing output')
            # Text mode: to_csv emits str, which a binary gzip handle does
            # not accept on every pandas version.
            with gzip.open(annot.RV_filename(c), 'wt') as f:
                snps.loc[snps.printsnp, ['SNP', 'A1', 'A2'] + names +
                         namesR].to_csv(f, index=False, sep='\t')

            # free per-chromosome state before loading the next one
            del snps
            memo.reset()
            gc.collect()

    print('done')
Example #9
0
 def annotation(self):
     """Build the `pa.Annotation` located at
     `paths.annotations + self.params.annot_chr`."""
     annot_path = paths.annotations + self.params.annot_chr
     return pa.Annotation(annot_path)
Example #10
0
 def required_files(self, s):
     """List the LD-score files this estimator needs.

     Covers chromosomes 1..22 for every comma-separated annotation in
     `self.params.annot_chr`, ordered chromosome by chromosome and, within
     a chromosome, by annotation.
     """
     annots = []
     for aname in self.params.annot_chr.split(','):
         annots.append(pa.Annotation(paths.annotations + aname))

     needed = []
     for chrom in range(1, 23):
         for annot in annots:
             needed.append(annot.ldscores_filename(chrom))
     return needed