Example #1
import pickle
from time import time

import numpy as np

# Dataset is imported from this project's `primitives` module elsewhere in
# these excerpts; SumstatSimulation and Architecture are project classes whose
# import paths are not shown here.


def main(args):
    np.random.seed(args.beta_num)
    sim = SumstatSimulation(args.sim_name)
    arch = Architecture(sim.architecture)
    d = Dataset(sim.dataset)

    # sample the beta
    beta = arch.draw_effect_sizes(sim.dataset, sim.h2g)[:, 0]

    # compute noiseless phenotypes slice by slice
    Y = np.zeros(d.N)
    t0 = time()
    for s in d.slices():
        # X will be N x M
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)
        print('computing phenotypes. SNPs', s)
        Y += X.dot(beta[s[0]:s[1]])
        del X

    # normalize the Y and the beta to the desired heritability
    normalization = np.std(Y) / np.sqrt(sim.h2g)
    if normalization == 0:
        normalization = 1  # just in case we have some 0s...
    Y /= normalization
    beta /= normalization

    # write the betas and the noiseless phenotypes
    pickle.dump(beta, sim.beta_file(args.beta_num, 'wb'), 2)
    pickle.dump(Y, sim.noiseless_Y_file(args.beta_num, 'wb'), 2)
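For context, a minimal driver for this entry point might look like the sketch
below; the flag names are assumptions inferred from the attributes read off
`args` above (`sim_name`, `beta_num`), since the original command-line setup
is not part of the excerpt.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--sim_name', type=str, required=True)  # hypothetical flag
    parser.add_argument('--beta_num', type=int, required=True)  # hypothetical flag
    main(parser.parse_args())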
Example #2
    def preprocess(self):
        if self.params.baseline and not self.baseline_preprocessing_in_progress():
            print('baseline model not found. creating...')
            self.declare_baseline_preprocessing_in_progress()
            self.create_baseline_model()

        print('submitting ld score jobs for annotation of interest')
        gs = GenomicSubset(self.params.region)

        # create the annotation file
        for chrnum in self.refpanel.chromosomes():
            d = Dataset(self.params.refpanel, chrnum=chrnum)
            ss = SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))
            SnpSubset.print_subsets(self.annotation_filename(chrnum), [ss],
                                    [self.params.region],
                                    add_other=True)

        # create the ldscores file
        for chrnum in self.refpanel.chromosomes():
            d = Dataset(self.params.refpanel, chrnum=chrnum)
            ldscores_command = [
                'python', '-u', paths.foreign + 'ldsc/ldsc.py',
                '--l2',
                '--ld-wind-cm', str(self.params.ld_window / 1000.),
                '--bfile', d.genotypes_bedfile.filename,
                '--annot', self.annotation_filename(chrnum),
                '--out', self.annotation_l2_filestem(chrnum)
            ]
            print(' '.join(ldscores_command))
            outfilepath = self.annotation_l2_filestem(chrnum) + '.bsub_out'
            bsub.submit(ldscores_command,
                        outfilepath,
                        jobname=self.preprocessing_foldername() + ',chr=' +
                        str(chrnum))
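A note on the assembled command: ldsc.py's --l2 mode computes partitioned LD
scores for the categories in the --annot file over the PLINK fileset named by
--bfile, with the LD window given in centimorgans via --ld-wind-cm, and writes
results under the --out stem. The division of self.params.ld_window by 1000
suggests the window is stored in thousandths of a centimorgan, though the
excerpt does not show its definition.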
Example #3
    def create_baseline_model(self):
        gss = [GenomicSubset(region) for region in LDSC.baseline_model_regions]

        # create the annotation file
        for chrnum in self.refpanel.chromosomes():
            print('creating baseline annot file for chr', chrnum)
            d = Dataset(self.params.refpanel, chrnum=chrnum)
            sss = [
                SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))
                for gs in gss
            ]
            SnpSubset.print_subsets(self.baseline_filename(chrnum), sss,
                                    LDSC.baseline_model_regions)

        # create the ldscores file
        for chrnum in self.refpanel.chromosomes():
            d = Dataset(self.params.refpanel, chrnum=chrnum)
            ldscores_command = [
                'python', '-u', paths.foreign + 'ldsc/ldsc.py',
                '--l2',
                '--ld-wind-cm', str(self.params.ld_window / 1000.),
                '--bfile', d.genotypes_bedfile.filename,
                '--annot', self.baseline_filename(chrnum),
                '--out', self.baseline_l2_filestem(chrnum)
            ]
            print(' '.join(ldscores_command))
            outfilepath = self.baseline_l2_filestem(chrnum) + '.bsub_out'
            bsub.submit(ldscores_command,
                        outfilepath,
                        jobname='baseline,chr=' + str(chrnum))
Example #4
import pickle
from time import time

import numpy as np

# Dataset, SumstatSimulation and the `pretty` printing helper are
# project-specific modules; their import paths are not shown here.


def main(args):
    np.random.seed(args.beta_num + args.sample_num * 10000)
    sim = SumstatSimulation(args.sim_name)
    d = Dataset(sim.dataset)
    pretty.print_namespace(sim)
    print()

    # read in noiseless phenotypes
    Y = pickle.load(sim.noiseless_Y_file(args.beta_num))

    # choose individuals and create ensemble of Ys
    # (np.random.choice samples with replacement by default)
    indices = np.random.choice(Y.shape[0], size=(sim.sample_size,))
    Y = Y[indices]

    # compute how much noise to add
    sigma2e = 1 - sim.h2g
    print('adding noise. sigma2e =', sigma2e)
    Y += np.sqrt(sigma2e) * np.random.randn(*Y.shape)

    if sim.condition_on_covariates:
        print('projecting covariates out of Y')
        Y = d.project_out_covariates(Y, covariates=d.covariates[indices])

    alphahat = np.zeros(d.M)
    t0 = time()

    def compute_sumstats_for_slice(s):
        # X will be N x M
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)[indices]

        if sim.condition_on_covariates:
            print(int(time() - t0), ': projecting out covariates')
            X = d.project_out_covariates(X, covariates=d.covariates[indices])

        print(int(time() - t0), ': computing sumstats. SNPs', s)
        alphahat[s[0]:s[1]] = X.T.dot(Y) / sim.sample_size
        del X

    # drive the computation eagerly; in Python 3 a bare map() is lazy,
    # so its side effects would never run
    for s in d.slices():
        compute_sumstats_for_slice(s)

    # write output
    def write_output():
        pickle.dump(indices,
                    sim.individuals_file(args.beta_num, args.sample_num, 'wb'),
                    2)
        pickle.dump(Y, sim.noisy_Y_file(args.beta_num, args.sample_num, 'wb'),
                    2)
        pickle.dump(alphahat,
                    sim.sumstats_file(args.beta_num, args.sample_num, 'wb'), 2)

    write_output()
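The per-slice update computes the standard marginal summary statistics
alphahat = X^T Y / N on the sampled individuals. A toy numpy check of just
that formula, on synthetic data rather than the project's Dataset:

import numpy as np

N, M = 100, 8
X = np.random.randn(N, M)      # stands in for standardized genotypes
Y = np.random.randn(N)
alphahat = X.T.dot(Y) / N      # same formula as the slice update above
assert alphahat.shape == (M,)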
Example #5
import gzip

import numpy as np
import pandas as pd

# Dataset is a project class (imported from `primitives` in other excerpts)


def main(args):
    refpanel = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    result_filename = '{}.maf1p.{}.annot.gz'.format(args.annot_stem,
                                                    args.chrnum)

    print('reading annot')
    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t', header=0)
    name = annot.columns[-1]      # the annotation column
    names = annot.columns.values  # original columns, so MAF is dropped on output

    print('reading frq')
    refpanel_frq = pd.read_csv(refpanel.genotypes_bedfile.filename + '.frq',
                               delim_whitespace=True,
                               header=0)
    refpanel_frq = refpanel_frq[['SNP', 'MAF']]

    print('merging')
    annot = annot.merge(refpanel_frq, how='left', on=['SNP'])

    print('before filtering, |A| =', np.sum(annot[name]))
    # keep only rare SNPs (MAF <= 1%), matching the '.maf1p' output tag;
    # .loc replaces the long-removed .ix indexer
    annot.loc[annot['MAF'] > 0.01, name] = 0
    print('after filtering, |A| =', np.sum(annot[name]))

    # write output
    print('writing output to', result_filename)
    with gzip.open(result_filename, 'wt') as f:
        annot[names].to_csv(f, index=False, sep='\t')
Example #6
import gzip

import numpy as np
import pandas as pd

# Dataset is a project class (imported from `primitives` in other excerpts)


def main(args):
    result_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    name = args.annot_stem.split('/')[-1]

    print('reading refpanel bim')
    refpanel = Dataset(args.refpanel + '.' + str(args.chrnum))
    refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim',
                               sep='\t',
                               names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
    print('\tthere are', len(refpanel_bim), 'SNPs')

    print('reading frq')
    refpanel_frq = pd.read_csv(refpanel.genotypes_bedfile.filename + '.frq',
                               delim_whitespace=True,
                               header=0)
    refpanel_frq = refpanel_frq[['SNP', 'MAF']]

    print('merging')
    annot = refpanel_bim.merge(refpanel_frq, how='left', on=['SNP'])
    annot[name] = 0

    print('before filtering, |A| =', np.sum(annot[name]))
    to_include = np.flatnonzero((annot['MAF'] <= 0.01).values)
    # sizes = [
    #         27285,
    #         26794,
    #         23253,
    #         18865,
    #         20449,
    #         17280,
    #         15776,
    #         13198,
    #         14079,
    #         15070,
    #         14247,
    #         12971,
    #         9039,
    #         11219,
    #         8688,
    #         11506,
    #         10187,
    #         7603,
    #         7804,
    #         6765,
    #         3747,
    #         4021
    #         ]
    # to_include = to_include[np.random.choice(len(to_include), replace=False,
    # size=sizes[args.chrnum-1])]
    # .loc replaces the removed .ix indexer; to_include holds positions into
    # the default RangeIndex, so label == position here
    annot.loc[to_include, name] = 1
    print('after filtering, |A| =', np.sum(annot[name]))
    annot.rename(columns={'cM': 'CM'}, inplace=True)
    names = ['CHR', 'BP', 'SNP', 'CM', name]

    # write output
    print('writing output')
    with gzip.open(result_filename, 'wt') as f:
        annot[names].to_csv(f, index=False, sep='\t')
Example #7
def submit(args):
    my_args = ['main', '--chrom', '$LSB_JOBINDEX']
    d = Dataset('UK10Khg19.22')
    # '%I' is expanded by LSF to the job-array index (1-22 here)
    outfilepath = d.auxfiles_path + '../' + '%I/.preprocess.out'
    bsub.submit(['python', '-u', paths.code + 'real/preprocess.py'] + my_args,
                outfilepath,
                jobname='preprocess[1-22]',
                memory_GB=16)
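The '[1-22]' suffix in the jobname makes LSF run this as a 22-element job
array: '%I' in the output path and '$LSB_JOBINDEX' in the command are both
replaced by the element's index at runtime (the latter assuming bsub.submit
routes the command through a shell, as LSF job scripts normally do), so each
array element preprocesses one chromosome.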
Example #8
def get_refpanel(args):
    print('reading refpanel and refpanel bim')
    refpanel = Dataset(args.refpanel + '.' + str(args.chrnum))
    refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim',
                               sep='\t',
                               names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
    refpanel_bim['refpanelINDEX'] = np.arange(len(refpanel_bim))
    print('\trefpanel contains', len(refpanel_bim), 'SNPs')
    return refpanel, refpanel_bim
Example #9
def create_annot(args):
    path = '/'.join(args.bedfile.split('/')[:-1]) + '/'
    filename = args.bedfile.split('/')[-1]
    if filename.endswith('.bed'):
        name = filename[:-4]
    else:
        name = filename

    gs = GenomicSubset(name, path=path)
    for chrnum in reversed(range(1, 23)):  # chromosomes 22 down to 1
        print('creating annot file for chr', chrnum)
        d = Dataset(args.refpanel + '.' + str(chrnum))
        sss = [SnpSubset(d, gs.restricted_to_chrom_bedtool(chrnum))]
        SnpSubset.print_subsets('{}{}.{}.annot.gz'.format(path, name, chrnum),
                                sss, [name])
Example #10
    def run(self, beta_num, sim):
        print('loading data set and region info')
        d = Dataset(sim.dataset)
        gs = GenomicSubset(self.params.region)
        ss = SnpSubset(d, bedtool=gs.bedtool)

        print('loading ld score info')
        ref_ldscores, w_ld, M_annot = self.ld_score_info()
        N = np.ones((d.M, 1)) * d.N

        print('ref_ldscores shape:{}\nw_ld shape:{}\nN shape:{}\nM_annot shape:{}'
              .format(ref_ldscores.shape, w_ld.shape, N.shape, M_annot.shape))

        overlaps = self.overlap_vector()
        print('num snps overlapping with each category:', overlaps)
        results = []
        variances = []
        for alphahat in sim.sumstats_files(beta_num):
            # convert marginal-effect estimates to (approximate) chi^2 statistics
            alphahat = d.N * alphahat**2
            if self.params.constrain_intercept:
                hsqhat = ldsc.ldscore.regressions.Hsq(
                    alphahat.reshape((d.M, 1)), ref_ldscores, w_ld, N, M_annot,
                    intercept=1)
            else:
                hsqhat = ldsc.ldscore.regressions.Hsq(
                    alphahat.reshape((d.M, 1)), ref_ldscores, w_ld, N, M_annot)
            results.append(hsqhat.coef.dot(overlaps))
            variances.append(overlaps.dot(hsqhat.coef_cov).dot(overlaps))
            print('intercept:', hsqhat.intercept)
            print(len(results), results[-1], variances[-1])

        return np.concatenate([np.array([results]).T,
                               np.array([variances]).T],
                              axis=1)
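In the returned array, each row pairs a point estimate with its variance:
hsqhat.coef holds the fitted per-category coefficients, so coef.dot(overlaps)
is the heritability attributed to the region, and
overlaps.dot(hsqhat.coef_cov).dot(overlaps) propagates the coefficient
covariance matrix through the same linear combination.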
Example #11
import gzip

import numpy as np
import pandas as pd

# Dataset is a project class (imported from `primitives` in other excerpts)


def main(args):
    d = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    cannot_filename = '{}.{}.cannot.gz'.format(args.annot_stem, args.chrnum)
    cannot_norm_filename = '{}.{}.cannot.norm'.format(args.annot_stem,
                                                      args.chrnum)

    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t', header=0)
    name = annot.columns[-1]
    v = annot[name].values  # plain column access; .ix was removed from pandas

    # TODO: use LD blocks, possibly just those that have non-trivial
    # intersection with the nonzero entries of v
    print('computing Xv')
    Xv = np.zeros(d.N)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        Xv += X.dot(v[s[0]:s[1]])

    print('computing XTXv')
    XTXv = np.zeros(d.M)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        XTXv[s[0]:s[1]] = X.T.dot(Xv)

    print('computing vTRv')
    Rv = XTXv / d.N
    vTRv = v.dot(Rv)

    # write output
    print('writing output')
    annot[name + '.CONV'] = Rv
    with gzip.open(cannot_filename, 'wt') as f:
        annot.to_csv(f, index=False, sep='\t')

    with open(cannot_norm_filename, 'w') as f:
        f.write(str(vTRv))
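For intuition: with standardized genotypes, R = X^T X / N, so the two passes
above compute Rv in streaming fashion without ever forming the M x M LD
matrix, and v^T R v equals ||Xv||^2 / N. A toy check of that identity on
synthetic data (not the project's Dataset):

import numpy as np

N, M = 50, 10
X = np.random.randn(N, M)        # stands in for standardized genotypes
v = np.random.randn(M)
Xv = X.dot(v)
vTRv = v.dot(X.T.dot(Xv)) / N    # the two-pass computation, as above
assert np.isclose(vTRv, Xv.dot(Xv) / N)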
Example #12
        self.add_ridge(Lambda)
        self.covcsr /= (1 + Lambda)


if __name__ == '__main__':
    from primitives import Dataset, GenomicSubset, SnpSubset
    import copy
    from time import time
    import argparse
    import numpy as np
    from pysnptools.util import IntRangeSet  # assumed home of IntRangeSet
    # (LdMatrix is presumably defined earlier in this module)
    np.random.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument('--M', type=int, required=True, help='the number of SNPs to use')
    parser.add_argument('-check_dense', action='store_true', default=False)
    args = parser.parse_args()

    d = Dataset('GERA', forced_M=args.M)
    indivs = d.random_indivs(200)

    t0 = time()
    R = LdMatrix(d, indivs, 200)
    R.add_ridge(0.05)
    print('computing R took', time() - t0)
    print('shape of R is:', R.covcsr.shape)

    # tiny = GenomicSubset('tiny')
    # tiny_irs = SnpSubset(d, bedtool=tiny.bedtool).irs
    tiny_irs = IntRangeSet('300:350')
    RA = LdMatrix(d, indivs, 200, snpset_irs=tiny_irs, output=False)
    b = np.random.randn(d.M)

    # check inverse computation
Example #13

import argparse

import pandas as pd
from pybedtools import BedTool  # assumed home of BedTool

# Dataset, GenomicSubset and the `paths` module are project-specific imports

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--refpanel', type=str, required=True)
    parser.add_argument('--ldblocks',
                        type=str,
                        required=False,
                        default='pickrell_ldblocks.hg19.eur.bed')
    parser.add_argument('--region', type=str, required=True)
    parser.add_argument('--sumstats_path', type=str, required=True)

    args = parser.parse_args()

    print('loading reference panel')
    refpanel = Dataset(args.refpanel)

    print('loading region')
    A = GenomicSubset(args.region)

    print('loading ld blocks')
    blocks = BedTool(paths.reference + args.ldblocks)

    print('finding ld blocks that overlap with A')
    relevant_blocks = blocks.intersect(A.bedtool, wa=True).saveas()
    print('found', len(relevant_blocks), 'blocks that overlap with A')

    print('reading refpanel bim')
    refpanel_bim = pd.read_csv(refpanel.genotypes_bedfile.filename + '.bim',
                               sep='\t',
                               names=['CHR', 'SNP', 'cM', 'BP', 'A1', 'A2'])
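For readers unfamiliar with pybedtools: blocks.intersect(A.bedtool, wa=True)
is bedtools intersect with -wa, so each LD block that overlaps A is written
out as its original, untruncated interval, and .saveas() materializes the
streamed result into a file-backed BedTool that can safely be iterated more
than once.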
Example #14
    def refpanel(self):
        # lazily construct and cache the reference-panel Dataset
        if self.__refpanel is None:
            self.__refpanel = Dataset(self.params.refpanel)
        return self.__refpanel
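In the full class this accessor is plausibly exposed as a property; the
decorator is not visible in the excerpt, so the self-contained restatement
below is only a sketch of the lazy-caching pattern (the class name is
invented, and Dataset is the project class used above).

class Estimator:
    def __init__(self, params):
        self.params = params
        self.__refpanel = None  # cache slot for the lazily built Dataset

    @property  # assumption: the decorator is not shown in the excerpt
    def refpanel(self):
        if self.__refpanel is None:  # build only on first access
            self.__refpanel = Dataset(self.params.refpanel)
        return self.__refpanel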