Example 1
    def polyloc_partitions(self, args):

        self.load_posterior_betas(args)
        self.partition_snps_to_bins(args, use_ridge=False)

        #add another partition for all SNPs not in the posterior file
        df_bim_list = []
        for chr_num in range(1, 23):
            df_bim_chr = pd.read_table(
                args.bfile_chr + '%d.bim' % (chr_num),
                delim_whitespace=True,
                names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'],
                header=None)
            df_bim_list.append(df_bim_chr)
        df_bim = pd.concat(df_bim_list, axis=0)
        df_bim = set_snpid_index(df_bim)
        self.df_bins = set_snpid_index(self.df_bins)

        #make sure that all variants in the posterior file are also in the plink files
        if np.any(~self.df_bins.index.isin(df_bim.index)):
            raise ValueError(
                'Found variants in posterior file that are not found in the plink files'
            )

        #add a new bin for SNPs that are not found in the posterior file (if there are any)
        if df_bim.shape[0] > self.df_bins.shape[0]:
            new_snps = df_bim.index[~df_bim.index.isin(self.df_bins.index)]
            df_bins_new = df_bim.loc[new_snps, SNP_COLUMNS].copy()
            for colname in self.df_bins.drop(columns=SNP_COLUMNS).columns:
                df_bins_new[colname] = False
            new_colname = 'snpvar_bin%d' % (df_bins_new.shape[1] -
                                            len(SNP_COLUMNS) + 1)
            self.df_bins[new_colname] = False
            df_bins_new[new_colname] = True
            self.df_bins = pd.concat([self.df_bins, df_bins_new], axis=0)

        #save the bins to disk
        self.save_bins_to_disk(args)

        #save the bin sizes to disk
        df_binsize = pd.DataFrame(
            index=np.arange(1, self.df_bins.shape[1] - len(SNP_COLUMNS) + 1))
        df_binsize.index.name = 'BIN'
        df_binsize['BIN_SIZE'] = [
            self.df_bins[c].sum()
            for c in self.df_bins.drop(columns=SNP_COLUMNS).columns
        ]  #saves memory
        df_binsize.to_csv(args.output_prefix + '.binsize',
                          sep='\t',
                          index=True)
Example 2
    def compute_ld_scores(self, args):
        #define the range of chromosomes to iterate over
        if args.chr is None:
            chr_range = range(1, 23)
        else:
            chr_range = range(args.chr, args.chr + 1)

        #iterate over chromosomes and compute LD-scores
        for chr_num in tqdm(chr_range, disable=len(chr_range) == 1):

            #load or extract the bins for the current chromosome
            try:
                df_bins_chr = self.df_bins.query('CHR==%d' % (chr_num))
            except AttributeError:
                df_bins_chr = self.load_bins_chr(args, chr_num)

            #compute LD-scores for this chromosome
            if args.ld_ukb:
                if args.ld_dir is None: ld_dir = tempfile.mkdtemp()
                else: ld_dir = args.ld_dir
                df_bins_chr = set_snpid_index(df_bins_chr)
                df_ldscores_chr = compute_ldscores_chr(df_bins_chr, ld_dir)
            elif args.bfile_chr is not None:
                df_ldscores_chr = self.compute_ldscores_plink_chr(
                    args, chr_num, df_bins_chr)
            else:
                raise ValueError('no LDscore computation method specified')

            #save the LD-scores to disk
            ldscores_output_file = get_file_name(args,
                                                 'ldscores',
                                                 chr_num,
                                                 verify_exists=False)
            df_ldscores_chr.to_parquet(ldscores_output_file, index=False)
Example 3
def get_bcor_meta(bcor_obj):
    df_ld_snps = bcor_obj.getMeta()
    df_ld_snps.rename(columns={
        'rsid': 'SNP',
        'position': 'BP',
        'chromosome': 'CHR',
        'allele1': 'A1',
        'allele2': 'A2'
    },
                      inplace=True,
                      errors='raise')
    df_ld_snps['CHR'] = df_ld_snps['CHR'].astype(int)
    df_ld_snps['BP'] = df_ld_snps['BP'].astype(int)
    df_ld_snps = set_snpid_index(df_ld_snps)
    return df_ld_snps
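
set_snpid_index is used throughout these examples but never shown. A minimal sketch of what such a helper presumably does (an assumption, not the actual implementation): build an allele-order-insensitive identifier from CHR, BP and the two alleles and use it as the index, so data frames from different sources can be aligned with .loc / .isin.

#Hypothetical sketch of a set_snpid_index-style helper (assumed behavior, not the real code)
import pandas as pd

def set_snpid_index_sketch(df, copy=False):
    if copy:
        df = df.copy()
    #order the two alleles lexicographically so that A/G and G/A map to the same identifier
    first = df['A1'].where(df['A1'] <= df['A2'], df['A2'])
    second = df['A2'].where(df['A1'] <= df['A2'], df['A1'])
    df.index = (df['CHR'].astype(str) + '.' + df['BP'].astype(str)
                + '.' + first + '.' + second)
    df.index.name = 'snpid'
    return df

df = pd.DataFrame({'CHR': [1], 'BP': [100], 'A1': ['G'], 'A2': ['A']})
print(set_snpid_index_sketch(df).index[0])  #1.100.A.G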
Example 4
def read_annot(annot_file):
    try:
        df_annot = pd.read_parquet(annot_file)
    except (ArrowIOError, ArrowInvalid):
        df_annot = pd.read_table(annot_file, sep='\s+')

    assert 'CHR' in df_annot.columns
    assert 'SNP' in df_annot.columns
    assert 'BP' in df_annot.columns
    assert 'A1' in df_annot.columns
    assert 'A2' in df_annot.columns

    for c in df_annot.columns:
        if c in META_COLUMNS: continue
        if not is_numeric_dtype(df_annot[c]):
            raise ValueError('Annotation %s does not have numeric values' %
                             (c))

    df_annot = set_snpid_index(df_annot)

    return df_annot
Example 5
def load_ld_matrix(ld_dir, ld_prefix):

    #load the SNPs metadata
    gz_file = os.path.join(ld_dir, '%s.gz' % (ld_prefix))
    try:
        df_ld_snps = pd.read_table(gz_file, delim_whitespace=True)
    except (ArrowIOError, ArrowInvalid):
        raise IOError('Corrupt file downloaded')
    df_ld_snps.rename(columns={
        'rsid': 'SNP',
        'chromosome': 'CHR',
        'position': 'BP',
        'allele1': 'A1',
        'allele2': 'A2'
    },
                      inplace=True,
                      errors='ignore')
    assert 'SNP' in df_ld_snps.columns
    assert 'CHR' in df_ld_snps.columns
    assert 'BP' in df_ld_snps.columns
    assert 'A1' in df_ld_snps.columns
    assert 'A2' in df_ld_snps.columns
    df_ld_snps = set_snpid_index(df_ld_snps)

    #load the LD matrix
    npz_file = os.path.join(ld_dir, '%s.npz' % (ld_prefix))
    logging.info('Loading LD from file %s' % (npz_file))
    t0 = time.time()
    try:
        R = sparse.load_npz(npz_file).toarray()
        R += R.T
    except ValueError:
        raise IOError('Corrupt file downloaded')
    logging.info('Done in %0.2f seconds' % (time.time() - t0))

    #create df_R and return it
    df_R = pd.DataFrame(R, index=df_ld_snps.index, columns=df_ld_snps.index)

    return df_R
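
The `R += R.T` line implies that the .npz file stores only one triangle of the symmetric LD matrix. One storage convention that makes the reconstruction exact is to keep a single triangle with the diagonal halved; a toy round-trip under that assumption:

#Toy round-trip: store one triangle with half the diagonal, so that R += R.T is exact
import os, tempfile
import numpy as np
from scipy import sparse

R_full = np.array([[1.0, 0.3, 0.1],
                   [0.3, 1.0, 0.2],
                   [0.1, 0.2, 1.0]])

#writer side (assumed convention): keep the upper triangle and halve the diagonal
R_store = np.triu(R_full)
np.fill_diagonal(R_store, np.diag(R_full) / 2)
npz_file = os.path.join(tempfile.mkdtemp(), 'ld_toy.npz')
sparse.save_npz(npz_file, sparse.csr_matrix(R_store))

#reader side (as in load_ld_matrix): adding the transpose restores the full symmetric matrix
R = sparse.load_npz(npz_file).toarray()
R += R.T
assert np.allclose(R, R_full)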
Example 6
    t0 = time.time()
    try:
        df_snps = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_snps = pd.read_table(args.sumstats, sep='\s+')
    if 'A1' not in df_snps.columns:
        raise ValueError('missing column A1')
    if 'A2' not in df_snps.columns:
        raise ValueError('missing column A2')
    if 'CHR' not in df_snps.columns:
        raise ValueError('missing column CHR')
    if 'BP' not in df_snps.columns:
        raise ValueError('missing column BP')

    #set index
    df_snps = set_snpid_index(df_snps)
    logging.info('Done in %0.2f seconds' % (time.time() - t0))

    #make sure there aren't any duplicated SNPs
    if np.any(df_snps.index.duplicated()):
        raise ValueError(
            'duplicate SNPs found in output - please make sure there aren\'t any duplicate SNPs in your sumstats file'
        )

    #read df_meta
    logging.info('Loading meta-analyzed per-SNP-h2 files...')
    t0 = time.time()
    script_dir = os.path.dirname(os.path.realpath(__file__))
    df_meta1 = pd.read_parquet(
        os.path.join(script_dir, 'snpvar_meta.chr1_7.parquet'))
    df_meta2 = pd.read_parquet(
Example 7
def compute_ldscores(args):

    #read bim/snp
    array_snps = parse.PlinkBIMFile(args.bfile + '.bim')
    df_bim = array_snps.df
    if len(df_bim['CHR'].unique()) > 1:
        raise ValueError(
            'plink file includes multiple chromosomes. Please specify a plink file with a single chromosome'
        )
    df_bim = set_snpid_index(df_bim)

    #read annotations
    keep_snps = None
    if args.annot is not None:

        try:
            df_annot = pd.read_parquet(args.annot)
        except (ArrowIOError, ArrowInvalid):
            df_annot = pd.read_table(args.annot, sep='\s+')

        #Remove annotations of SNPs that are not in the .bim file
        df_annot = set_snpid_index(df_annot)
        df_annot = df_annot.loc[df_annot.index.isin(df_bim.index)]

        #make sure that all SNPs have annotations
        if np.any(~df_bim.index.isin(df_annot.index)):
            error_msg = 'Not all SNPs have annotation values'
            if args.allow_missing:
                is_good_snp = df_bim.index.isin(df_annot.index)
                if not np.any(is_good_snp):
                    raise ValueError('No SNPs have annotations')
                keep_snps = np.where(is_good_snp)[0]
                logging.warning(error_msg)
                logging.warning(
                    'Keeping only %d/%d SNPs that have annotations' %
                    (is_good_snp.sum(), len(is_good_snp)))
            else:
                raise ValueError(
                    error_msg +
                    '. If you wish to omit the missing SNPs, please use the flag --allow-missing'
                )

        #make sure that all of the annotations are numeric
        for c in df_annot.columns:
            if c in SNP_COLUMNS: continue
            if not is_numeric_dtype(df_annot[c]):
                raise ValueError('Annotation %s does not have numeric values' %
                                 (c))

    #find #individuals in bfile
    fam_file = args.bfile + '.fam'
    df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep='\s+')
    n = df_fam.shape[0]

    #find keep_indivs
    if args.keep is None:
        keep_indivs = None
    else:
        array_indivs = parse.PlinkFAMFile(args.bfile + '.fam')
        keep_indivs = __filter__(args.keep, 'individuals', 'include',
                                 array_indivs)
        logging.info('after applying --keep, %d individuals remain' %
                     (len(keep_indivs)))

    #read plink file
    bed_file = args.bfile + '.bed'
    geno_array = ldscore.PlinkBEDFile(bed_file,
                                      n,
                                      array_snps,
                                      keep_snps=keep_snps,
                                      keep_indivs=keep_indivs,
                                      mafMin=None)

    #remove omitted SNPs from df_bim
    if len(geno_array.kept_snps) < df_bim.shape[0]:
        assert np.all(
            np.array(geno_array.kept_snps) == np.sort(
                np.array(geno_array.kept_snps)))
        assert geno_array.kept_snps[-1] < df_bim.shape[0]
        df_bim = df_bim.iloc[geno_array.kept_snps]

    #rearrange annotations to match the order of SNPs in the plink file
    if args.annot is not None:
        assert df_annot.shape[0] >= df_bim.shape[0]
        if (df_annot.shape[0] > df_bim.shape[0]) or np.any(
                df_annot.index != df_bim.index):
            assert np.all(df_bim.index.isin(df_annot.index))
            df_annot = df_annot.loc[df_bim.index]

    # determine block widths
    num_wind_args = np.array(
        (args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
    if np.sum(num_wind_args) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')
    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(list(range(geno_array.m)))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(df_bim['BP'])
        if len(np.unique(coords)) == 1:
            raise ValueError(
                'bim file has no basepair data --- please use a different ld-wind option'
            )
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(df_bim['CM'])
        if len(np.unique(coords)) == 1:
            raise ValueError(
                'bim file has no CM data --- please use a different ld-wind option'
            )

    #compute LD-scores
    block_left = ldscore.getBlockLefts(coords, max_dist)
    if block_left[len(block_left) - 1] == 0:
        error_msg = 'Only a single block selected - this is probably a mistake'
        raise ValueError(error_msg)
    t0 = time.time()
    geno_array._currentSNP = 0
    annot_values = (None if args.annot is None else df_annot.drop(
        columns=SNP_COLUMNS).values)
    ldscores = geno_array.ldScoreVarBlocks(block_left,
                                           args.chunk_size,
                                           annot=annot_values)

    #create an ldscores df
    if args.annot is None:
        df_ldscores = pd.DataFrame(ldscores, columns=['base'])
    else:
        df_ldscores = pd.DataFrame(
            ldscores, columns=df_annot.drop(columns=SNP_COLUMNS).columns)

    #add SNP identifier columns
    for c in SNP_COLUMNS:
        df_ldscores[c] = df_bim[c].values
    df_ldscores = df_ldscores[
        SNP_COLUMNS + list(df_ldscores.drop(columns=SNP_COLUMNS).columns)]

    return df_ldscores
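
ldscore.getBlockLefts (apparently adapted from the ldsc codebase) returns, for each SNP, the index of the left-most SNP whose coordinate lies within max_dist; ldScoreVarBlocks then accumulates the (annotation-weighted) squared correlations within these sliding windows. A stand-alone sketch of the block_left computation, not the ldsc implementation itself, assuming coords is sorted in ascending order:

#Stand-alone sketch of the block_left computation (assumes coords is sorted ascending)
import numpy as np

def get_block_lefts_sketch(coords, max_dist):
    coords = np.asarray(coords)
    block_left = np.zeros(len(coords), dtype=int)
    j = 0
    for i in range(len(coords)):
        #advance the left edge until SNP j lies within max_dist of SNP i
        while coords[i] - coords[j] > max_dist:
            j += 1
        block_left[i] = j
    return block_left

bp = np.array([100, 500, 900, 5000, 5100])
print(get_block_lefts_sketch(bp, max_dist=1000))  #[0 0 0 3 3]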
Example 8
 try:
     df_snps = pd.read_parquet(args.pips)
 except (ArrowIOError, ArrowInvalid):
     df_snps = pd.read_table(args.pips, sep='\s+')
 if 'A1' not in df_snps.columns:
     raise ValueError('missing column A1')
 if 'A2' not in df_snps.columns:
     raise ValueError('missing column A2')
 if 'CHR' not in df_snps.columns:
     raise ValueError('missing column CHR')
 if 'BP' not in df_snps.columns:
     raise ValueError('missing column BP')
 if 'PIP' not in df_snps.columns:
     raise ValueError('missing column PIP')
         
 #set index
 df_snps = set_snpid_index(df_snps)
 
 #restrict to SNPs with a large PIP
 df_snps = df_snps.query('PIP>=%s'%(args.pip_cutoff))
 if df_snps.shape[0]==0:
     raise ValueError('No SNPs with PIP>=%s found'%(args.pip_cutoff))
             
 #read df_annot
 logging.info('Loading annotations file...')
 t0 = time.time()
 try:
     df_annot = pd.read_parquet(args.annot)
 except (ArrowIOError, ArrowInvalid):
     df_annot = pd.read_table(args.annot, sep='\s+')
 df_annot = set_snpid_index(df_annot)
 logging.info('Done in %0.2f seconds'%(time.time() - t0))
Example 9
    def compute_ldscores_plink_chr(self, args, chr_num, df_bins_chr):

        # read bim/snp
        bim_file = get_file_name(args, 'bim', chr_num)
        array_snps = parse.PlinkBIMFile(bim_file)
        df_bim = array_snps.df
        df_bim = set_snpid_index(df_bim)

        #Remove annotations of SNPs that are not in the .bim file
        df_bins_chr = set_snpid_index(df_bins_chr)
        df_bins_chr = df_bins_chr.loc[df_bins_chr.index.isin(df_bim.index)]

        #make sure that all SNPs have a bin
        keep_snps = None
        if np.any(~df_bim.index.isin(df_bins_chr.index)):
            error_msg = 'Not all SNPs were assigned a bin (meaning some SNPs are not in the annotation files)'
            if args.allow_missing:
                is_good_snp = df_bim.index.isin(df_bins_chr.index)
                if not np.any(is_good_snp):
                    raise ValueError(
                        'No SNPs in chromosome %d have annotations' %
                        (chr_num))
                keep_snps = np.where(is_good_snp)[0]
                logging.warning(error_msg)
                logging.warning(
                    'Keeping only %d/%d SNPs in chromosome %d that have annotations'
                    % (is_good_snp.sum(), len(is_good_snp), chr_num))
            else:
                raise ValueError(
                    error_msg +
                    '. If you wish to omit the missing SNPs, please use the flag --allow-missing'
                )

        #find #individuals in bfile
        fam_file = get_file_name(args, 'fam', chr_num)
        df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep='\s+')
        n = df_fam.shape[0]

        #find keep_indivs
        if args.keep is None:
            keep_indivs = None
        else:
            array_indivs = parse.PlinkFAMFile(args.bfile + '.fam')
            keep_indivs = __filter__(args.keep, 'individuals', 'include',
                                     array_indivs)
            logging.info('after applying --keep, %d individuals remain' %
                         (len(keep_indivs)))

        #read plink file
        logging.info('Loading SNP file...')
        bed_file = get_file_name(args, 'bed', chr_num)
        geno_array = ldscore.PlinkBEDFile(bed_file,
                                          n,
                                          array_snps,
                                          keep_snps=keep_snps,
                                          keep_indivs=keep_indivs,
                                          mafMin=None)

        #remove omitted SNPs from df_bim
        if len(geno_array.kept_snps) != df_bim.shape[0]:
            assert np.all(
                np.array(geno_array.kept_snps) == np.sort(
                    np.array(geno_array.kept_snps)))
            assert geno_array.kept_snps[-1] < df_bim.shape[0]
            df_bim = df_bim.iloc[geno_array.kept_snps]

        #rearrange annotations to match the order of SNPs in the plink file
        assert df_bins_chr.shape[0] >= df_bim.shape[0]
        if (df_bins_chr.shape[0] > df_bim.shape[0]) or np.any(
                df_bins_chr.index != df_bim.index):
            assert np.all(df_bim.index.isin(df_bins_chr.index))
            df_bins_chr = df_bins_chr.loc[df_bim.index]

        # determine block widths
        num_wind_args = np.array(
            (args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
        if np.sum(num_wind_args) != 1:
            raise ValueError('Must specify exactly one --ld-wind option')
        if args.ld_wind_snps:
            max_dist = args.ld_wind_snps
            coords = np.array(list(range(geno_array.m)))
        elif args.ld_wind_kb:
            max_dist = args.ld_wind_kb * 1000
            coords = np.array(df_bim['BP'])
            if len(np.unique(coords)) == 1:
                raise ValueError(
                    'bim file has no basepair data --- please use a different ld-wind option'
                )
        elif args.ld_wind_cm:
            max_dist = args.ld_wind_cm
            coords = np.array(df_bim['CM'])
            if len(np.unique(coords)) == 1:
                raise ValueError(
                    'bim file has no CM data --- please use a different ld-wind option'
                )

        #compute LD-scores
        block_left = ldscore.getBlockLefts(coords, max_dist)
        if block_left[len(block_left) - 1] == 0:
            error_msg = 'Only a single block selected - this is probably a mistake'
            raise ValueError(error_msg)
        t0 = time.time()
        geno_array._currentSNP = 0
        logging.info('Computing LD scores for chromosome %d' % (chr_num))
        ldscores = geno_array.ldScoreVarBlocks(
            block_left,
            args.chunk_size,
            annot=df_bins_chr.drop(columns=SNP_COLUMNS).values)

        #create an ldscores df
        df_ldscores = pd.DataFrame(
            ldscores,
            index=df_bins_chr.index,
            columns=df_bins_chr.drop(columns=SNP_COLUMNS).columns)
        df_ldscores = pd.concat((df_bins_chr[SNP_COLUMNS], df_ldscores),
                                axis=1)
        return df_ldscores
Example 10
    def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
        if constrain_range:
            logging.info('Saving constrained SNP variances to disk')
        else:
            logging.info('Saving SNP variances to disk')

        #determine which df_snpvar to use
        if use_ridge: df_snpvar = self.df_snpvar_ridge
        else: df_snpvar = self.df_snpvar

        #constrain the ratio between the largest and smallest snp-var
        if constrain_range:
            df_snpvar = df_snpvar.copy()
            h2_total = df_snpvar['SNPVAR'].sum()
            min_snpvar = df_snpvar['SNPVAR'].max() / args.q
            df_snpvar.loc[df_snpvar['SNPVAR'] < min_snpvar,
                          'SNPVAR'] = min_snpvar
            df_snpvar['SNPVAR'] *= h2_total / df_snpvar['SNPVAR'].sum()
            assert np.isclose(df_snpvar['SNPVAR'].sum(), h2_total)

        #merge snpvar with sumstats
        try:
            df_sumstats = pd.read_parquet(args.sumstats)
        except (ArrowIOError, ArrowInvalid):
            df_sumstats = pd.read_table(args.sumstats, sep='\s+')
        df_sumstats.drop(columns=['SNP'], errors='ignore', inplace=True)
        for col in ['CHR', 'BP', 'A1', 'A2']:
            if col not in df_sumstats.columns:
                raise ValueError('sumstats file has a missing column: %s' %
                                 (col))
        df_snpvar = set_snpid_index(df_snpvar, copy=True)
        df_sumstats = set_snpid_index(df_sumstats)
        snpvar_cols = df_snpvar.columns.copy()
        df_snpvar.drop(columns=['CHR', 'BP', 'A1', 'A2'], inplace=True)
        df_snpvar = df_snpvar.merge(df_sumstats,
                                    left_index=True,
                                    right_index=True)
        df_snpvar = df_snpvar[
            list(snpvar_cols) +
            [c for c in df_sumstats.columns if c not in list(snpvar_cols)]]
        if df_snpvar.shape[0] < df_sumstats.shape[0]:
            error_message = 'not all SNPs in the sumstats file are also in the annotations file'
            if args.allow_missing:
                logging.warning(error_message + '. Keeping %d/%d SNPs' %
                                (df_snpvar.shape[0], df_sumstats.shape[0]))
            else:
                raise ValueError(
                    error_message +
                    '. If you wish to omit the missing SNPs, please use the flag --allow-missing'
                )

        #iterate over chromosomes
        for chr_num in tqdm(range(1, 23)):

            #define output file name
            output_fname = 'snpvar'
            if use_ridge: output_fname += '_ridge'
            if constrain_range: output_fname += '_constrained'
            snpvar_chr_file = get_file_name(args,
                                            output_fname,
                                            chr_num,
                                            verify_exists=False)

            #save snpvar to file
            df_snpvar_chr = df_snpvar.query('CHR==%d' % (chr_num))
            df_snpvar_chr.to_csv(snpvar_chr_file,
                                 index=False,
                                 sep='\t',
                                 compression='gzip',
                                 float_format='%0.4e')
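
The constrain-range step above caps the ratio between the largest and smallest per-SNP variance at args.q: small values are floored at max/q and everything is then rescaled so the total heritability is preserved. A small numeric illustration with toy values and q=100:

#Toy illustration of the constrain-range step: floor at max/q, then rescale to keep the total
import numpy as np
import pandas as pd

df_snpvar = pd.DataFrame({'SNPVAR': [1e-4, 1e-6, 1e-9]})
q = 100.0

h2_total = df_snpvar['SNPVAR'].sum()
min_snpvar = df_snpvar['SNPVAR'].max() / q  #1e-6
df_snpvar.loc[df_snpvar['SNPVAR'] < min_snpvar, 'SNPVAR'] = min_snpvar
df_snpvar['SNPVAR'] *= h2_total / df_snpvar['SNPVAR'].sum()
assert np.isclose(df_snpvar['SNPVAR'].sum(), h2_total)
print(df_snpvar['SNPVAR'].max() / df_snpvar['SNPVAR'].min())  #ratio is now q (up to rounding)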
Example 11
def main(args):

    #read sumstats file
    try:
        df_sumstats = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_sumstats = pd.read_table(args.sumstats, sep='\s+')

    #compute p-values if needed
    if args.pvalue_cutoff is not None:
        df_sumstats['P'] = stats.chi2(1).sf(df_sumstats['Z']**2)

    #read regions file
    df_regions = pd.read_table(args.regions_file)
    if args.chr is not None:
        df_regions = df_regions.query('CHR==%d' % (args.chr))
        if df_regions.shape[0] == 0:
            raise ValueError('no SNPs found in chromosome %d' % (args.chr))
    df_regions = df_regions.loc[df_regions.apply(lambda r: np.any(
        (df_sumstats['CHR'] == r['CHR']) &
        (df_sumstats['BP'].between(r['START'], r['END']))),
                                                 axis=1)]

    #aggregate outputs
    df_sumstats_list = []
    logging.info('Aggregating results...')
    for _, r in tqdm(df_regions.iterrows()):
        chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r[
            'URL_PREFIX']

        #apply p-value filter if needed
        if args.pvalue_cutoff is not None:
            df_sumstats_r = df_sumstats.query('CHR==%d & %d <= BP <= %d' %
                                              (chr_num, start, end))
            if np.all(df_sumstats_r['P'] > args.pvalue_cutoff): continue

        output_file_r = '%s.chr%s.%s_%s.gz' % (args.out_prefix, chr_num, start,
                                               end)
        if not os.path.exists(output_file_r):
            err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist' % (
                chr_num, start, end)
            if args.allow_missing_jobs:
                logging.warning(err_msg)
                continue
            else:
                raise IOError(
                    err_msg +
                    '.\nTo override this error, please provide the flag --allow-missing-jobs'
                )
        df_sumstats_r = pd.read_table(output_file_r)

        #mark distance from center
        middle = (start + end) // 2
        df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] -
                                                       middle)
        df_sumstats_list.append(df_sumstats_r)
    if len(df_sumstats_list) == 0:
        raise ValueError('no output files found')

    #keep only the most central result for each SNP
    df_sumstats = pd.concat(df_sumstats_list, axis=0)
    df_sumstats.sort_values('DISTANCE_FROM_CENTER',
                            inplace=True,
                            ascending=True)
    df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
    df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
    del df_sumstats['DISTANCE_FROM_CENTER']
    df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)

    #write output file
    if args.adjust_beta_freq:
        df_sumstats['BETA_MEAN'] /= np.sqrt(2 * df_sumstats['MAF'] *
                                            (1 - df_sumstats['MAF']))
        df_sumstats['BETA_SD'] /= np.sqrt(2 * df_sumstats['MAF'] *
                                          (1 - df_sumstats['MAF']))
    df_sumstats.to_csv(args.out, sep='\t', index=False)
    logging.info('Wrote aggregated results to %s' % (args.out))
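
When regions overlap, the same SNP can appear in several per-region output files; the block above resolves this by sorting on DISTANCE_FROM_CENTER and keeping the first occurrence of each SNP id. A toy version of that dedup step, using a plain SNP column instead of set_snpid_index:

#Toy version of "keep only the most central result for each SNP"
import pandas as pd

df = pd.DataFrame({'SNP': ['rs1', 'rs1', 'rs2'],
                   'BETA_MEAN': [0.10, 0.12, 0.05],
                   'DISTANCE_FROM_CENTER': [50000, 1000, 2000]}).set_index('SNP')

df = df.sort_values('DISTANCE_FROM_CENTER', ascending=True)
df = df.loc[~df.index.duplicated(keep='first')]  #rs1 keeps the estimate from the closer region
del df['DISTANCE_FROM_CENTER']
print(df)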
Example 12
def compute_prs_for_file(args,
                         plink_file,
                         df_betas,
                         temp_dir,
                         ranges_file=None,
                         keep_file=None):

    #read the bim file
    plink_file_prefix = plink_file[:plink_file.rfind('.')]
    df_bim = pd.read_csv(plink_file_prefix + '.bim',
                         header=None,
                         names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'],
                         delim_whitespace=True)
    df_bim = set_snpid_index(df_bim)

    #keep only relevant SNPs
    df_betas = df_betas.merge(df_bim[['SNP']],
                              left_index=True,
                              right_index=True,
                              suffixes=('_betas', '_bim'))
    if df_betas.shape[0] == 0:
        raise ValueError('No betas found for SNPs in plink file %s' %
                         (plink_file_prefix))

    #create temp files
    betas_file = os.path.join(temp_dir, next(tempfile._get_candidate_names()))
    outfile_prs_temp = os.path.join(temp_dir,
                                    next(tempfile._get_candidate_names()))

    #save the betas to a file
    df_betas[['SNP_bim', 'A1', 'BETA']].to_csv(betas_file,
                                               header=False,
                                               index=False,
                                               sep='\t',
                                               float_format='%0.8e')

    #Run plink
    plink_exe = args.plink_exe if (args.plink_exe
                                   is not None) else args.plink2_exe
    plink_cmd = '%s --allow-no-sex --extract %s --out %s --memory %d --threads %d' % (
        plink_exe, betas_file, outfile_prs_temp, args.memory * 1024,
        args.threads)
    if plink_file.endswith('.pgen'):
        plink_cmd += ' --bpfile %s --score %s cols=scoresums' % (
            plink_file_prefix, betas_file)
    elif plink_file.endswith('.bed'):
        plink_cmd += ' --bfile %s --score %s sum' % (plink_file_prefix,
                                                     betas_file)
    else:
        raise ValueError('neither --bed nor --pgen specified')
    if ranges_file is not None:
        scores_file = os.path.join(temp_dir,
                                   next(tempfile._get_candidate_names()))
        df_betas[['SNP_bim',
                  'score']].drop_duplicates('SNP_bim').to_csv(scores_file,
                                                              sep='\t',
                                                              header=False,
                                                              index=False)
        plink_cmd += ' --q-score-range %s %s' % (ranges_file, scores_file)
    if keep_file is not None:
        plink_cmd += ' --keep %s' % (keep_file)
    os.system(plink_cmd)

    #read results
    if ranges_file is None:
        if plink_file.endswith('.bed'):
            df_prs = pd.read_csv(outfile_prs_temp + '.profile',
                                 delim_whitespace=True)
        elif plink_file.endswith('.pgen'):
            df_prs = pd.read_csv(outfile_prs_temp + '.sscore',
                                 delim_whitespace=True)
            df_prs.rename(columns={
                '#IID': 'IID',
                'SCORE1_SUM': 'SCORESUM'
            },
                          inplace=True)
            df_prs['FID'] = df_prs['IID']
        else:
            raise ValueError('neither --bed nor --pgen specified')
        df_prs.set_index('IID', inplace=True, drop=True)
        if np.any(df_prs.index.duplicated()):
            raise ValueError('duplicated iids found in %s' %
                             (plink_file_prefix))
    else:
        df_prs = None
        if plink_file.endswith('.bed'):
            jk_files = glob(outfile_prs_temp + '.*.profile')
        elif plink_file.endswith('.pgen'):
            jk_files = glob(outfile_prs_temp + '.*.sscore')
        else:
            raise ValueError('neither --bed nor --pgen specified')
        for jk_file in jk_files:
            df_jk = pd.read_csv(jk_file, delim_whitespace=True)
            df_jk.rename(columns={
                '#IID': 'IID',
                'SCORE1_SUM': 'SCORESUM'
            },
                         inplace=True)
            df_jk['FID'] = df_jk['IID']
            df_jk.set_index('IID', inplace=True, drop=True)
            if np.any(df_jk.index.duplicated()):
                raise ValueError('duplicated iids found in %s' %
                                 (plink_file_prefix))
            jk_file_basename = os.path.basename(jk_file)
            block_name = jk_file_basename.split('.')[-2]
            block_num = int(block_name[5:])
            scoresum_colname = 'SCORESUM.jk%d' % (block_num)
            df_jk.rename(columns={'SCORESUM': scoresum_colname}, inplace=True)
            if df_prs is None:
                df_prs = df_jk
                df_prs['SCORESUM'] = 0
            else:
                assert np.all(df_jk.index == df_prs.index)
                df_prs[scoresum_colname] = df_jk[scoresum_colname]
            df_prs['SCORESUM'] += df_jk[scoresum_colname]

    if df_prs is None:
        raise ValueError('The following plink command failed:\n%s' %
                         (plink_cmd))
    return df_prs
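
When a ranges_file is given, the loop above merges one score file per jackknife block: each block's SCORESUM column is renamed to SCORESUM.jk<k>, kept as its own column, and accumulated into a single total SCORESUM. A plink-free toy sketch of that merge (block numbers and values are made up):

#Plink-free toy version of merging per-jackknife-block score files
import pandas as pd

blocks = {1: pd.DataFrame({'IID': ['i1', 'i2'], 'SCORESUM': [0.5, 1.0]}).set_index('IID'),
          2: pd.DataFrame({'IID': ['i1', 'i2'], 'SCORESUM': [0.2, 0.3]}).set_index('IID')}

df_prs = None
for block_num, df_jk in blocks.items():
    scoresum_colname = 'SCORESUM.jk%d' % (block_num)
    df_jk = df_jk.rename(columns={'SCORESUM': scoresum_colname})
    if df_prs is None:
        df_prs = df_jk
        df_prs['SCORESUM'] = 0.0
    else:
        assert (df_jk.index == df_prs.index).all()
        df_prs[scoresum_colname] = df_jk[scoresum_colname]
    df_prs['SCORESUM'] += df_jk[scoresum_colname]
print(df_prs)  #SCORESUM is the total; the per-block columns are kept for jackknife estimates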
Example 13
def load_betas_files(betas_file, verbose=True):

    if verbose:
        logging.info('Loading betas file %s...' % (betas_file))
        t0 = time.time()
    try:
        df_betas = pd.read_parquet(betas_file)
        if len(df_betas.index.names) > 1:
            df_betas.reset_index(inplace=True)
    except (ArrowIOError, ArrowInvalid):
        if betas_file.endswith('.parquet'):
            raise IOError('corrupt parquet file: %s' % (betas_file))
        df_betas = pd.read_csv(betas_file, delim_whitespace=True)
    if verbose:
        logging.info('done in %0.2f seconds' % (time.time() - t0))

    #rename columns if needed
    df_betas.rename(columns={
        'sid': 'SNP',
        'nt1': 'A1',
        'nt2': 'A2',
        'BETA_MEAN': 'BETA',
        'ldpred_inf_beta': 'BETA',
        'chrom': 'CHR',
        'Chrom': 'CHR',
        'pos': 'BP'
    },
                    inplace=True,
                    errors='ignore')
    if not is_numeric_dtype(df_betas['CHR']):
        if df_betas['CHR'].str.startswith('chrom_').all():
            df_betas['CHR'] = df_betas['CHR'].str[6:].astype(int)
        else:
            raise ValueError('unknown CHR format')
    df_betas.rename(columns={
        'BETA_joint': 'BETA',
        'ALLELE1': 'A1',
        'ALLELE0': 'A2',
        'beta_mean': 'BETA',
        'MAF_BOLT': 'A1Frq',
        'Name': 'SNP',
        'A1Effect': 'BETA',
        'Chrom': 'CHR',
        'Position': 'BP',
        'beta': 'BETA'
    },
                    inplace=True,
                    errors='ignore')

    #create index
    df_betas = set_snpid_index(df_betas)

    #subset SNPs according to extract file
    if args.extract is not None:
        df_extract = pd.read_csv(args.extract, header=None).squeeze('columns')
        df_betas = df_betas.loc[df_betas['SNP'].isin(df_extract)]
        if df_betas.shape[0] == 0:
            raise ValueError('No SNPs remained after applying --extract')
        if verbose:
            logging.info('#SNPs after --extract: %s' % (df_betas.shape[0]))

    return df_betas