def process(self):
    # The sites file is expected to provide 'chr' and 'coordinate' columns.
    all_sites = pd.read_csv(self.sites_file, usecols=['chr', 'coordinate'])
    all_sites = get_winid.convert_chr_to_num(all_sites)
    chrs = np.sort(all_sites['chr'].unique())
    all_sites_closest = []
    for chr in chrs:
        print('processing sites on chr ' + str(chr))
        chr_file = self.data_dir + 'chr' + str(chr) + '.tsv'
        if not os.path.exists(chr_file):
            self.split_by_chr()
        chr_sites = all_sites.query('chr==@chr').copy()
        chr_sites['coordinate'] = chr_sites['coordinate'].astype('i8')
        chr_sites['end'] = chr_sites['coordinate'] + 1
        # Turn each site into a 1-bp interval and find the closest Eigen record.
        chr_sites = BedTool([tuple(x[1]) for x in chr_sites.iterrows()])
        chr_sites_closest = chr_sites.closest(chr_file,
                                              d=True,
                                              nonamecheck=True)
        for row in chr_sites_closest:
            all_sites_closest.append(
                [row[0], row[1], row[6], row[7], row[8]])
        del chr_sites_closest
        del chr_sites
        gc.collect()
    all_sites_closest = pd.DataFrame(all_sites_closest,
                                     columns=[
                                         'chr', 'coordinate',
                                         'eigen_phred', 'eigen_pc_phred',
                                         'distance_to_nearest_eigen'
                                     ])
    all_sites_closest = all_sites_closest.groupby(
        ['chr', 'coordinate']).apply(self.mean_max).reset_index()
    with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
        h5s['Eigen'] = all_sites_closest
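
The groupby above reduces each site's closest-Eigen rows via self.mean_max, which is not shown on this page. A minimal sketch of such an aggregator, assuming it collapses each group into mean/max scores plus the minimum distance (the output column names are illustrative, chosen to match the Eigen example further down):

def mean_max(self, group):
    # Hypothetical aggregator for the groupby-apply above; the real method in
    # the repository may use different column names or reductions.
    scores = group[['eigen_phred', 'eigen_pc_phred']].astype('f8')
    return pd.Series({
        'eigen_max_phred': scores['eigen_phred'].max(),
        'eigen_avg_phred': scores['eigen_phred'].mean(),
        'eigen_max_pc_phred': scores['eigen_pc_phred'].max(),
        'eigen_avg_pc_phred': scores['eigen_pc_phred'].mean(),
        'distance_to_nearest_eigen':
            group['distance_to_nearest_eigen'].astype('f8').min(),
    })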
Example 2
    def process(self):
        all_sites = pd.read_csv(self.sites_file)
        all_sites = get_winid.convert_chr_to_num(all_sites)
        dann_scores = []
        dann_file = self.data_dir + 'DANN_whole_genome_SNVs.tsv.bgz'
        logger.info('DANN raw file is {}'.format(dann_file))
        tabix = pysam.Tabixfile(dann_file)
        i = 0
        for site in all_sites.values:
            scores_one_site = []
            chrm = convert_num_to_chrstr(int(site[1]))
            pos = int(site[2])
            # Widen the fetch window around the site one base at a time on
            # each side until at least one DANN record is returned.
            left = pos
            right = pos - 1
            while len(scores_one_site) == 0:
                left = left - 1
                right = right + 1
                for row in tabix.fetch(chrm,
                                       left,
                                       right,
                                       parser=pysam.asTuple()):
                    scores_one_site.extend([float(row[-1])])
            average_score = np.mean(scores_one_site)
            max_score = np.max(scores_one_site)
            dann_scores.extend(
                [[convert_chrstr_to_num(chrm), pos, max_score, average_score]])
            i += 1
            if i % 1000 == 0:
                logger.info('Processed {} sites...'.format(i))
                #print([chrm,pos,max_score,average_score])

        with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
            h5s['DANN'] = pd.DataFrame(dann_scores,
                                       columns=[
                                           'chr', 'coordinate',
                                           'DANN_max_score', 'DANN_avg_score'
                                       ])
            logger.info(
                'DANN features of sites in {} are outputted to {}'.format(
                    self.sites_file, self.additional_feature_file))
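
The DANN example above and the Eigen example further down both rely on convert_num_to_chrstr and convert_chrstr_to_num to translate between the numeric chromosome encoding used in the site tables and the chromosome names expected by the tabix indexes. Those helpers are not shown on this page; a minimal sketch, assuming chromosomes 1-22 plus X and Y encoded as 23 and 24 (the real helpers may cover more cases):

# Hypothetical helpers; the encoding of X/Y as 23/24 is an assumption.
def convert_num_to_chrstr(num):
    if num == 23:
        return 'X'
    if num == 24:
        return 'Y'
    return str(num)


def convert_chrstr_to_num(chrm):
    if chrm == 'X':
        return 23
    if chrm == 'Y':
        return 24
    return int(chrm)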


#data_dir = '/Users/Xiaobo/Desktop/test.tsv'
#sites_file = '/Users/Xiaobo/Jobs/CpG/data/all_sites_winid.csv'
#win_path = '/home/ec2-user/CpGPython/data/wins.txt'

#---------------

#all_sites.sort_values(['chr','coordinate'],inplace=True)

#wins = get_winid.read_wins(win_path,chrs)
#-----------------
Example 3
    def cal_counts(self, h5s, file, wins):
        logger.info('start binning {}-{}'.format(self.data_type, file))
        if self.data_type == 'WGBS':
            bed = self.read_WGBS(self.data_dir + file)
        else:
            bed = pd.read_csv(self.data_dir + file,
                              usecols=[0, 1, 2, 5],
                              header=None,
                              names=['chr', 'pos1', 'pos2', 'strand'],
                              sep=r'\s+')
            bed = bed[bed['chr'].apply(lambda x: x.startswith('chr'))]
            #bed['chr'] = bed['chr'].apply(lambda x: 'chr'+x.split('.')[-1] if not x.startswith('chr') else x)
            #bed = self.read_bed((self.data_dir+file))
        bed = get_winid.convert_chr_to_num(bed, self.chrs)

        if self.data_type == 'WGBS':
            bed = pd.merge(wins,
                           bed,
                           left_on=['oldChr', 'oldCoordinate'],
                           right_on=['chr', 'coordinate'],
                           how='left').dropna()
            bed = bed.drop(
                ['chr_y', 'coordinate_y', 'oldChr', 'oldCoordinate'],
                axis=1).rename(columns={
                    'chr_x': 'chr',
                    'coordinate_x': 'coordinate'
                }).sort_values(['chr', 'coordinate']).reset_index(drop=True)
            bed_counts = bed.groupby(['winid']).aggregate({
                'count': 'mean'
            }).reset_index()
        else:
            logger.info(
                'Getting window reads proportional to overlapping length on windows'
            )
            bed = get_winid.get_window_reads(
                wins, bed, start_index=0).dropna()  #.sort_values(['winid'])
            bed_counts = bed.groupby(['winid']).aggregate({
                'count': 'sum'
            }).reset_index()
            del bed
            gc.collect()
        bed_counts.rename(
            columns={'count': file[:-4] + '_' + self.data_type + '_counts'},
            inplace=True)
        h5s[file[:-4]] = bed_counts
        logger.info('binning {} is done'.format(file))
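
cal_counts is written to be called once per raw read file, storing each file's binned window counts under its own key in a shared HDF5 store. A minimal driver could look like the sketch below; 'binner' stands for an instance of the class defining cal_counts, and win_path, h5s_file and the file filter are assumed names for illustration (data_dir and chrs appear in the method above):

# Illustrative driver for cal_counts; binner, win_path and h5s_file are
# assumptions, not taken from the original code.
import os
import pandas as pd

wins = get_winid.read_wins(binner.win_path, binner.chrs)   # window definitions
bed_files = [f for f in os.listdir(binner.data_dir)
             if f.endswith('.bed') or f.endswith('.tsv')]
with pd.HDFStore(binner.h5s_file, 'w') as h5s:
    for f in bed_files:
        binner.cal_counts(h5s, f, wins)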
Example 4
def process(self):
    all_sites = pd.read_csv(self.sites_file)
    all_sites = get_winid.convert_chr_to_num(all_sites)
    counts_at_targets = pd.DataFrame(
        all_sites['winid'])  #.sort_values(['winid'])
    if self.data_type == 'ATAC' or self.data_type == 'WGBS':
        with pd.HDFStore(self.h5s_file, 'r') as h5s:
            for key in h5s.keys():
                bed_counts = h5s[key]
                counts_at_targets = pd.merge(counts_at_targets,
                                             bed_counts,
                                             on=['winid'],
                                             how='left')
                col = key[1:] + '_' + self.data_type + '_counts'
                counts_at_targets[col] = counts_at_targets[col].fillna(0)
                logger.info(
                    'merging select sites with {} {} is done'.format(
                        self.data_type, key))
    elif self.data_type == 'RNASeq':
        # For RNASeq, self.h5s_file is a directory holding one HDF5 store per experiment.
        for f in [
                f for f in os.listdir(self.h5s_file)
                if os.path.isfile(os.path.join(self.h5s_file, f))
        ]:
            with pd.HDFStore(os.path.join(self.h5s_file, f), 'r') as h5s:
                logger.info(
                    "processing RNASeq feature of experiment {}".format(
                        os.path.join(self.h5s_file, f)))
                for key in h5s.keys():
                    bed_counts = h5s[key]
                    counts_at_targets = pd.merge(counts_at_targets,
                                                 bed_counts,
                                                 on=['winid'],
                                                 how='left')
                    col = key[1:] + '_' + self.data_type + '_counts'
                    counts_at_targets[col] = counts_at_targets[col].fillna(0)
                    logger.info(
                        'processing RNASeq feature of sample {} is done'.
                        format(key))
    else:
        logger.error('Unsupported data type: ' + self.data_type)
        sys.exit(1)
    with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
        h5s[self.data_type] = counts_at_targets
        logger.info(
            'Merged {} features to selected sites are saved to {}'.format(
                self.data_type, self.additional_feature_file))
Example 5
    def readcount_WGBS(self, h5s, file):
        # Convert each hg38 WGBS file to hg19 coordinates.
        bed = pd.read_csv(
            self.data_dir + file,
            usecols=[0, 1, 2, 5, 9, 10],
            header=None,
            names=['chr', 'pos1', 'pos2', 'strand', 'total', 'percent'],
            sep=r'\s+')
        bed.dropna(inplace=True)
        bed['coordinate'] = np.where(
            bed['strand'] == '+', bed['pos1'],
            bed['pos1'] - 1)  ##read 0-based WGBS bed, merge +/- strand
        bed.drop(['pos1', 'pos2'], axis=1, inplace=True)
        bed['count'] = np.round(bed['total'] * bed['percent'] / 100.0)
        bed.drop(['total', 'percent'], axis=1, inplace=True)
        bed = get_winid.convert_chr_to_num(bed, self.chrs)
        bed = bed.groupby(['chr', 'coordinate']).aggregate({
            'count': 'sum'
        }).reset_index()
        bed = pd.merge(self.hg19_sites,
                       bed,
                       left_on=['hg38chr', 'hg38coordinate'],
                       right_on=['chr', 'coordinate'],
                       how='left').dropna()
        bed = bed.drop(['chr_y', 'coordinate_y', 'hg38chr', 'hg38coordinate'],
                       axis=1).rename(columns={
                           'chr_x': 'chr',
                           'coordinate_x': 'coordinate'
                       }).sort_values(['chr',
                                       'coordinate']).reset_index(drop=True)
        bed = bed.groupby(['chr', 'coordinate']).aggregate({
            'count': 'sum'
        }).reset_index()
        bed.rename(columns={'count': file[:-4] + '_WGBS_counts'}, inplace=True)
        h5s[file[:-4]] = bed
        logger.info("WGBS: " + file + ' is coordinate converted')
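
The merge in readcount_WGBS relies on self.hg19_sites, a table mapping each hg19 site to its lifted-over hg38 position with columns chr, coordinate, hg38chr and hg38coordinate (inferred from the merge keys). That table is not built on this page; loading it might look roughly like the sketch below, where the file name is purely illustrative:

        # Hypothetical loader (e.g. in the class __init__) for the hg19<->hg38
        # site mapping; only the column names are taken from the merge above.
        self.hg19_sites = pd.read_csv(
            self.data_dir + 'hg19_sites_with_hg38_coordinates.csv',
            usecols=['chr', 'coordinate', 'hg38chr', 'hg38coordinate'])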
Example 6
    def process(self):
        all_sites = pd.read_csv(self.sites_file)
        all_sites = get_winid.convert_chr_to_num(all_sites)
        #all_sites.sort_values(['chr','coordinate'],inplace=True)

        #reg = re.compile('^Eigen.*bgz$')
        #reg1 = re.compile('chr[0-9]{1,2}')
        #files = os.listdir(data_dir)
        #files = [f for f in files if (len(reg.findall(f))>0) and (reg1.findall(f)[0][3:] in chrs)]
        eigen_scores = []
        tabix_by_chr = {}  # one tabix handle per chromosome, opened lazily below
        i = 0
        for site in all_sites.values:
            #raw_scores_one_site = []
            phred_one_site = []
            #pc_raw_scores_one_site = []
            pc_phred_one_site = []
            chrm = convert_num_to_chrstr(int(site[1]))
            pos = int(site[2])
            # Widen an initially empty window around the site one base at a
            # time until at least one Eigen record is found.
            left = pos
            right = pos - 1
            eigen_file = self.data_dir + 'Eigen_hg19_noncoding_annot_chr' + chrm + '.tab.bgz'
            if chrm not in tabix_by_chr:
                tabix_by_chr[chrm] = pysam.Tabixfile(eigen_file)
            tabix = tabix_by_chr[chrm]
            while len(phred_one_site) == 0:
                left = left - 1
                right = right + 1
                for row in tabix.fetch(chrm,
                                       left,
                                       right,
                                       parser=pysam.asTuple()):
                    #raw_scores_one_site.extend([float(row[-4])])
                    phred_one_site.extend([float(row[-3])])
                    #pc_raw_scores_one_site.extend([float(row[-2])])
                    pc_phred_one_site.extend([float(row[-1])])
            #average_raw = np.mean(raw_scores_one_site)
            #max_raw = np.max(raw_scores_one_site)
            average_phred = np.mean(phred_one_site)
            max_phred = np.max(phred_one_site)
            #average_pc_raw = np.mean(pc_raw_scores_one_site)
            #max_pc_raw = np.max(pc_raw_scores_one_site)
            average_pc_phred = np.mean(pc_phred_one_site)
            max_pc_phred = np.max(pc_phred_one_site)
            eigen_scores.extend([[
                convert_chrstr_to_num(chrm), pos, max_phred, average_phred,
                max_pc_phred, average_pc_phred
            ]])
            #eigen_scores.extend([[chrm,pos,max_raw,average_raw,max_phred,average_phred,max_pc_raw,average_pc_raw,max_pc_phred,average_pc_phred]])
            i += 1
            if i % 1000 == 0:
                #print([chrm,pos,max_raw,average_raw,max_phred,average_phred,max_pc_raw,average_pc_raw,max_pc_phred,average_pc_phred])
                logger.info('Eigen raw file for chromsome {} is {}'.format(
                    chrm, eigen_file))
                logger.info('Processed {} sites...'.format(i))
                #print([chrm,pos,max_phred,average_phred,max_pc_phred,average_pc_phred])

        with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
            #h5s['Eigen'] = pd.DataFrame(eigen_scores,columns=['chr','coordinate','eigen_max_raw','eigen_avg_raw','eigen_max_phred','egien_avg_phred','eigen_max_pc_raw','eigen_avg_pc_raw','eigen_max_pc_phred','egien_avg_pc_phred'])
            h5s['Eigen'] = pd.DataFrame(eigen_scores,
                                        columns=[
                                            'chr', 'coordinate',
                                            'eigen_max_phred',
                                            'eigen_avg_phred',
                                            'eigen_max_pc_phred',
                                            'eigen_avg_pc_phred'
                                        ])
            logger.info(
                'Eigen features of sites in {} are outputted to {}'.format(
                    self.sites_file, self.additional_feature_file))


Example 7
all_sites.drop(['start', 'end'], axis=1, inplace=True)

additional_features = [
    'ATAC', 'CADD', 'DANN', 'Eigen', 'GenoCanyon', 'RNASeq', 'WGBS', 'GWAVA'
]
#merge with additional features
with pd.HDFStore(feature_dir + 'addtional_features', 'r') as h5s:
    for feature in additional_features:
        feature_frame = h5s[feature]
        all_sites = pd.concat([
            all_sites.reset_index(drop=True),
            feature_frame.reset_index(drop=True)
        ],
                              axis=1)
all_sites = all_sites.loc[:, ~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

#nearest tss distance
chrs = all_sites['chr'].unique()
cols = ['chr', 'coordinate', 'strand']
tss = pd.read_csv(home + 'data/commons/tss.txt',
                  sep=r'\s+',
                  header=None,
                  names=cols,
                  skiprows=1)
tss = get_winid.convert_chr_to_num(tss, chrs)
tss.sort_values(['chr', 'coordinate'], inplace=True)
all_sites = nearest_tss(tss, all_sites)

with pd.HDFStore(home + 'data/' + dataset + '/all_features', 'w') as h5s:
    h5s['all_features'] = all_sites
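
nearest_tss is called above but not defined on this page. One way it could be implemented is with pandas.merge_asof per chromosome; the sketch below is an assumption (only the signature is taken from the call site, and the dist_to_nearest_tss column name is illustrative):

# Hypothetical implementation of nearest_tss; the real helper may differ.
def nearest_tss(tss, sites):
    pieces = []
    for c, chr_sites in sites.groupby('chr'):
        chr_tss = tss[tss['chr'] == c].sort_values('coordinate')
        chr_sites = chr_sites.sort_values('coordinate')
        # merge_asof requires both frames to be sorted on the join key
        merged = pd.merge_asof(
            chr_sites,
            chr_tss[['coordinate']].rename(columns={'coordinate': 'tss_coordinate'}),
            left_on='coordinate',
            right_on='tss_coordinate',
            direction='nearest')
        merged['dist_to_nearest_tss'] = (merged['coordinate'] -
                                         merged['tss_coordinate']).abs()
        pieces.append(merged.drop('tss_coordinate', axis=1))
    return pd.concat(pieces).sort_values(['chr', 'coordinate']).reset_index(drop=True)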