Example #1
    def run(self):
        # make directory if it doesn't exist
        outdir = self.outdir
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # load bed data
        beddata = processing.load_bed(self.in_bed().path).set_index('name')

        # load ss energy
        ss_dG_data_all = {}
        for windowsize, subdict in self.in_secstructure.items():
            ss_dG_data = {}
            for constraint, target in subdict.items():
                ss_dG_data[constraint] = pd.read_table(target().path,
                                                       header=None,
                                                       index_col=0,
                                                       squeeze=True,
                                                       names=['name', 'dG'])
            ss_dG_data = pd.concat(ss_dG_data,
                                   names=['constraint']).unstack(level=0)
            ss_dG_data_all['ss_ddG_%d' % windowsize] = (
                ss_dG_data.loc[:, True] -
                ss_dG_data.loc[:, False]).rename('ss_ddG')
        ss_dG_data_all = pd.concat(ss_dG_data_all, axis=1)

        # combine
        out_data = pd.concat(
            [beddata, ss_dG_data_all],
            axis=1).reset_index().loc[:, variables.bed_fields +
                                      ss_dG_data_all.columns.tolist()]

        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
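As an aside, the ss_ddG arithmetic above can be illustrated with a tiny standalone sketch (the dG values are made up, and interpreting the True/False keys as the constrained vs. unconstrained folds is an assumption about this pipeline):

import pandas as pd

# one folding free energy per site, keyed by whether the constraint was applied
ss_dG_data = {True: pd.Series({'site1': -3.2, 'site2': -7.5}),
              False: pd.Series({'site1': -5.0, 'site2': -8.1})}
ss_dG_data = pd.concat(ss_dG_data, names=['constraint']).unstack(level=0)
ss_ddG = (ss_dG_data.loc[:, True] - ss_dG_data.loc[:, False]).rename('ss_ddG')
print(ss_ddG)  # site1 ≈ 1.8, site2 ≈ 0.6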
    def run(self):
        # make directory if it doesn't exist
        outdir = self.outdir
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # load counts
        interval_radius = 40  # 1/2 width to sum over
        offset = 15
        start_loc = int(self.window_size/2-interval_radius-offset)
        end_loc = start_loc + interval_radius*2
        counts = {}

        for key, target in self.in_counts.items():
            counts['%s' % key] = pd.read_csv(target().path,
                                             compression='gzip',
                                             index_col=0).iloc[:, start_loc:end_loc].sum(axis=1)
        counts = pd.concat(counts).unstack(level=0)

        # load bed data
        beddata = processing.load_bed(self.in_bed().path, additional_cols=variables.motif_fields_additional).set_index('name')
        # load tpm
        expression = pd.read_table(self.in_tpm().path, index_col=0, squeeze=True)

        # combine
        out_data = pd.concat([beddata, expression, counts], axis=1)


        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
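The slicing indices in the variant above can be made concrete with a quick sketch (window_size = 500 is an assumed value; the task attribute supplies the real one):

window_size = 500      # assumed width of the counts window, centred on the motif
interval_radius = 40   # 1/2 width to sum over
offset = 15            # shift relative to the window centre
start_loc = int(window_size / 2 - interval_radius - offset)   # 195
end_loc = start_loc + interval_radius * 2                     # 275
# each counts table is then sliced with .iloc[:, 195:275] and summed per row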
    def run(self):
        # check directory exists
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

        # load tpm files
        rep1 = pd.read_table(self.in_rna1().path)
        rep2 = pd.read_table(self.in_rna2().path)
        tpm_data = pd.concat(
            [rep1.set_index('transcript_id').TPM.rename('rep1'),
             rep2.set_index('transcript_id').TPM.rename('rep2')],
            axis=1).reset_index().rename(columns={'transcript_id': 'transcript_idx'})
        tpm_data.index = [s.split('.')[0] for s in tpm_data.transcript_idx]
        # combine replicates as the geometric mean of the two TPM values
        tpm_combined = np.exp(np.log(tpm_data.loc[:, ['rep1', 'rep2']]).mean(axis=1))

        # load the biomart data mapping transcript id to refseq id
        biomart_data = pd.read_table(
            self.biomart_file,
            names=['gene_id', 'transcript_id', 'gene_name', 'refseq_id', 'refseq_nc'],
            header=0)
        # fall back to the noncoding refseq id when the coding id is nan
        biomart_data.loc[:, 'refseq_comb'] = [
            refseq_id if not str(refseq_id) == 'nan' else refseq_nc
            for idx, refseq_id, refseq_nc in
            biomart_data.loc[:, ['refseq_id', 'refseq_nc']].itertuples()]

        # annotate tpm data with refseq id
        biomart_data.loc[:, 'tpm'] = tpm_combined.loc[biomart_data.transcript_id].values
        # take whichever refseq id has the highest tpm among its transcripts
        tpm_refseq = biomart_data.groupby('refseq_id')['tpm'].max()

        # load bed data
        bed_data = processing.load_bed(
            self.in_bed().path,
            additional_cols=variables.motif_fields_additional)
        bed_data.loc[:, 'tpm'] = tpm_refseq.loc[bed_data.refseq_id].values
        log.info('%d out of %d motif sites had no TPM data' %
                 (bed_data.tpm.isnull().sum(), len(bed_data)))
        bed_data.loc[:, ['name', 'tpm']].to_csv(self.out_motif_tpm().path,
                                                sep='\t', index=False)
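The replicate combination used above (exp of the mean log) is simply the geometric mean of the two TPM values; a minimal sketch with hypothetical transcript ids:

import numpy as np
import pandas as pd

tpm = pd.DataFrame({'rep1': [10.0, 4.0], 'rep2': [40.0, 9.0]},
                   index=['ENST0000000001', 'ENST0000000002'])  # hypothetical ids
tpm_combined = np.exp(np.log(tpm).mean(axis=1))
print(tpm_combined)  # ≈ 20.0 and 6.0, i.e. sqrt(rep1 * rep2)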
    def run(self):
        # make directory if it doesn't exist
        outdir = self.outdir
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # load counts
        counts = {}
        for key, target in self.in_counts.items():
            data_table = pd.read_csv(target().path,
                                     compression='gzip',
                                     index_col=0)
            counts['%s' % key] = processing.get_counts_from_counts_table(
                data_table)
        counts = pd.concat(counts).unstack(level=0)

        # load seqdata
        seqdata = pd.read_table(self.in_seq().path,
                                compression='gzip',
                                index_col=0)

        # load seqdata_effects
        seqeffect = pd.read_table(self.in_effect().path,
                                  compression='gzip',
                                  index_col=0)
        #seqeffect.loc[:, 'flag'] = pd.Series({idx:seqmodel.flag_ensemble(row.drop('ddG')-row.ddG) for idx, row in seqeffect.iterrows()})

        noflip_cols = [idx for idx in seqeffect if idx.find('noflip') == 0]
        flip_cols = [
            idx for idx in seqeffect
            if idx.find('flip') == 0 or idx.find('doubleflip') == 0
        ]
        seqeffect.loc[:, 'ddG_noflip_noens'] = seqeffect.loc[:, noflip_cols[0]]
        seqeffect.loc[:, 'ddG_noflip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, noflip_cols], self.temperature)
        seqeffect.loc[:, 'ddG_flip'] = seqmodel.compute_ensemble_ddG_set(
            seqeffect.loc[:, flip_cols], self.temperature)
        keep_cols = [idx for idx in seqeffect if idx.find('ddG') == 0]

        # load bed data
        beddata = processing.load_bed(
            self.in_bed().path,
            additional_cols=variables.motif_fields_additional).set_index(
                'name')

        # load tpm
        expression = pd.read_table(self.in_tpm().path,
                                   index_col=0,
                                   squeeze=True)

        # combine
        out_data = pd.concat([
            beddata, counts, expression, seqdata, seqeffect.loc[:, keep_cols]
        ],
                             axis=1)
        out_data.loc[:, 'clip_signal_per_tpm'] = (out_data.rep1 +
                                                  out_data.rep2) / out_data.tpm
        out_data.loc[:, 'clip_input_per_tpm'] = (out_data.input) / out_data.tpm

        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
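A sketch of what seqmodel.compute_ensemble_ddG_set is assumed to do with the flip/no-flip columns, following the explicit formula written out in a later example (kT * log(sum(exp(ddG/kT)))); the helper name, the toy values and the kT sign convention below are assumptions:

import numpy as np
import pandas as pd

def ensemble_ddG(ddG_table, kT):
    # Boltzmann-style combination of per-register ddG columns into one
    # ensemble ddG per site, mirroring the explicit formula in the later
    # example; kT would come from seqmodel.get_ddG_conversion(temperature)
    return kT * np.log(np.exp(ddG_table / kT).sum(axis=1))

# toy values for two sites and two hypothetical no-flip registers
ddGs = pd.DataFrame({'noflip_0': [0.0, 1.2], 'noflip_1': [2.5, 0.3]})
print(ensemble_ddG(ddGs, kT=-0.593))  # lies at or below the per-row minimum ddG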
Example #5
    def run(self):
        # make out directory if it doesn't exist
        dirname = os.path.dirname(self.out_bed().path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        combined_data = pd.concat(
            [processing.load_bed(target().path) for target in self.in_beds])
        processing.save_bed(combined_data.sort_values(['chrm', 'start']),
                            self.out_bed().path)
    def run(self):
        # make directory if it doesn't exist
        outdir = self.outdir
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # load counts
        interval_radius = 40  # 1/2 width to sum over
        offset = 15
        start_loc = int(self.window_size/2-interval_radius-offset)
        end_loc = start_loc + interval_radius*2
        counts = {}

        for key, target in self.in_counts.items():
            counts['%s' % key] = pd.read_csv(target().path,
                                             compression='gzip',
                                             index_col=0).iloc[:, start_loc:end_loc].sum(axis=1)
        counts = pd.concat(counts).unstack(level=0)

        # load seqdata
        seqdata = pd.read_table(self.in_seq().path, compression='gzip', index_col=0)

        # load seqdata_effects
        seqeffect = pd.read_table(self.in_effect().path, compression='gzip', index_col=0)
        seqeffect.loc[:, 'flag'] = pd.Series({idx: seqmodel.flag_ensemble(row.drop('ddG') - row.ddG)
                                              for idx, row in seqeffect.iterrows()})

        kT = seqmodel.get_ddG_conversion(self.temperature)
        noflip_cols = [idx for idx in seqeffect if idx.find('noflip') == 0]
        seqeffect.loc[:, 'ddG_noflip'] = kT * np.log(np.exp(seqeffect.loc[:, noflip_cols] / kT).sum(axis=1))

        # load bed data
        beddata = processing.load_bed(self.in_bed().path,
                                      additional_cols=variables.motif_fields_additional).set_index('name')

        # load tpm
        expression = pd.read_table(self.in_tpm().path, index_col=0, squeeze=True)

        # load ss energy
        ss_dG_data = {}
        for constraint, target in self.in_secstructure.items():
            ss_dG_data[constraint] = pd.read_table(target().path,
                                                   header=None,
                                                   index_col=0,
                                                   squeeze=True,
                                                   names=['motif', 'dG'])
        ss_dG_data = pd.concat(ss_dG_data, names=['constraint']).unstack(level=0)
        ss_dG_diff = (ss_dG_data.loc[:, True] - ss_dG_data.loc[:, False]).rename('ss_ddG')

        # combine
        out_data = pd.concat([beddata, counts, expression, seqdata, seqeffect.ddG, seqeffect.flag, ss_dG_diff], axis=1)
        out_data.loc[:, 'clip_signal_per_tpm'] = (out_data.rep1 + out_data.rep2)/out_data.tpm
        out_data.loc[:, 'clip_input_per_tpm'] = (out_data.input)/out_data.tpm

        out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
    def run(self):

        # make out directory if it doesn't exist
        dirname = os.path.dirname(self.out_bed().path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # load regions
        regions = processing.load_bed(self.in_regions().path)
        regions.loc[:, 'int_size'] = regions.stop - regions.start
        regions.loc[:, 'cum_int_size'] = np.cumsum(regions.int_size)

        # find n start sites
        n = self.num_random
        if not pd.isnull(self.seed):
            np.random.seed(self.seed)
        vals = np.sort(
            np.random.choice(np.arange(regions.int_size.sum()), size=n))
        locations = np.searchsorted(regions.cum_int_size, vals)
        strands = np.random.choice(['+', '-'], size=n)

        # go through each location and get the interval
        new_regions = []
        for i, (loc, val) in enumerate(zip(locations, vals)):
            region = regions.iloc[loc]
            diff = region.cum_int_size - val
            new_start = region.start + diff
            new_stop = new_start + self.window_size
            new_region = pd.Series({
                'chrm': region.chrm,
                'start': new_start,
                'stop': new_stop,
                'name': '%s_%d' % (region.loc['name'], i),
                'strand': strands[i],
                'score': '.'
            })
            new_regions.append(new_region)
        new_regions = pd.concat(new_regions,
                                axis=1).transpose().loc[:,
                                                        variables.bed_fields]
        processing.save_bed(new_regions, self.out_bed().path)
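The random-placement trick above (cumulative interval sizes plus np.searchsorted) can be seen in isolation with made-up regions:

import numpy as np
import pandas as pd

regions = pd.DataFrame({'chrm': ['chr1', 'chr1', 'chr2'],
                        'start': [100, 500, 900],
                        'int_size': [50, 30, 20]})
regions['cum_int_size'] = np.cumsum(regions.int_size)   # 50, 80, 100

np.random.seed(0)
# sample offsets uniformly over the total covered length (100 bases here),
# then map each offset back to a containing region via the cumulative sizes
vals = np.sort(np.random.choice(np.arange(regions.int_size.sum()), size=3))
locations = np.searchsorted(regions.cum_int_size, vals)
print(list(zip(vals, regions.chrm.values[locations])))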
    def run(self):
        annfile = self.in_filt_dat().path
        interfile_basename = os.path.splitext(self.in_filt_dat().path)[0]
        filtfile = interfile_basename + '.filt.bed.tmp'

        # filter
        process_annotations = (
            'awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
            'if ($NF=="") {gene_type="NA"} else {gene_type=$NF}; '
            'n=index($8, " ("); '
            'if (n>0) {ann=substr($8, 1, n-1); notann=substr($8, n+2, length($8)-n-2);} '
            'else {ann=$8; notann=""}; '
            'm=index(notann, ","); '
            'if (m>0) {gene=substr(notann, 1, m-1); exon=substr(notann, m+2, length(notann));} '
            'else {gene=notann; exon=""}; '
            'print $2, $3-1, $4, $1, $6, $5, ann, gene, exon, gene_type}\'')

        # only include protein coding genes or the NORAD subset
        #apply_filter = ('awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
        #                 'if (($NF=="protein-coding" &&($7=="exon" || index($7, "UTR")==4)) || ($NF=="ncRNA" && $8=="%s")) print}\'')%self.nc_gene
        if self.filter_genetype:
            apply_filter = (
                'awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
                'if (($NF=="protein-coding" &&($7=="exon" || index($7, "UTR")==4)) || '
                '($NF=="ncRNA" && $7=="non-coding")) print}\'')
            filter_call_sub = process_annotations + ' | ' + apply_filter
        else:
            filter_call_sub = process_annotations

        # process and apply filter
        filter_call = (
            'tail -n+2 %s | ' + filter_call_sub +
            ' | grep -v chrUn | grep -v random | bedtools sort -i stdin > %s'
        ) % (annfile, filtfile)

        log.info(filter_call)
        subprocess.call(filter_call, shell=True)

        # find closest transcript and only keep that which aligns
        strand_call = ((
            'bedtools closest -d -a %s -b %s | '
            'awk -F "\\t" \'{OFS="\\t"}{'
            'distance=$NF; '
            'genemotif=$8; '
            'geneclosest=$14; '
            'strandmotif=$6; '
            'strandgene=$16; '
            'if ((genemotif==geneclosest && strandmotif==strandgene) || genemotif=="") print}\' | '
            'cut -f 1-10 | awk \'{print}\'> %s') %
                       (filtfile, self.transcript_bed, self.out_file().path))

        # do calls

        log.info(strand_call)
        subprocess.call(strand_call, shell=True)

        # remove duplicates
        bed_data = processing.load_bed(
            self.out_file().path,
            additional_cols=variables.motif_fields_additional)
        bed_data = bed_data.groupby('name').first().reset_index(
        ).loc[:, variables.bed_fields + variables.motif_fields_additional]
        bed_data.to_csv(self.out_file().path,
                        index=False,
                        header=False,
                        sep='\t')
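For readers less fluent in awk, the annotation-splitting logic in process_annotations is roughly equivalent to the following pure-Python sketch (the example field value is made up):

def parse_annotation(field8):
    # split an annotation field of the form "ann (gene, exon)" into its parts,
    # mirroring the index()/substr() calls in the awk program above
    n = field8.find(' (')
    if n >= 0:
        ann, notann = field8[:n], field8[n + 2:-1]   # drop the trailing ')'
    else:
        ann, notann = field8, ''
    m = notann.find(',')
    if m >= 0:
        gene, exon = notann[:m], notann[m + 2:]      # skip the ', ' separator
    else:
        gene, exon = notann, ''
    return ann, gene, exon

print(parse_annotation("3'UTR (NM_000059, exon 11)"))
# ("3'UTR", 'NM_000059', 'exon 11')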