def run(self):
    # make directory if it doesn't exist
    outdir = self.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # load bed data
    beddata = processing.load_bed(self.in_bed().path).set_index('name')

    # load ss energy for each window size and constraint
    ss_dG_data_all = {}
    for windowsize, subdict in self.in_secstructure.items():
        ss_dG_data = {}
        for constraint, target in subdict.items():
            ss_dG_data[constraint] = pd.read_table(
                target().path, header=None, index_col=0, squeeze=True,
                names=['name', 'dG'])
        ss_dG_data = pd.concat(ss_dG_data, names=['constraint']).unstack(level=0)
        # ss_ddG = dG(constrained) - dG(unconstrained)
        ss_dG_data_all['ss_ddG_%d' % windowsize] = (
            ss_dG_data.loc[:, True] - ss_dG_data.loc[:, False]).rename('ss_ddG')
    ss_dG_data_all = pd.concat(ss_dG_data_all, axis=1)

    # combine
    out_data = pd.concat(
        [beddata, ss_dG_data_all],
        axis=1).reset_index().loc[:, variables.bed_fields + ss_dG_data_all.columns.tolist()]
    out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
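# The concat/unstack pattern above turns a {constraint: Series of dG} dict into
# a table with one column per constraint, so .loc[:, True] - .loc[:, False] is
# the constrained-minus-unconstrained folding penalty. A tiny illustration with
# made-up dG values (not from this pipeline):
import pandas as pd

dG_constrained = pd.Series({'motif1': -3.0, 'motif2': -5.0})
dG_unconstrained = pd.Series({'motif1': -8.0, 'motif2': -6.0})
table = pd.concat({True: dG_constrained, False: dG_unconstrained},
                  names=['constraint']).unstack(level=0)
ss_ddG = (table.loc[:, True] - table.loc[:, False]).rename('ss_ddG')
# ss_ddG: motif1 -> 5.0, motif2 -> 1.0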
def run(self):
    # make directory if it doesn't exist
    outdir = self.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # load counts, summing over an interval offset from the window center
    interval_radius = 40  # 1/2 width to sum over
    offset = 15
    start_loc = int(self.window_size/2 - interval_radius - offset)
    end_loc = start_loc + interval_radius*2
    counts = {}
    for key, target in self.in_counts.items():
        counts[key] = pd.read_csv(
            target().path, compression='gzip',
            index_col=0).iloc[:, start_loc:end_loc].sum(axis=1)
    counts = pd.concat(counts).unstack(level=0)

    # load bed data
    beddata = processing.load_bed(
        self.in_bed().path,
        additional_cols=variables.motif_fields_additional).set_index('name')

    # load tpm
    expression = pd.read_table(self.in_tpm().path, index_col=0, squeeze=True)

    # combine
    out_data = pd.concat([beddata, expression, counts], axis=1)
    out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
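# A quick sketch of the slice arithmetic above, with window_size=500 as a
# purely hypothetical value (self.window_size actually comes from the task
# parameters): the summed interval is 2*interval_radius columns wide and
# shifted upstream of the window center by `offset`.
def _example_count_window(window_size=500, interval_radius=40, offset=15):
    start_loc = int(window_size / 2 - interval_radius - offset)
    end_loc = start_loc + interval_radius * 2
    return start_loc, end_loc  # (195, 275) for the defaults above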
def run(self):
    # make directory if it doesn't exist
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)

    # load tpm files
    rep1 = pd.read_table(self.in_rna1().path)
    rep2 = pd.read_table(self.in_rna2().path)
    tpm_data = pd.concat(
        [rep1.set_index('transcript_id').TPM.rename('rep1'),
         rep2.set_index('transcript_id').TPM.rename('rep2')],
        axis=1).reset_index().rename(columns={'transcript_id': 'transcript_idx'})
    # index by transcript id with the version suffix stripped
    tpm_data.index = [s.split('.')[0] for s in tpm_data.transcript_idx]
    # combine replicates with a geometric mean
    tpm_combined = np.exp(np.log(tpm_data.loc[:, ['rep1', 'rep2']]).mean(axis=1))

    # load the biomart data mapping transcript id to refseq id
    biomart_data = pd.read_table(
        self.biomart_file,
        names=['gene_id', 'transcript_id', 'gene_name', 'refseq_id', 'refseq_nc'],
        header=0)

    # use the noncoding refseq id wherever the coding id is missing
    biomart_data.loc[:, 'refseq_comb'] = [
        refseq_id if not str(refseq_id) == 'nan' else refseq_nc
        for idx, refseq_id, refseq_nc in
        biomart_data.loc[:, ['refseq_id', 'refseq_nc']].itertuples()]

    # annotate tpm data with refseq id
    biomart_data.loc[:, 'tpm'] = tpm_combined.loc[biomart_data.transcript_id].values

    # for each refseq id, keep the transcript with the highest tpm
    tpm_refseq = biomart_data.groupby('refseq_id')['tpm'].max()

    # load bed data and annotate each motif site with its transcript tpm
    bed_data = processing.load_bed(
        self.in_bed().path, additional_cols=variables.motif_fields_additional)
    bed_data.loc[:, 'tpm'] = tpm_refseq.loc[bed_data.refseq_id].values
    log.info('%d out of %d motif sites had no TPM data'
             % (bed_data.tpm.isnull().sum(), len(bed_data)))
    bed_data.loc[:, ['name', 'tpm']].to_csv(
        self.out_motif_tpm().path, sep='\t', index=False)
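# The replicate TPMs above are combined as a geometric mean (exp of the mean of
# logs), i.e. an average on the log scale. A minimal standalone sketch; the
# function name is illustrative only, not part of the pipeline:
import numpy as np
import pandas as pd

def _combine_tpm_replicates(rep1, rep2):
    """Geometric mean of two replicate TPM Series sharing an index."""
    tpm = pd.concat([rep1.rename('rep1'), rep2.rename('rep2')], axis=1)
    return np.exp(np.log(tpm).mean(axis=1))

# e.g. _combine_tpm_replicates(pd.Series([1., 100.]), pd.Series([4., 25.]))
# gives [2.0, 50.0]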
def run(self):
    # make directory if it doesn't exist
    outdir = self.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # load counts
    counts = {}
    for key, target in self.in_counts.items():
        data_table = pd.read_csv(target().path, compression='gzip', index_col=0)
        counts[key] = processing.get_counts_from_counts_table(data_table)
    counts = pd.concat(counts).unstack(level=0)

    # load seqdata
    seqdata = pd.read_table(self.in_seq().path, compression='gzip', index_col=0)

    # load seqdata effects and compute ensemble ddGs
    seqeffect = pd.read_table(self.in_effect().path, compression='gzip', index_col=0)
    #seqeffect.loc[:, 'flag'] = pd.Series({idx: seqmodel.flag_ensemble(row.drop('ddG') - row.ddG)
    #                                      for idx, row in seqeffect.iterrows()})
    noflip_cols = [idx for idx in seqeffect if idx.find('noflip') == 0]
    flip_cols = [idx for idx in seqeffect
                 if idx.find('flip') == 0 or idx.find('doubleflip') == 0]
    seqeffect.loc[:, 'ddG_noflip_noens'] = seqeffect.loc[:, noflip_cols[0]]
    seqeffect.loc[:, 'ddG_noflip'] = seqmodel.compute_ensemble_ddG_set(
        seqeffect.loc[:, noflip_cols], self.temperature)
    seqeffect.loc[:, 'ddG_flip'] = seqmodel.compute_ensemble_ddG_set(
        seqeffect.loc[:, flip_cols], self.temperature)
    keep_cols = [idx for idx in seqeffect if idx.find('ddG') == 0]

    # load bed data
    beddata = processing.load_bed(
        self.in_bed().path,
        additional_cols=variables.motif_fields_additional).set_index('name')

    # load tpm
    expression = pd.read_table(self.in_tpm().path, index_col=0, squeeze=True)

    # combine and normalize clip signal by expression
    out_data = pd.concat(
        [beddata, counts, expression, seqdata, seqeffect.loc[:, keep_cols]], axis=1)
    out_data.loc[:, 'clip_signal_per_tpm'] = (out_data.rep1 + out_data.rep2) / out_data.tpm
    out_data.loc[:, 'clip_input_per_tpm'] = out_data.input / out_data.tpm
    out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
def run(self):
    # make out directory if it doesn't exist
    dirname = os.path.dirname(self.out_bed().path)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    # concatenate the input bed files and sort by position
    combined_data = pd.concat(
        [processing.load_bed(target().path) for target in self.in_beds])
    processing.save_bed(combined_data.sort_values(['chrm', 'start']),
                        self.out_bed().path)
def run(self):
    # make directory if it doesn't exist
    outdir = self.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # load counts, summing over an interval offset from the window center
    interval_radius = 40  # 1/2 width to sum over
    offset = 15
    start_loc = int(self.window_size/2 - interval_radius - offset)
    end_loc = start_loc + interval_radius*2
    counts = {}
    for key, target in self.in_counts.items():
        counts[key] = pd.read_csv(
            target().path, compression='gzip',
            index_col=0).iloc[:, start_loc:end_loc].sum(axis=1)
    counts = pd.concat(counts).unstack(level=0)

    # load seqdata
    seqdata = pd.read_table(self.in_seq().path, compression='gzip', index_col=0)

    # load seqdata effects
    seqeffect = pd.read_table(self.in_effect().path, compression='gzip', index_col=0)
    seqeffect.loc[:, 'flag'] = pd.Series(
        {idx: seqmodel.flag_ensemble(row.drop('ddG') - row.ddG)
         for idx, row in seqeffect.iterrows()})
    # collapse the no-flip ddG columns into a single ensemble ddG
    kT = seqmodel.get_ddG_conversion(self.temperature)
    noflip_cols = [idx for idx in seqeffect if idx.find('noflip') == 0]
    seqeffect.loc[:, 'ddG_noflip'] = kT*np.log(
        np.exp(seqeffect.loc[:, noflip_cols]/kT).sum(axis=1))

    # load bed data
    beddata = processing.load_bed(
        self.in_bed().path,
        additional_cols=variables.motif_fields_additional).set_index('name')

    # load tpm
    expression = pd.read_table(self.in_tpm().path, index_col=0, squeeze=True)

    # load ss energy
    ss_dG_data = {}
    for constraint, target in self.in_secstructure.items():
        ss_dG_data[constraint] = pd.read_table(
            target().path, header=None, index_col=0, squeeze=True,
            names=['motif', 'dG'])
    ss_dG_data = pd.concat(ss_dG_data, names=['constraint']).unstack(level=0)
    # ss_ddG = dG(constrained) - dG(unconstrained)
    ss_dG_diff = (ss_dG_data.loc[:, True] - ss_dG_data.loc[:, False]).rename('ss_ddG')

    # combine and normalize clip signal by expression
    out_data = pd.concat(
        [beddata, counts, expression, seqdata, seqeffect.ddG, seqeffect.flag, ss_dG_diff],
        axis=1)
    out_data.loc[:, 'clip_signal_per_tpm'] = (out_data.rep1 + out_data.rep2) / out_data.tpm
    out_data.loc[:, 'clip_input_per_tpm'] = out_data.input / out_data.tpm
    out_data.to_csv(self.out_table().path, sep='\t', compression='gzip')
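# seqmodel.compute_ensemble_ddG_set (used in the related task above) is not
# shown in this file; a hypothetical sketch consistent with the inline no-flip
# calculation here, where ddG_table holds one ddG column per variant and kT is
# the conversion factor from seqmodel.get_ddG_conversion(temperature). The real
# helper's signature and sign convention may differ:
import numpy as np

def _compute_ensemble_ddG_sketch(ddG_table, kT):
    """Collapse a set of per-variant ddG columns into one ensemble ddG per row."""
    return kT * np.log(np.exp(ddG_table / kT).sum(axis=1))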
def run(self):
    # make out directory if it doesn't exist
    dirname = os.path.dirname(self.out_bed().path)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    # load regions and find cumulative interval sizes
    regions = processing.load_bed(self.in_regions().path)
    regions.loc[:, 'int_size'] = regions.stop - regions.start
    regions.loc[:, 'cum_int_size'] = np.cumsum(regions.int_size)

    # draw n random start sites across the concatenated intervals
    n = self.num_random
    if not pd.isnull(self.seed):
        np.random.seed(self.seed)
    vals = np.sort(np.random.choice(np.arange(regions.int_size.sum()), size=n))
    locations = np.searchsorted(regions.cum_int_size, vals)
    strands = np.random.choice(['+', '-'], size=n)

    # go through each location and build the corresponding interval
    new_regions = []
    for i, (loc, val) in enumerate(zip(locations, vals)):
        region = regions.iloc[loc]
        diff = region.cum_int_size - val
        new_start = region.start + diff
        new_stop = new_start + self.window_size
        new_region = pd.Series({
            'chrm': region.chrm,
            'start': new_start,
            'stop': new_stop,
            'name': '%s_%d' % (region.loc['name'], i),
            'strand': strands[i],
            'score': '.'
        })
        new_regions.append(new_region)
    new_regions = pd.concat(new_regions, axis=1).transpose().loc[:, variables.bed_fields]
    processing.save_bed(new_regions, self.out_bed().path)
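# The sampling above draws start sites uniformly over the concatenated region
# lengths, so longer regions receive proportionally more random sites; the
# searchsorted call maps each draw back to its source region. A minimal
# self-contained sketch of that mapping (illustrative only, not part of the
# pipeline):
import numpy as np

def _sample_region_indices(int_sizes, n, seed=None):
    """Pick n region indices with probability proportional to region length."""
    rng = np.random.RandomState(seed)
    cum_sizes = np.cumsum(int_sizes)
    vals = np.sort(rng.choice(np.arange(cum_sizes[-1]), size=n))
    return np.searchsorted(cum_sizes, vals)

# e.g. _sample_region_indices([100, 900], n=1000) returns ~10% zeros (first
# region) and ~90% ones (second region)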
def run(self):
    annfile = self.in_filt_dat().path
    interfile_basename = os.path.splitext(self.in_filt_dat().path)[0]
    filtfile = interfile_basename + '.filt.bed.tmp'

    # reformat the annotation table into bed fields, splitting the annotation
    # column into annotation, gene, and exon
    process_annotations = (
        'awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
        'if ($NF=="") {gene_type="NA"} else {gene_type=$NF}; '
        'n=index($8, " ("); '
        'if (n>0) {ann=substr($8, 1, n-1); notann=substr($8, n+2, length($8)-n-2);} '
        'else {ann=$8; notann=""}; '
        'm=index(notann, ","); '
        'if (m>0) {gene=substr(notann, 1, m-1); exon=substr(notann, m+2, length(notann));} '
        'else {gene=notann; exon=""}; '
        'print $2, $3-1, $4, $1, $6, $5, ann, gene, exon, gene_type}\'')

    # only include protein coding genes or the NORAD subset
    #apply_filter = ('awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
    #                'if (($NF=="protein-coding" &&($7=="exon" || index($7, "UTR")==4)) || '
    #                '($NF=="ncRNA" && $8=="%s")) print}\'') % self.nc_gene
    if self.filter_genetype:
        apply_filter = (
            'awk \'BEGIN {FS="\\t"}{OFS="\\t"}{'
            'if (($NF=="protein-coding" &&($7=="exon" || index($7, "UTR")==4)) || '
            '($NF=="ncRNA" && $7=="non-coding")) print}\'')
        filter_call_sub = process_annotations + ' | ' + apply_filter
    else:
        filter_call_sub = process_annotations

    # process and apply filter
    filter_call = (
        'tail -n+2 %s | ' + filter_call_sub +
        ' | grep -v chrUn | grep -v random | bedtools sort -i stdin > %s'
    ) % (annfile, filtfile)
    log.info(filter_call)
    subprocess.call(filter_call, shell=True)

    # find the closest transcript and keep only motifs whose assigned gene
    # matches the closest gene on the same strand (or motifs with no gene)
    strand_call = (
        'bedtools closest -d -a %s -b %s | '
        'awk -F "\\t" \'{OFS="\\t"}{'
        'distance=$NF; '
        'genemotif=$8; '
        'geneclosest=$14; '
        'strandmotif=$6; '
        'strandgene=$16; '
        'if ((genemotif==geneclosest && strandmotif==strandgene) || genemotif=="") print}\' | '
        'cut -f 1-10 | awk \'{print}\'> %s'
    ) % (filtfile, self.transcript_bed, self.out_file().path)

    # do calls
    log.info(strand_call)
    subprocess.call(strand_call, shell=True)

    # remove duplicate motif entries, keeping the first per name
    bed_data = processing.load_bed(
        self.out_file().path, additional_cols=variables.motif_fields_additional)
    bed_data = bed_data.groupby('name').first().reset_index(
    ).loc[:, variables.bed_fields + variables.motif_fields_additional]
    bed_data.to_csv(self.out_file().path, index=False, header=False, sep='\t')