def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    '''Build a splice-site quantitation dataframe from a peak dataframe.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Peaks with columns 'type', 'looks like', 'chromosome', 'position',
        'height', 'strand' and 'transcript'.
    gff3 : str
        Path to the gff3 annotation file (passed through to SP helpers).
    fa_dict : dict
        Chromosome name -> genome sequence string.
    organism : str or None
        Organism flag passed through to SP helpers.

    Returns
    -------
    pandas.DataFrame indexed by 'genome coord' with intron size,
    alternative-splicing flag, 5'/3' splice-site scores and sequences,
    after backfilling annotated sites and scoring branches.
    '''
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep peaks that are not annotated 3' sites and do not look like AG
    # (3' splice site) sequences. .copy() avoids SettingWithCopy warnings
    # on the column assignments below.
    quant_df = peak_df[(peak_df['type'] != '3prime') &
                       (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [],
                   '5p score': [], '3p score': [],
                   'seq5': [], 'seq3': []}
    new_index = []
    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Use the tallest peak at this coordinate. .iloc replaces the
            # deprecated .ix (index labels are strings, so the lookup was
            # positional all along).
            coord_df = coord_df.sort_values('height', ascending=False).iloc[0]
            introns = ss_dict[coord_df['transcript']]
            if 'prime' in coord_df['type']:
                # Match the peak to an annotated 5' splice site within
                # -5/+4 nt of the peak position.
                peak_range = range(coord_df['position'] - 5,
                                   coord_df['position'] + 5)
                for intron in introns:
                    if intron[0] in peak_range:
                        five_site = intron[0]
                        three_site = intron[1]
                        break
                # Any AG-type peak in the same transcript implies
                # alternative 3' splice-site usage.
                if len(quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                                (quant_df['type'] == 'AG')]) > 0:
                    alt3 = True
            else:
                # BUGFIX: `'AG' in series` tests membership in the *index*
                # (genome-coord strings), never the values; use .values so
                # the check inspects the peak types as intended.
                if 'AG' in quant_df[quant_df['transcript'] ==
                                    coord_df['transcript']]['type'].values:
                    five_site = coord_df['position']
                    three_df = quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                                        (quant_df['type'] == 'AG')]
                    three_df = three_df.sort_values('height', ascending=False)
                    three_site = three_df.iloc[0]['position']
            if three_site is not None:
                new_index.append(coord)
                # Intron size in kb.
                size = abs(three_site - five_site) / 1000.
                column_dict['intron size'].append(size)
                column_dict['alt splicing'].append(alt3)
                # 8 nt windows around each site; swapped and reverse
                # complemented on the minus strand.
                if coord_df['strand'] == '+':
                    s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                    s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
                elif coord_df['strand'] == '-':
                    s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                    s5 = SP.reverse_complement(s5)
                    s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                    s3 = SP.reverse_complement(s3)
                column_dict['seq5'].append(s5)
                column_dict['seq3'].append(s3)
                scores = SP.simple_score_junction(s5, s3, pssm)
                column_dict['3p score'].append(scores[1])
                column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][
        ['genome coord', 'chromosome', 'strand', 'transcript', 'position', 'type']]
    # .items() works in both Python 2 and 3 (replaces dict.iteritems()).
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)
    # NOTE(review): hard-coded branch file path ties this function to one
    # host and the S288C genome — consider parameterizing.
    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)
    return new_quant_df
def main(): '''Usage: run SP_pipeline.py config_file untagged_sample_name organism config file : file that lists all branch, junction and peak files untagged_sample_name : prefix for untagged sample organism : pombe, crypto or cerevisiae''' junc_beds = [] branch_bams = [] CP_out = [] CP_untagged = None quant_bams = {} # Read configuration file with open(sys.argv[1], 'r') as config: for line in config: if 'junctions.bed' in line.lower(): junc_beds.append(line.strip()) elif 'branch' in line.lower(): branch_bams.append(line.strip()) elif sys.argv[2] in line: CP_untagged = line.strip() elif 'changepoint' in line.lower() or 'peak' in line.lower(): CP_out.append(line.strip()) #bam files for quantitation should be file,quant,A1 elif 'quant' in line: quant_bams[line.split(',')[-1].strip()] = line.split(',')[0] name = sys.argv[1].split('/')[-1].split('_config')[0] base_dir = sys.argv[1].split(name)[0] if base_dir == '': base_dir = './' print "Output file location and prefix: " + base_dir + name print "\nJunction bed files" print junc_beds print "\nBranch bam files" if len(branch_bams) == 2: print branch_bams use_branches = True elif len(branch_bams) == 0: print "No data for branches, continuing with only junctions" use_branches = False print "\nUntagged peaks" print CP_untagged print "\nChangepoint peaks" print CP_out print '' if CP_untagged is None: print "\n Error: no untagged file indicated" return None organism = sys.argv[3] organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism) #### Generate peak df if name + '_peaks_w_branch.csv' not in os.listdir( base_dir) or name + '_peaks_w_junc.csv' not in os.listdir( base_dir): if name + '_all_peaks.pickle' not in os.listdir(base_dir): peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name=name + '_CP_peaks') peak_df.to_pickle(base_dir + name + '_all_peaks.pickle') else: peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle') #### Junction to peak comparison if name + 
'_peaks_w_junc.csv' not in os.listdir(base_dir): print "Generating peaks vs. junctions dataframe..." peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name) else: peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle') print "Peaks vs. junction dataframe already exists" #### Branch to peak comparison if use_branches is True: if name + '_peaks_w_branch.csv' not in os.listdir(base_dir): print "Generating peaks vs. branches dataframe..." peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name) else: peaks_w_branch = pd.read_pickle(base_dir + name + '_peaks_w_branch.pickle') print "Peaks vs. branches dataframe already exists" #### Clean up dataframe for quantitation if name + '_quantitation.csv' not in os.listdir(base_dir): quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict, organism=organism) quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict) print "Counting reads in transcripts and at peaks..." quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism) quant_df.to_pickle(base_dir + name + '_quantitation.pickle') quant_df.to_csv(base_dir + name + '_quantitation.csv') lariat_df.to_pickle(base_dir + name + '_lariats.pickle') lariat_df.to_csv(base_dir + name + '_lariats.csv') scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name) else: quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle') scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name) print "\n****Finished****"
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    '''Build a splice-site quantitation dataframe from a peak dataframe.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Peaks with columns 'type', 'looks like', 'chromosome', 'position',
        'height', 'strand' and 'transcript'.
    gff3 : str
        Path to the gff3 annotation file (passed through to SP helpers).
    fa_dict : dict
        Chromosome name -> genome sequence string.
    organism : str or None
        Organism flag passed through to SP helpers.

    Returns
    -------
    pandas.DataFrame indexed by 'genome coord' with intron size,
    alternative-splicing flag, 5'/3' splice-site scores and sequences,
    after backfilling annotated sites and scoring branches.
    '''
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep peaks that are not annotated 3' sites and do not look like AG
    # (3' splice site) sequences. .copy() avoids SettingWithCopy warnings
    # on the column assignments below.
    quant_df = peak_df[(peak_df['type'] != '3prime') &
                       (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [],
                   '5p score': [], '3p score': [],
                   'seq5': [], 'seq3': []}
    new_index = []
    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Use the tallest peak at this coordinate. .iloc replaces the
            # deprecated .ix (index labels are strings, so the lookup was
            # positional all along).
            coord_df = coord_df.sort_values('height', ascending=False).iloc[0]
            introns = ss_dict[coord_df['transcript']]
            if 'prime' in coord_df['type']:
                # Match the peak to an annotated 5' splice site within
                # -5/+4 nt of the peak position.
                peak_range = range(coord_df['position'] - 5,
                                   coord_df['position'] + 5)
                for intron in introns:
                    if intron[0] in peak_range:
                        five_site = intron[0]
                        three_site = intron[1]
                        break
                # Any AG-type peak in the same transcript implies
                # alternative 3' splice-site usage.
                if len(quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                                (quant_df['type'] == 'AG')]) > 0:
                    alt3 = True
            else:
                # BUGFIX: `'AG' in series` tests membership in the *index*
                # (genome-coord strings), never the values; use .values so
                # the check inspects the peak types as intended.
                if 'AG' in quant_df[quant_df['transcript'] ==
                                    coord_df['transcript']]['type'].values:
                    five_site = coord_df['position']
                    three_df = quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                                        (quant_df['type'] == 'AG')]
                    three_df = three_df.sort_values('height', ascending=False)
                    three_site = three_df.iloc[0]['position']
            if three_site is not None:
                new_index.append(coord)
                # Intron size in kb.
                size = abs(three_site - five_site) / 1000.
                column_dict['intron size'].append(size)
                column_dict['alt splicing'].append(alt3)
                # 8 nt windows around each site; swapped and reverse
                # complemented on the minus strand.
                if coord_df['strand'] == '+':
                    s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                    s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
                elif coord_df['strand'] == '-':
                    s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                    s5 = SP.reverse_complement(s5)
                    s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                    s3 = SP.reverse_complement(s3)
                column_dict['seq5'].append(s5)
                column_dict['seq3'].append(s3)
                scores = SP.simple_score_junction(s5, s3, pssm)
                column_dict['3p score'].append(scores[1])
                column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][
        ['genome coord', 'chromosome', 'strand', 'transcript', 'position', 'type']]
    # .items() works in both Python 2 and 3 (replaces dict.iteritems()).
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)
    # NOTE(review): hard-coded branch file path ties this function to one
    # host and the S288C genome — consider parameterizing.
    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)
    return new_quant_df
def main(): '''Usage: run SP_pipeline.py config_file untagged_sample_name organism config file : file that lists all branch, junction and peak files untagged_sample_name : prefix for untagged sample organism : pombe, crypto or cerevisiae''' junc_beds = [] branch_bams = [] CP_out = [] CP_untagged = None quant_bams = {} # Read configuration file with open(sys.argv[1], 'r') as config: for line in config: if 'junctions.bed' in line.lower(): junc_beds.append(line.strip()) elif 'branch' in line.lower(): branch_bams.append(line.strip()) elif sys.argv[2] in line: CP_untagged = line.strip() elif 'changepoint' in line.lower() or 'peak' in line.lower(): CP_out.append(line.strip()) #bam files for quantitation should be file,quant,A1 elif 'quant' in line: quant_bams[line.split(',')[-1].strip()] = line.split(',')[0] name = sys.argv[1].split('/')[-1].split('_config')[0] base_dir = sys.argv[1].split(name)[0] if base_dir == '': base_dir = './' print "Output file location and prefix: "+base_dir+name print "\nJunction bed files" print junc_beds print "\nBranch bam files" if len(branch_bams) == 2: print branch_bams use_branches = True elif len(branch_bams) == 0: print "No data for branches, continuing with only junctions" use_branches = False print "\nUntagged peaks" print CP_untagged print "\nChangepoint peaks" print CP_out print '' if CP_untagged is None: print "\n Error: no untagged file indicated" return None organism = sys.argv[3] organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism) #### Generate peak df if name+'_peaks_w_branch.csv' not in os.listdir(base_dir) or name+'_peaks_w_junc.csv' not in os.listdir(base_dir): if name+'_all_peaks.pickle' not in os.listdir(base_dir): peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name=name+'_CP_peaks') peak_df.to_pickle(base_dir+name+'_all_peaks.pickle') else: peak_df = pd.read_pickle(base_dir+name+'_all_peaks.pickle') #### Junction to peak comparison if name+'_peaks_w_junc.csv' not 
in os.listdir(base_dir): print "Generating peaks vs. junctions dataframe..." peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name) else: peaks_w_junc = pd.read_pickle(base_dir+name+'_peaks_w_junc.pickle') print "Peaks vs. junction dataframe already exists" #### Branch to peak comparison if use_branches is True: if name+'_peaks_w_branch.csv' not in os.listdir(base_dir): print "Generating peaks vs. branches dataframe..." peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name) else: peaks_w_branch = pd.read_pickle(base_dir+name+'_peaks_w_branch.pickle') print "Peaks vs. branches dataframe already exists" #### Clean up dataframe for quantitation if name+'_quantitation.csv' not in os.listdir(base_dir): quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict, organism=organism) quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict) print "Counting reads in transcripts and at peaks..." quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism) quant_df.to_pickle(base_dir+name+'_quantitation.pickle') quant_df.to_csv(base_dir+name+'_quantitation.csv') lariat_df.to_pickle(base_dir+name+'_lariats.pickle') lariat_df.to_csv(base_dir+name+'_lariats.csv') scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name) else: quant_df = pd.read_pickle(base_dir+name+'_quantitation.pickle') scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name) print "\n****Finished****"