def peaks_only(config_file, untagged, organism):
    '''Call peaks from changepoint output and quantitate them, without any
    junction or branch comparison.

    Parameters
    ----------
    config_file : str
        Path to the configuration file. The part of the file name before
        '_config' becomes the output prefix and the directory holding the
        file becomes the output location. Lines are classified by substring,
        in this order: a line containing `untagged` is the untagged peak
        file; a line containing 'changepoint' or 'peak' (case-insensitive)
        is a changepoint output file; a line containing 'quant' describes a
        bam file for quantitation, written as file,quant,A1.
    untagged : str
        Text identifying the untagged sample line in the config file.
    organism : str
        Organism name handed to SP.find_organism_files.

    Output
    ------
    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv beside the config file, then calls
    SP.SP_pipeline_scatters on the quantitation dataframe.

    Raises
    ------
    ValueError
        If the config file has no untagged line or fewer than two
        changepoint/peak lines.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}

    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            elif 'quant' in line:
                # bam files for quantitation should be file,quant,A1
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the LAST '/' so a directory whose name contains the prefix
    # (e.g. exp/exp_config.txt) does not truncate the output location.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'

    print("Output file location and prefix: " + base_dir + name)

    # Fail with a clear message before any expensive work starts.
    if CP_untagged is None:
        raise ValueError("no untagged file indicated in " + config_file)
    if len(CP_out) < 2:
        raise ValueError("expected two changepoint/peak files in " +
                         config_file + ", found " + str(len(CP_out)))

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict,
                                     organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)
    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def peaks_only(config_file, untagged, organism):
    '''Call peaks from changepoint output and quantitate them, without any
    junction or branch comparison.

    Parameters
    ----------
    config_file : str
        Path to the configuration file. The part of the file name before
        '_config' becomes the output prefix and the directory holding the
        file becomes the output location. Lines are classified by substring,
        in this order: a line containing `untagged` is the untagged peak
        file; a line containing 'changepoint' or 'peak' (case-insensitive)
        is a changepoint output file; a line containing 'quant' describes a
        bam file for quantitation, written as file,quant,A1.
    untagged : str
        Text identifying the untagged sample line in the config file.
    organism : str
        Organism name handed to SP.find_organism_files.

    Output
    ------
    Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
    <prefix>_quantitation.csv beside the config file, then calls
    SP.SP_pipeline_scatters on the quantitation dataframe.

    Raises
    ------
    ValueError
        If the config file has no untagged line or fewer than two
        changepoint/peak lines.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}

    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            elif 'quant' in line:
                # bam files for quantitation should be file,quant,A1
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the LAST '/' so a directory whose name contains the prefix
    # (e.g. exp/exp_config.txt) does not truncate the output location.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'

    print("Output file location and prefix: " + base_dir + name)

    # Fail with a clear message before any expensive work starts.
    if CP_untagged is None:
        raise ValueError("no untagged file indicated in " + config_file)
    if len(CP_out) < 2:
        raise ValueError("expected two changepoint/peak files in " +
                         config_file + ", found " + str(len(CP_out)))

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict,
                                     organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)
    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism

    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Config lines are classified by substring, in this order:
    'junctions.bed' (junction bed file), 'branch' (branch bam file), the
    untagged sample name (untagged peak file), 'changepoint' or 'peak'
    (changepoint output), 'quant' (bam for quantitation, file,quant,A1).

    Outputs are written beside the config file, prefixed with the part of
    its file name before '_config'. A step is skipped when its output file
    is already listed in that directory. Returns None; problems with the
    arguments or the config file are reported on stdout.
    '''
    if len(sys.argv) < 4:
        print(main.__doc__)
        return None

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            elif 'quant' in line:
                # bam files for quantitation should be file,quant,A1
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the LAST '/' so a directory whose name contains the prefix
    # (e.g. exp/exp_config.txt) does not truncate the output location.
    directory, sep, file_name = sys.argv[1].rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'

    print("Output file location and prefix: " + base_dir + name)

    print("\nJunction bed files")
    print(junc_beds)

    print("\nBranch bam files")
    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count used to leave use_branches unset and crash with a
        # NameError only after the peak and junction steps had already run.
        print(branch_bams)
        print("\n Error: expected 0 or 2 branch bam files, found " +
              str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def output_exists(suffix):
        # Re-list the directory on every check: earlier steps create files.
        return name + suffix in os.listdir(base_dir)

    #### Generate peak df
    if (not output_exists('_peaks_w_branch.csv')
            or not output_exists('_peaks_w_junc.csv')):
        if not output_exists('_all_peaks.pickle'):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required")
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0],
                                              CP_out[1], gff3, fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not output_exists('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)
    else:
        peaks_w_junc = pd.read_pickle(base_dir + name +
                                      '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # Defined even without branch data so the quantitation step below does
    # not hit a NameError.
    # NOTE(review): confirm SP.make_quant_df and SP.find_score_branches_ppy
    # accept None for the branch dataframe.
    peaks_w_branch = None
    if use_branches:
        if not output_exists('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name +
                                            '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not output_exists('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch,
                                               gff3, fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)

        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                             organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism

    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Config lines are classified by substring, in this order:
    'junctions.bed' (junction bed file), 'branch' (branch bam file), the
    untagged sample name (untagged peak file), 'changepoint' or 'peak'
    (changepoint output), 'quant' (bam for quantitation, file,quant,A1).

    Outputs are written beside the config file, prefixed with the part of
    its file name before '_config'. A step is skipped when its output file
    is already listed in that directory. Returns None; problems with the
    arguments or the config file are reported on stdout.
    '''
    if len(sys.argv) < 4:
        print(main.__doc__)
        return None

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            elif 'quant' in line:
                # bam files for quantitation should be file,quant,A1
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Split on the LAST '/' so a directory whose name contains the prefix
    # (e.g. exp/exp_config.txt) does not truncate the output location.
    directory, sep, file_name = sys.argv[1].rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = (directory + sep) or './'

    print("Output file location and prefix: " + base_dir + name)

    print("\nJunction bed files")
    print(junc_beds)

    print("\nBranch bam files")
    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count used to leave use_branches unset and crash with a
        # NameError only after the peak and junction steps had already run.
        print(branch_bams)
        print("\n Error: expected 0 or 2 branch bam files, found " +
              str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def output_exists(suffix):
        # Re-list the directory on every check: earlier steps create files.
        return name + suffix in os.listdir(base_dir)

    #### Generate peak df
    if (not output_exists('_peaks_w_branch.csv')
            or not output_exists('_peaks_w_junc.csv')):
        if not output_exists('_all_peaks.pickle'):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required")
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0],
                                              CP_out[1], gff3, fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not output_exists('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)
    else:
        peaks_w_junc = pd.read_pickle(base_dir + name +
                                      '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # Defined even without branch data so the quantitation step below does
    # not hit a NameError.
    # NOTE(review): confirm SP.make_quant_df and SP.find_score_branches_ppy
    # accept None for the branch dataframe.
    peaks_w_branch = None
    if use_branches:
        if not output_exists('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name +
                                            '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not output_exists('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch,
                                               gff3, fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)

        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                             organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
def _align_sort_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with bowtie, then convert the sam output to
    bam, sort it and index it (<prefix>_sorted.bam).'''
    print("Aligning split reads to the genome with Bowtie")
    bowtie_args = ('bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa '
                   '--sam {2}.sam').format(threads, bowtie_index, prefix)
    call(bowtie_args, shell=True)

    # sort and index
    print("Sorting and indexing bam file")
    call('samtools view -Sbo {0}.bam {0}.sam'.format(prefix), shell=True)
    call('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix),
         shell=True)
    call('samtools index {0}_sorted.bam'.format(prefix), shell=True)


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file
                  names and where to find them
    untagged : untagged sample name (must be in file name)

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.
    Returns None; problems with the arguments or the config file are
    reported on stdout.
    '''
    if len(sys.argv) < 5:
        print(main.__doc__)
        return None

    threads = int(sys.argv[3])

    fastqs = []
    for reads in (sys.argv[1], sys.argv[2]):
        if reads.endswith('bam'):
            # Keep the text BEFORE '.bam'; taking the text after it gave
            # '.fq' for every input, so both files overwrote each other.
            fastq = reads.rsplit('.bam', 1)[0] + '.fq'
            # With shell=False the command must be an argument list; a
            # single string is treated as the executable name.
            call(['bamToFastq', '-i', reads, '-fq', fastq])
            reads = fastq
        fastqs.append(reads)

    # Argument list plus explicit stdout, so input paths containing spaces
    # or shell metacharacters are passed through unchanged.
    with open('unmapped_all.fq', 'w') as merged:
        call(['cat', fastqs[0], fastqs[1]], stdout=merged)

    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peaks = False
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())

        if CP_untagged is None:
            print("\n Error: no untagged file indicated")
            return None
        if len(CP_out) < 2:
            print("\n Error: two changepoint/peak files are required")
            return None

        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                          gff3, fa_dict, name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches',
                     threads=threads)
    _align_sort_index('Ann_branches', threads, bowtie_index)

    if peaks:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs,
                         'Peak_branches', threads=threads)
        _align_sort_index('Peak_branches', threads, bowtie_index)
def _align_sort_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with bowtie, then convert the sam output to
    bam, sort it and index it (<prefix>_sorted.bam).'''
    print("Aligning split reads to the genome with Bowtie")
    bowtie_args = ('bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa '
                   '--sam {2}.sam').format(threads, bowtie_index, prefix)
    call(bowtie_args, shell=True)

    # sort and index
    print("Sorting and indexing bam file")
    call('samtools view -Sbo {0}.bam {0}.sam'.format(prefix), shell=True)
    call('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix),
         shell=True)
    call('samtools index {0}_sorted.bam'.format(prefix), shell=True)


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file
                  names and where to find them
    untagged : untagged sample name (must be in file name)

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.
    Returns None; problems with the arguments or the config file are
    reported on stdout.
    '''
    if len(sys.argv) < 5:
        print(main.__doc__)
        return None

    threads = int(sys.argv[3])

    fastqs = []
    for reads in (sys.argv[1], sys.argv[2]):
        if reads.endswith('bam'):
            # Keep the text BEFORE '.bam'; taking the text after it gave
            # '.fq' for every input, so both files overwrote each other.
            fastq = reads.rsplit('.bam', 1)[0] + '.fq'
            # With shell=False the command must be an argument list; a
            # single string is treated as the executable name.
            call(['bamToFastq', '-i', reads, '-fq', fastq])
            reads = fastq
        fastqs.append(reads)

    # Argument list plus explicit stdout, so input paths containing spaces
    # or shell metacharacters are passed through unchanged.
    with open('unmapped_all.fq', 'w') as merged:
        call(['cat', fastqs[0], fastqs[1]], stdout=merged)

    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peaks = False
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())

        if CP_untagged is None:
            print("\n Error: no untagged file indicated")
            return None
        if len(CP_out) < 2:
            print("\n Error: two changepoint/peak files are required")
            return None

        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                          gff3, fa_dict, name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches',
                     threads=threads)
    _align_sort_index('Ann_branches', threads, bowtie_index)

    if peaks:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs,
                         'Peak_branches', threads=threads)
        _align_sort_index('Peak_branches', threads, bowtie_index)