def write_intron_fasta(transcript_dict, fasta_dict, prefix='introns', sense=True):
    """Collect the sequence between consecutive CDS segments and write FASTA.

    Args:
        transcript_dict: mapping of transcript name to a sequence whose
            items 2-5 are read as strand, chromosome, CDS start list and
            CDS end list.
        fasta_dict: mapping of chromosome name to a sliceable sequence.
        prefix: the records are written to '<prefix>.fa'.
        sense: when exactly False, each sequence is passed through
            SP.reverse_complement one more time before being stored.

    Returns:
        dict mapping '<transcript>_<n>' to the extracted sequence, where n
        numbers the gaps of a transcript starting at 0.
    """
    intron_seqs = {}
    for tx_name, info in transcript_dict.iteritems():
        strand, chrom = info[2], info[3]
        cds_starts, cds_ends = info[4], info[5]
        n_cds = len(cds_starts)
        for n in range(n_cds - 1):
            if strand == '+':
                seq = fasta_dict[chrom][cds_ends[n]:cds_starts[n + 1] - 1]
            elif strand == '-':
                # Gaps are taken starting from the far end of the CDS lists.
                # NOTE(review): presumably so that n counts introns in
                # transcript order - confirm the ordering of the CDS lists.
                ix = n_cds - n - 1
                seq = SP.reverse_complement(
                    fasta_dict[chrom][cds_ends[ix]:cds_starts[ix - 1] - 1])
            if sense is False:
                seq = SP.reverse_complement(seq)
            intron_seqs[tx_name + '_' + str(n)] = seq

    with open('{}.fa'.format(prefix), 'w') as fout:
        for header, seq in intron_seqs.iteritems():
            fout.write('>' + header + '\n' + seq + '\n')
    return intron_seqs
def add_seq(branch_df, fa_dict):
    """Attach 8-nt 5' splice site and branch sequences to branch_df.

    For every row a 16-nt window (site - 8 up to site + 8) is sliced from
    fa_dict[chromosome] for the '5p splice site' and the 'branch site'
    columns; both windows are reverse-complemented for '-' strand rows.
    Each window is then re-centred on a motif located inside window[4:11]:

    * 5' site: 'GT'; the result runs from 2 nt before the motif start to
      6 nt after it.  Without a match window[4:12] is used.
    * branch: the first of 'AG', 'AA', 'GA' that is present; the result
      runs from 4 nt before the motif start to 4 nt after it.  Without a
      match window[4:13] is used.

    The motif search is bounded to window[4:11].  An unbounded search can
    return an occurrence that lies before that region, which produces a
    wrong or empty slice for a row that does contain a usable motif.

    Args:
        branch_df: DataFrame with 'chromosome', '5p splice site',
            'branch site' and 'strand' columns.  The '5p seq' and
            'Branch seq' columns are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.

    Returns:
        The rows of branch_df whose 'Branch seq' has 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        chrom_seq = fa_dict[r['chromosome']]
        five = chrom_seq[r['5p splice site'] - 8:r['5p splice site'] + 8]
        branch = chrom_seq[r['branch site'] - 8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # Only accept a GT that lies completely inside five[4:11].
        gt_at = five.find('GT', 4, 11)
        if gt_at != -1:
            five = five[gt_at - 2:gt_at + 6]
        else:
            five = five[4:12]

        # Motifs are tried in priority order; the first one present wins.
        motif_at = -1
        for motif in ('AG', 'AA', 'GA'):
            motif_at = branch.find(motif, 4, 11)
            if motif_at != -1:
                break
        if motif_at != -1:
            branch = branch[motif_at - 4:motif_at + 4]
        else:
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs
    receptors = ['AG', 'AA', 'GA']
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect an 8-nt window around both ends of every listed intron.

    Introns come from SP.list_splice_sites followed by
    SP.collapse_ss_dict.  Strand and chromosome are looked up in the
    transcript dict under '<transcript>.1' when organism == 'pombe' and
    '<transcript>T0' otherwise.  '-' strand windows are
    reverse-complemented.

    Args:
        gff3: annotation passed through to the SP helpers.
        fasta_dict: mapping of chromosome name to sliceable sequence.
        organism: organism name passed through to the SP helpers.

    Returns:
        (all_seq5, all_seq3): two lists with one window per intron, taken
        around intron[0] and intron[1] respectively.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    splice_sites, _ = SP.list_splice_sites(gff3, organism=organism)
    introns_by_tx = SP.collapse_ss_dict(splice_sites)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5, all_seq3 = [], []
    for tx, introns in introns_by_tx.iteritems():
        tx_info = transcript_dict[tx + suffix]
        strand, chrom = tx_info[2], tx_info[3]
        for intron in introns:
            site5, site3 = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(site5 - 1):(site5 + 7)]
                seq3 = fasta_dict[chrom][(site3 - 5):(site3 + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(site5 - 6):(site5 + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(site3 - 2):(site3 + 6)])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def add_intron_size(peaks_df, gff3, organism=None):
    """Add an 'intron size' column and report introns without any peak.

    Rows whose 'type' is not 'intronic', and intronic rows whose 'position'
    falls inside none of the transcript's introns, get NaN.  Otherwise the
    size is the absolute difference between the two coordinates of the
    first intron pair that contains the position (bounds inclusive).

    The table of introns without peaks is an independent copy of the
    splice-site table.  Sharing one object between the two would delete a
    matched intron from the lookup table as well, so a later peak in the
    same intron could no longer be assigned a size.

    Args:
        peaks_df: DataFrame with 'type', 'transcript' and 'position'
            columns; the 'intron size' column is added in place.
        gff3: annotation passed through to SP.list_splice_sites.
        organism: organism name passed through to SP.list_splice_sites.

    Returns:
        (peaks_df, no_peaks) where no_peaks maps each transcript to the
        intron pairs that contained no peak.

    Raises:
        KeyError: if an intronic row names a transcript that is missing
            from the collapsed splice-site table.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    no_peaks = copy.deepcopy(ss_dict)

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        size = np.nan
        if row['type'] == 'intronic':
            tx = row['transcript']
            position = row['position']
            for pair in ss_dict[tx]:
                # Pairs may be stored in either coordinate order.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    # Already gone when an earlier peak hit this intron.
                    if pair in no_peaks[tx]:
                        no_peaks[tx].remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
def add_intron_size(peaks_df, gff3, organism=None):
    """Add an 'intron size' column and report introns without any peak.

    Rows whose 'type' is not 'intronic', and intronic rows whose 'position'
    falls inside none of the transcript's introns, get NaN.  Otherwise the
    size is the absolute difference between the two coordinates of the
    first intron pair that contains the position (bounds inclusive).

    The table of introns without peaks is an independent copy of the
    splice-site table.  Sharing one object between the two would delete a
    matched intron from the lookup table as well, so a later peak in the
    same intron could no longer be assigned a size.

    Args:
        peaks_df: DataFrame with 'type', 'transcript' and 'position'
            columns; the 'intron size' column is added in place.
        gff3: annotation passed through to SP.list_splice_sites.
        organism: organism name passed through to SP.list_splice_sites.

    Returns:
        (peaks_df, no_peaks) where no_peaks maps each transcript to the
        intron pairs that contained no peak.

    Raises:
        KeyError: if an intronic row names a transcript that is missing
            from the collapsed splice-site table.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    no_peaks = copy.deepcopy(ss_dict)

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        size = np.nan
        if row['type'] == 'intronic':
            tx = row['transcript']
            position = row['position']
            for pair in ss_dict[tx]:
                # Pairs may be stored in either coordinate order.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    # Already gone when an earlier peak hit this intron.
                    if pair in no_peaks[tx]:
                        no_peaks[tx].remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
def add_seq(branch_df, fa_dict):
    """Attach 8-nt 5' splice site and branch sequences to branch_df.

    For every row a 16-nt window (site - 8 up to site + 8) is sliced from
    fa_dict[chromosome] for the '5p splice site' and the 'branch site'
    columns; both windows are reverse-complemented for '-' strand rows.
    Each window is then re-centred on a motif located inside window[4:11]:

    * 5' site: 'GT'; the result runs from 2 nt before the motif start to
      6 nt after it.  Without a match window[4:12] is used.
    * branch: the first of 'AG', 'AA', 'GA' that is present; the result
      runs from 4 nt before the motif start to 4 nt after it.  Without a
      match window[4:13] is used.

    The motif search is bounded to window[4:11].  An unbounded search can
    return an occurrence that lies before that region, which produces a
    wrong or empty slice for a row that does contain a usable motif.

    Args:
        branch_df: DataFrame with 'chromosome', '5p splice site',
            'branch site' and 'strand' columns.  The '5p seq' and
            'Branch seq' columns are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.

    Returns:
        The rows of branch_df whose 'Branch seq' has 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        chrom_seq = fa_dict[r['chromosome']]
        five = chrom_seq[r['5p splice site'] - 8:r['5p splice site'] + 8]
        branch = chrom_seq[r['branch site'] - 8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # Only accept a GT that lies completely inside five[4:11].
        gt_at = five.find('GT', 4, 11)
        if gt_at != -1:
            five = five[gt_at - 2:gt_at + 6]
        else:
            five = five[4:12]

        # Motifs are tried in priority order; the first one present wins.
        motif_at = -1
        for motif in ('AG', 'AA', 'GA'):
            motif_at = branch.find(motif, 4, 11)
            if motif_at != -1:
                break
        if motif_at != -1:
            branch = branch[motif_at - 4:motif_at + 4]
        else:
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs
    receptors = ['AG', 'AA', 'GA']
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
def peaks_only(config_file, untagged, organism):
    """Run peak calling and quantitation from a config file.

    The config file is read line by line.  A line containing `untagged` is
    taken as the untagged peak file (the last such line wins); other lines
    containing 'changepoint' or 'peak' (case-insensitive) are collected in
    order; remaining lines containing 'quant' are split on commas and stored
    as {last field: first field}.

    The output prefix is the config file name up to '_config', placed in
    the directory part of config_file ('./' when there is none).  The peak
    DataFrame and the quantitation DataFrame are written as
    '<prefix>_all_peaks.pickle', '<prefix>_quantitation.pickle' and
    '<prefix>_quantitation.csv'.

    Args:
        config_file: path of the config file.
        untagged: substring identifying the untagged sample line.
        organism: organism name passed to SP.find_organism_files.

    Raises:
        IndexError: if fewer than two changepoint/peak lines are present.
    """
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_file.split('/')[-1].split('_config')[0]
    base_dir = config_file.split(name)[0] or './'
    out_prefix = base_dir + name
    print("Output file location and prefix: " + base_dir + name)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(out_prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)
    quant_df.to_pickle(out_prefix + '_quantitation.pickle')
    quant_df.to_csv(out_prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def find_3p_site(branch_df, gff3, organism=None):
    """Add 3' splice site, intron size and branch-to-3' distance columns.

    For each row the introns of the transcript are looked up under the
    transcript name minus its last two characters (presumably an isoform
    suffix - confirm).  The first intron whose first coordinate is within
    one base of the row's '5p splice site' supplies the 3' coordinate
    (intron[1]); rows without such an intron get NaN.

    Args:
        branch_df: DataFrame with 'transcript', '5p splice site' and
            'branch site' columns; new columns are added in place.
        gff3: annotation passed through to SP.list_splice_sites.
        organism: organism name passed through to SP.list_splice_sites.

    Returns:
        branch_df with '3p splice site', 'intron size' and
        'Branch to 3p distance' (both distances absolute) added.
    """
    splice_sites, _ = SP.list_splice_sites(gff3, organism=organism)
    introns_by_tx = SP.collapse_ss_dict(splice_sites)

    three_coord = []
    for _, r in branch_df.iterrows():
        for intron in introns_by_tx[r['transcript'][:-2]]:
            if r['5p splice site'] in range(intron[0] - 1, intron[0] + 2):
                three_coord.append(intron[1])
                break
        else:
            # No intron starts next to this 5' site.
            three_coord.append(np.nan)

    branch_df['3p splice site'] = three_coord
    three_prime = branch_df['3p splice site']
    branch_df['intron size'] = (
        branch_df['5p splice site'] - three_prime).apply(abs)
    branch_df['Branch to 3p distance'] = (
        branch_df['branch site'] - three_prime).apply(abs)
    return branch_df
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect an 8-nt window around both ends of every listed intron.

    Introns come from SP.list_splice_sites followed by
    SP.collapse_ss_dict.  Strand and chromosome are looked up in the
    transcript dict under '<transcript>.1' when organism == 'pombe' and
    '<transcript>T0' otherwise.  '-' strand windows are
    reverse-complemented.

    Args:
        gff3: annotation passed through to the SP helpers.
        fasta_dict: mapping of chromosome name to sliceable sequence.
        organism: organism name passed through to the SP helpers.

    Returns:
        (all_seq5, all_seq3): two lists with one window per intron, taken
        around intron[0] and intron[1] respectively.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    splice_sites, _ = SP.list_splice_sites(gff3, organism=organism)
    introns_by_tx = SP.collapse_ss_dict(splice_sites)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5, all_seq3 = [], []
    for tx, introns in introns_by_tx.iteritems():
        tx_info = transcript_dict[tx + suffix]
        strand, chrom = tx_info[2], tx_info[3]
        for intron in introns:
            site5, site3 = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(site5 - 1):(site5 + 7)]
                seq3 = fasta_dict[chrom][(site3 - 5):(site3 + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(site5 - 6):(site5 + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(site3 - 2):(site3 + 6)])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name):
    """Match peaks to exon-exon junctions and save the result.

    Junction tables are built from junc_beds[0] and junc_beds[1], combined,
    compared against peak_df and scored (all via SP helpers).  The result
    is indexed by 'genome coord', sorted so rows whose 'junction type' is
    'Annotated' come first, given an 'intron tuple' column and written to
    '<base_dir><name>_peaks_w_junc.csv' and '.pickle'.

    Args:
        peak_df: peak DataFrame handed to SP.compare_peak_junc_df.
        junc_beds: sequence with at least two junction BED paths.
        gff3, fa_dict, organism: passed through to the SP helpers.
        base_dir, name: concatenated to form the output file prefix.

    Returns:
        The peaks-with-junctions DataFrame.
    """
    # Load in junctions
    first_rep = SP.build_junction_df(junc_beds[0], gff3, fa_dict,
                                     organism=organism)
    second_rep = SP.build_junction_df(junc_beds[1], gff3, fa_dict,
                                      organism=organism)
    junc_df = SP.combine_junctions(first_rep, second_rep)

    # Compare peaks and junctions
    matched = SP.compare_peak_junc_df(peak_df, junc_df, organism=organism)
    matched = SP.score_peaks(matched, gff3, fa_dict)

    # Reformat dataframe - add index, sort so that the annotated intron is
    # first in each cluster
    matched.index = matched['genome coord']
    is_annotated = matched['junction type'] == 'Annotated'
    matched['type index'] = np.where(is_annotated, 0, 1)
    matched = matched.sort_values('type index')
    # NOTE(review): the grouped result is not assigned, so this call does
    # not change `matched`.  Confirm whether de-duplicating per genome
    # coordinate was intended before changing it.
    matched.groupby(matched.index).first()
    matched = matched.drop(['index', 'type index'], axis=1)
    matched['intron tuple'] = list(zip(
        matched['transcript'].tolist(),
        matched['annotated intron size'].tolist()))

    print("\nPeaks with corresponding exon-exon junctions:")
    print(len(matched))
    not_prime = matched[~matched['type'].str.contains('prime')]
    print(str(len(set(not_prime['genome coord']))) + " unpredicted")

    out_prefix = base_dir + name
    matched.to_csv(out_prefix + '_peaks_w_junc.csv')
    matched.to_pickle(out_prefix + '_peaks_w_junc.pickle')
    return matched
def peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name):
    """Match peaks to branch points and save the result.

    Branch points are listed from branch_bams[0] (and branch_bams[1] when
    exactly two files are given) and turned into a branch DataFrame via SP
    helpers.  With two files, the '.bed' files named after each BAM (the
    part before '_sorted.bam') are concatenated into
    '<name>_all_branches.bed' in the working directory and then deleted.

    The concatenation is done in-process instead of through a shell
    command, so file names containing spaces or shell metacharacters are
    handled correctly.

    Rows of the branch table whose 'genome coord' occurs in peak_df are
    kept, annotated with the peak 'type', indexed by 'branch coord' and
    written to '<base_dir><name>_peaks_w_branch.csv' and '.pickle'.

    Args:
        peak_df: DataFrame with 'type' and 'genome coord' columns.
        branch_bams: sequence of one or two BAM paths.
        gff3, fa_dict, organism: passed through to the SP helpers.
        base_dir, name: concatenated to form the output file prefix.

    Returns:
        The peaks-with-branches DataFrame.
    """
    import shutil

    branches = SP.list_branch_points(branch_bams[0], gff3, fa_dict,
                                     organism=organism)
    branch_df = SP.create_branch_df(branches, gff3, fa_dict, organism=organism)
    if len(branch_bams) == 2:
        branches2 = SP.list_branch_points(branch_bams[1], gff3, fa_dict,
                                          organism=organism)
        branch_df2 = SP.create_branch_df(branches2, gff3, fa_dict,
                                         organism=organism)
        branch_df = branch_df.append(branch_df2)

        bed1 = branch_bams[0].split('_sorted.bam')[0] + '.bed'
        bed2 = branch_bams[1].split('_sorted.bam')[0] + '.bed'
        with open('{0}_all_branches.bed'.format(name), 'wb') as merged:
            for bed in (bed1, bed2):
                with open(bed, 'rb') as single:
                    shutil.copyfileobj(single, merged)
        os.remove(bed1)
        os.remove(bed2)

    # Compare peaks and branches
    peaks_w_branch = branch_df[
        branch_df['genome coord'].isin(peak_df['genome coord'])]
    peaks_w_branch = peaks_w_branch.merge(peak_df[['type', 'genome coord']],
                                          right_on='genome coord',
                                          left_on='genome coord',
                                          how='left')
    peaks_w_branch.index = peaks_w_branch['branch coord']

    print("\nPeaks with corresponding branches:")
    print(len(peaks_w_branch))
    print(str(len(set(peaks_w_branch['genome coord']))) + " unpredicted")

    peaks_w_branch.to_csv(base_dir + name + '_peaks_w_branch.csv')
    peaks_w_branch.to_pickle(base_dir + name + '_peaks_w_branch.pickle')
    return peaks_w_branch
def peak_seq_enrichment(df, organism):
    """Plot dinucleotide enrichment next to unpredicted peaks.

    Rows of df with 'type' equal to 'other' or 'intronic' are used.  For
    each of the 16 dinucleotides the number of rows whose 'sequence' has
    it at positions 6-7 (plotted as 'After peak') and at positions 4-5
    (plotted as 'Before peak') is compared with the product of the
    single-nucleotide frequencies returned by SP.gc_content, using a
    one-sided normal test.  The bars show log((1 - p) / p).

    Args:
        df: DataFrame with 'type' and 'sequence' columns.
        organism: organism name passed to SP.find_organism_files.

    Returns:
        The matplotlib figure.
    """
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    background = {'A': nuc_prob[0], 'T': nuc_prob[2],
                  'C': nuc_prob[1], 'G': nuc_prob[3]}

    unpeaks = df[df['type'] == 'other'].append(df[df['type'] == 'intronic'])
    print("Number of unpredicted peaks:")
    print(len(unpeaks))
    n_unpeaks = len(unpeaks)

    nucs = ['G', 'A', 'C', 'T']
    dinucs = set()
    for first in nucs:
        for second in nucs:
            dinucs.add(first + second)

    after_peak = unpeaks['sequence'].str[6:8]
    before_peak = unpeaks['sequence'].str[4:6]
    five = {}
    three = {}
    for dinuc in dinucs:
        five[dinuc] = len(unpeaks[after_peak.str.contains(dinuc)])
        three[dinuc] = len(unpeaks[before_peak.str.contains(dinuc)])

    def log_odds(count, expected):
        # Observed proportion against the expected one, normal approximation.
        phat = count / float(n_unpeaks)
        std_err = np.sqrt(phat * (1 - phat) / n_unpeaks)
        z_score = (phat - expected) / std_err
        pvalue = stats.norm.sf(z_score)
        return np.log((1 - pvalue) / pvalue)

    five_LO = {}
    three_LO = {}
    for dinuc in five.keys():
        expected = background[dinuc[0]] * background[dinuc[1]]
        five_LO[dinuc] = log_odds(five[dinuc], expected)
        three_LO[dinuc] = log_odds(three[dinuc], expected)

    fig, ax = plt.subplots(figsize=(12, 6))
    width = 0.35
    ind = np.arange(len(five_LO.keys()))
    ax.bar(ind, three_LO.values(), width, color='crimson',
           edgecolor='crimson', label='Before peak')
    ax.bar(ind + width, five_LO.values(), width, color='indigo',
           edgecolor='indigo', label='After peak')

    x_span = [-1, 17]
    ax.plot(x_span, [0, 0], '-', color='black')
    ax.plot(x_span, [2.94, 2.94], '--', color='0.7', label='95% CI')
    ax.plot(x_span, [-2.94, -2.94], '--', color='0.7')
    ax.set_xlim([-1, 17])
    ax.set_xticklabels(five_LO.keys(), fontsize=12)
    ax.set_xticks(ind + width / 2)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)
    return fig
def write_intergenic_fasta(transcript_dict, fasta_dict, bps_us=0, bps_ds=0, all_intergenic=True, prefix='intergenic_transcripts'):
    """Write sequences flanking or lying between transcripts to FASTA.

    Two modes:

    * all_intergenic is False: for every transcript, the bps_us bases
      upstream and/or bps_ds bases downstream (relative to its strand) are
      stored in sense and antisense orientation under
      '<transcript>_us_sense', '_us_antisense', '_ds_sense' and
      '_ds_antisense'.  A flank that would start before position 0 is
      truncated at 0 instead of wrapping around to the chromosome end.
    * all_intergenic is True: per chromosome in fasta_dict, transcripts are
      sorted by start and the gap between each neighbouring pair is stored
      as '<tx>_<next>_plus' and its reverse complement as '_minus'.
      Neighbours without a gap are reported on stdout and skipped.
      Chromosomes without any transcript are skipped.

    Args:
        transcript_dict: mapping of transcript name to a sequence whose
            first four items are start, end, strand and chromosome.
        fasta_dict: mapping of chromosome name to sliceable sequence.
        bps_us, bps_ds: flank lengths (only used when all_intergenic is
            False; a value of 0 disables that flank).
        all_intergenic: mode switch, compared by identity with True/False.
        prefix: the records are written to '<prefix>.fa'.

    Returns:
        dict mapping record name to sequence.
    """
    seq_dict = {}
    if all_intergenic is False:
        for transcript, values in transcript_dict.items():
            start, end = values[0], values[1]
            strand, chrom = values[2], values[3]

            if bps_us > 0:
                if strand == '+':
                    seq_us_sense = fasta_dict[chrom][max(start - bps_us, 0):start]
                elif strand == '-':
                    seq_us_sense = SP.reverse_complement(
                        fasta_dict[chrom][end:end + bps_us])
                seq_us_antisense = SP.reverse_complement(seq_us_sense)
                seq_dict[transcript + '_us_sense'] = seq_us_sense
                seq_dict[transcript + '_us_antisense'] = seq_us_antisense

            if bps_ds > 0:
                if strand == '+':
                    seq_ds_sense = fasta_dict[chrom][end:bps_ds + end]
                elif strand == '-':
                    seq_ds_sense = SP.reverse_complement(
                        fasta_dict[chrom][max(start - bps_ds, 0):start])
                seq_ds_antisense = SP.reverse_complement(seq_ds_sense)
                seq_dict[transcript + '_ds_sense'] = seq_ds_sense
                seq_dict[transcript + '_ds_antisense'] = seq_ds_antisense

    elif all_intergenic is True:
        for chrom in fasta_dict.keys():
            chrom_transcripts = dict(
                (k, transcript_dict[k]) for k in transcript_dict
                if transcript_dict[k][3] == chrom)
            if not chrom_transcripts:
                # An empty frame has no column 0 to sort on.
                continue
            chr_txs_df = pd.DataFrame.from_dict(chrom_transcripts,
                                                orient='index')
            chr_txs_df.sort_values([0], inplace=True)
            sorted_transcripts = chr_txs_df.index.tolist()

            neighbours = zip(sorted_transcripts[:-1], sorted_transcripts[1:])
            for transcript, next_transcript in neighbours:
                transcript_end = chr_txs_df[1][transcript]
                next_start = chr_txs_df[0][next_transcript]
                if next_start > transcript_end:
                    seq_plus = fasta_dict[chrom][transcript_end:next_start]
                    pair_name = transcript + '_' + next_transcript
                    seq_dict[pair_name + '_plus'] = seq_plus
                    seq_dict[pair_name + '_minus'] = SP.reverse_complement(seq_plus)
                else:
                    print('Overlapping transcripts:')
                    print(transcript)
                    print(next_transcript)

    with open('{}.fa'.format(prefix), 'w') as fout:
        for record_name, seq in seq_dict.items():
            fout.write('>' + record_name + '\n')
            fout.write(seq + '\n')
    return seq_dict
def sort_bedgraphs(directory, transcript_dict):
    """Run SP.build_bedgraph_dict on every bedgraph file in a directory.

    Files are selected by a case-insensitive '.bedgraph' suffix and each
    selected name is printed.  Paths are built with os.path.join, so
    `directory` works with or without a trailing separator.

    The full list of files is collected before any of them is processed.

    Args:
        directory: directory to scan (not recursive).
        transcript_dict: passed through to SP.build_bedgraph_dict.
    """
    bedgraph_list = []
    for fname in os.listdir(directory):
        if fname.lower().endswith(".bedgraph"):
            print(fname)
            bedgraph_list.append(os.path.join(directory, fname))

    for bedgraph in bedgraph_list:
        SP.build_bedgraph_dict(transcript_dict, bedgraph)
def sort_bedgraphs(directory, transcript_dict):
    """Run SP.build_bedgraph_dict on every bedgraph file in a directory.

    Files are selected by a case-insensitive '.bedgraph' suffix and each
    selected name is printed.  Paths are built with os.path.join, so
    `directory` works with or without a trailing separator.

    The full list of files is collected before any of them is processed.

    Args:
        directory: directory to scan (not recursive).
        transcript_dict: passed through to SP.build_bedgraph_dict.
    """
    bedgraph_list = []
    for fname in os.listdir(directory):
        if fname.lower().endswith(".bedgraph"):
            print(fname)
            bedgraph_list.append(os.path.join(directory, fname))

    for bedgraph in bedgraph_list:
        SP.build_bedgraph_dict(transcript_dict, bedgraph)
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Fetch a 20-nt window around each coordinate and label splice sites.

    The organism is 'pombe' when that word occurs in gff3_file, otherwise
    None.  For every transcript, coordinates in coord_sets[0] are labelled
    "5'" when the window has 'GT' or 'GC' at positions 10-11, and
    coordinates in coord_sets[1] are labelled "3'" when it has 'AG' at
    positions 8-9; everything else is labelled 'other'.  Windows on the
    '-' strand are reverse-complemented before the check.

    Args:
        coord_dict: mapping of transcript name to a pair of coordinate
            collections.
        gff3_file: annotation path passed to SP.build_transcript_dict.
        fasta_file: a str (loaded with make_fasta_dict) or an
            already-loaded mapping of chromosome to sequence.

    Returns:
        dict mapping transcript name to a list of (sequence, label) tuples,
        first for coord_sets[0] and then for coord_sets[1].
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        # (coordinates, where the motif sits, accepted motifs, label)
        site_specs = (
            (coord_sets[0], slice(10, 12), ('GT', 'GC'), "5'"),
            (coord_sets[1], slice(8, 10), ('AG',), "3'"),
        )
        for coords, motif_pos, motifs, label in site_specs:
            for coord in coords:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])
                seq_type = label if sequence[motif_pos] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))
    return seq_dict
def read_sorted_bedgraphs(directory, transcript_dict):
    """Load plus/minus '_CNAGsort.bedgraph' files found in a directory.

    Files whose name contains 'plus' fill slot 0 and files whose name
    contains 'minus' (but not 'plus') fill slot 1 of a two-item list keyed
    by the part of the file name before '_plus' / '_minus'.

    Each file is opened through its path inside `directory`, so the
    function does not depend on the current working directory.

    Args:
        directory: directory to scan (not recursive).
        transcript_dict: passed through to SP.read_CNAGsort_bedgraph2.

    Returns:
        dict mapping sample prefix to [plus_data, minus_data]; a slot stays
        None when no matching file was found.
    """
    stranded_bedgraphs = {}
    for fname in os.listdir(directory):
        if not fname.endswith("_CNAGsort.bedgraph"):
            continue
        if "plus" in fname:
            marker, slot = '_plus', 0
        elif 'minus' in fname:
            marker, slot = '_minus', 1
        else:
            continue

        sample = fname.split(marker)[0]
        pair = stranded_bedgraphs.setdefault(sample, [None, None])
        pair[slot] = SP.read_CNAGsort_bedgraph2(
            os.path.join(directory, fname), transcript_dict, organism='pombe')
    return stranded_bedgraphs
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Fetch a 20-nt window around each coordinate and label splice sites.

    The organism is 'pombe' when that word occurs in gff3_file, otherwise
    None.  For every transcript, coordinates in coord_sets[0] are labelled
    "5'" when the window has 'GT' or 'GC' at positions 10-11, and
    coordinates in coord_sets[1] are labelled "3'" when it has 'AG' at
    positions 8-9; everything else is labelled 'other'.  Windows on the
    '-' strand are reverse-complemented before the check.

    Args:
        coord_dict: mapping of transcript name to a pair of coordinate
            collections.
        gff3_file: annotation path passed to SP.build_transcript_dict.
        fasta_file: a str (loaded with make_fasta_dict) or an
            already-loaded mapping of chromosome to sequence.

    Returns:
        dict mapping transcript name to a list of (sequence, label) tuples,
        first for coord_sets[0] and then for coord_sets[1].
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        # (coordinates, where the motif sits, accepted motifs, label)
        site_specs = (
            (coord_sets[0], slice(10, 12), ('GT', 'GC'), "5'"),
            (coord_sets[1], slice(8, 10), ('AG',), "3'"),
        )
        for coords, motif_pos, motifs, label in site_specs:
            for coord in coords:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])
                seq_type = label if sequence[motif_pos] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))
    return seq_dict
def gene_patches(tx, tx_dict, ax, arrow=False):
    """Draw every isoform whose name contains `tx` onto a matplotlib axis.

    Isoforms are the keys of tx_dict that contain `tx` as a substring; each
    one is drawn 0.15 axis units below the previous.  The transcript span
    is a thin rectangle, or an arrow in strand direction when `arrow` is
    not False.  Exons are drawn as thicker rectangles when SP.tx_info
    returns them, otherwise a single rectangle from CDS start to CDS end
    is drawn.  Y ticks are removed.

    Args:
        tx: transcript or gene name to look for.
        tx_dict: transcript dictionary handed to SP.tx_info.
        ax: matplotlib axis to draw on.
        arrow: draw the span as an arrow instead of a rectangle.

    Returns:
        The strand of the last isoform drawn, or None when no key of
        tx_dict contains `tx`.
    """
    isoforms = [key for key in tx_dict if tx in key]
    if len(isoforms) == 0:
        return None

    for n, iso in enumerate(isoforms):
        start, end, strand, CDS_start, CDS_end, exons, chrom = SP.tx_info(
            iso, tx_dict)
        y_shift = n * 0.15

        if arrow is False:
            ax.add_patch(patches.Rectangle(
                (start, 0.8 - y_shift), end - start, 0.04,
                edgecolor='0.1', facecolor='0.1'))
        elif strand == '+':
            forward = end - start
            ax.arrow(start, 0.9, forward - 0.02 * forward, 0, linewidth=2,
                     head_width=0.1, head_length=0.02 * (end - start),
                     fc='k', ec='k')
        elif strand == '-':
            backward = start - end
            ax.arrow(end, 0.9, backward - 0.02 * backward, 0, linewidth=2,
                     head_width=0.1, head_length=0.02 * (end - start),
                     fc='k', ec='k')

        if exons is not None:
            exon_patches = [
                patches.Rectangle((exon_start, 0.775 - y_shift),
                                  exon_stop - exon_start, 0.10,
                                  edgecolor='0.1', facecolor='0.1')
                for exon_start, exon_stop in exons]
            for patch in exon_patches:
                ax.add_patch(patch)
        else:
            ax.add_patch(patches.Rectangle(
                (CDS_start, 0.75 - y_shift), CDS_end - CDS_start, 0.10,
                edgecolor='0.1', facecolor='0.1'))

    ax.get_yaxis().set_ticks([])
    return strand
def build_transcript_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", expand=False, convert_chroms=False):
    """Build the pombe transcript dictionary with optional adjustments.

    Wraps SP.build_transcript_dict(gff3, organism='pombe').

    Args:
        gff3: annotation path.
        expand: when exactly True, every transcript is widened by 300 bases
            on both sides (start not below 0; end not beyond the chromosome
            length for the chromosomes with a known length), and empty CDS
            start/end lists are replaced by [start] / [end] of the original
            entry.
        convert_chroms: when exactly True, chromosome names 'chr1', 'chr2',
            'chr3' and 'MT' are mapped to 'I', 'II', 'III' and 'MT'; any
            other name raises KeyError.

    Returns:
        dict mapping transcript name to
        [start, end, strand, chromosome, CDS starts, CDS ends].
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}
    if convert_chroms is True:
        converted = {}
        for tx, info in transcript_dict.items():
            start, end, strand, chrom, cds_start, cds_end = info
            converted[tx] = [start, end, strand, lat_rom[chrom],
                             cds_start, cds_end]
        transcript_dict = converted

    chrom_lengths = {'I': 5818680, 'II': 4744158, 'III': 2598968,
                     'chr1': 5818680, 'chr2': 4744158, 'chr3': 2598968}
    if expand is True:
        expanded_dict = {}
        for tx, info in transcript_dict.iteritems():
            new_start = max(info[0] - 300, 0)
            new_end = info[1] + 300
            if info[3] in chrom_lengths:
                new_end = min(new_end, chrom_lengths[info[3]])
            # The entry being read is updated as well, not only the copy.
            if len(info[4]) == 0:
                info[4] = [info[0]]
            if len(info[5]) == 0:
                info[5] = [info[1]]
            expanded_dict[tx] = [new_start, new_end, info[2], info[3],
                                 info[4], info[5]]
        transcript_dict = expanded_dict
    return transcript_dict
def main():
    """Command-line entry point: assign peaks to genes and run MEME.

    sys.argv[1] is the peak file; its base name up to the first '.' is the
    output prefix.  An optional sys.argv[2] is a gene list file: with it,
    the peaks are split into genes in the list and the rest, and CSV and
    sequence files are written for both halves; without it a single CSV
    and sequence file are written.  The minimum site count handed to
    call_meme is 75% of the number of rows, capped at 600.
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        minsites = [min(int(0.75 * len(in_list)), 600),
                    min(int(0.75 * len(other)), 600)]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = min(int(0.75 * len(peak_df)), 600)

    call_meme(prefix, minsites, split=split)
def add_cdf_to_plot(ax, value_lists, label_list, color_list, ks_list, log2=False):
    """Draw one cumulative distribution curve per value list on `ax`.

    Values whose string form is 'inf', '-inf' or 'nan' are dropped.  The
    x range is set to the 1st-99th percentile of all plotted values
    pooled together.

    Args:
        ax: matplotlib axis to draw on.
        value_lists: sequence of value collections, one per curve.
        label_list: legend label per curve.
        color_list: colour per curve.
        ks_list: None, or a sequence of p-value strings; the first two
            (all four when there are exactly four) are written at the lower
            right of the plot.
        log2: when exactly True, values are log2-transformed first.

    Returns:
        The axis.
    """
    pooled = []
    for n, values in enumerate(value_lists):
        if log2 is True:
            values = [np.log2(x) for x in values]
        finite = [x for x in values
                  if str(x) not in ('inf', '-inf', 'nan')]
        pooled = pooled + finite

        cumulative, base = SP.cdf_values(finite)
        ax.plot(base[1:], cumulative, c=color_list[n], linewidth=3.0,
                label=label_list[n])

    xmin = np.percentile(pooled, 1)
    xmax = np.percentile(pooled, 99)
    ax.set_xlim([xmin, xmax])
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12)

    if ks_list is not None:
        text = "p-values: \n" + ks_list[0] + ' \n' + ks_list[1] + ' '
        if len(ks_list) == 4:
            text = text + ' \n' + ks_list[2] + ' \n' + ks_list[3] + ' '
        ax.annotate(text, xy=(xmax, 0.0), horizontalalignment='right',
                    fontsize=12)
    return ax
def find_polyA_sites(transcript_dict, window=220):
    """Build a window around the strongest polyA signal of each transcript.

    Signal is read from a fixed polyA bedgraph via
    SP.read_CNAGsort_bedgraph2.  For each transcript only positive values
    are considered and the position with the highest value is used as the
    polyA site.  Transcripts without positive signal, or whose strand is
    neither '+' nor '-', are left out.

    Args:
        transcript_dict: mapping of transcript name to a sequence whose
            items 2 and 3 are strand and chromosome.
        window: number of bases taken on either side of the site.

    Returns:
        dict mapping transcript name to
        [site - window, site + window, strand, chromosome].
    """
    polyA_bg = SP.read_CNAGsort_bedgraph2(
        '/home/jordan/GENOMES/POMBE/polyA_sites_CNAGsort.bedgraph',
        transcript_dict, organism='pombe')

    pA_dict = {}
    for tx, signal in polyA_bg.iteritems():
        signal = signal[signal > 0]
        if len(signal) == 0:
            continue
        strand = transcript_dict[tx][2]
        if strand not in ('+', '-'):
            continue
        # Highest value first; the same rule is used for both strands.
        ranked = signal.sort_values(ascending=False)
        pA_site = ranked.index[0]
        pA_dict[tx] = [pA_site - window, pA_site + window,
                       strand, transcript_dict[tx][3]]
    return pA_dict
def by_pos_plots(df, metrics=['Intermediate Level', 'Precursor']):
    """Plot position-wise base scores for introns that change up or down.

    Columns whose second label contains 'Base 5' or 'Base 3' are
    concatenated row-wise (in column order) into one 5' and one 3' string
    per row.  For every direction ('Up', 'Down') and metric, rows where the
    ('All', '<metric> change') column equals the direction are selected;
    when more than 5 rows remain their strings are handed to
    SP.position_wise_scores2 with organism 'crypto'.

    Args:
        df: DataFrame with two-level column labels.
        metrics: metric names to iterate over.
    """
    col5 = [col for col in df.columns if 'Base 5' in col[1]]
    col3 = [col for col in df.columns if 'Base 3' in col[1]]

    for direction in ['Up', 'Down']:
        for metric in metrics:
            subset = df[df[('All', metric + ' change')] == direction]
            if len(subset) > 5:
                for n, col in enumerate(col5):
                    s5 = subset[col] if n == 0 else s5.str.cat(subset[col])
                print(len(s5))
                for n, col in enumerate(col3):
                    s3 = subset[col] if n == 0 else s3.str.cat(subset[col])
                print(metric + ' ' + direction)
                fig = SP.position_wise_scores2(s5, s3, 'crypto')
def main():
    """Command-line entry point: assign peaks to genes and run MEME.

    sys.argv[1] is the peak file; its base name up to the first '.' is the
    output prefix.  An optional sys.argv[2] is a gene list file: with it,
    the peaks are split into genes in the list and the rest, and CSV and
    sequence files are written for both halves; without it a single CSV
    and sequence file are written.  The minimum site count handed to
    call_meme is 75% of the number of rows, capped at 600.
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        minsites = [min(int(0.75 * len(in_list)), 600),
                    min(int(0.75 * len(other)), 600)]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = min(int(0.75 * len(peak_df)), 600)

    call_meme(prefix, minsites, split=split)
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta, junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    """Call peaks in three samples, annotate them and write a bedgraph.

    Steps: peaks are collected per gene for the untagged and the two
    tagged samples, compared between replicates, checked against the
    annotated splice sites, collapsed, given a 'genome coord' column
    ('<chromosome>:<position>') and sequences.  One line per peak is
    written to '<name>.bedgraph' with chromosome, position, the
    neighbouring position in strand direction and the height (negated on
    the '-' strand).

    Args:
        untagged_peak_file, tagged1_peak_file, tagged2_peak_file: peak
            files for the three samples.
        gff3: annotation path; organism is 'pombe' when the path contains
            that word, otherwise None.
        fasta: a str (loaded with SP.make_fasta_dict) or an already-loaded
            chromosome-to-sequence mapping.
        junction_df, branch_df: accepted but not used here.
        cutoff: passed to CP_peaks_by_gene.
        name: prefix of the bedgraph file.

    Returns:
        The peak DataFrame with sequences added.
    """
    organism = 'pombe' if 'pombe' in gff3 else None
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)

    print("Finding peaks in transcripts...")
    per_sample = []
    for peak_file in (untagged_peak_file, tagged1_peak_file,
                      tagged2_peak_file):
        print(peak_file)
        per_sample.append(
            CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = per_sample

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    position_text = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_text,
                                                            sep=':')

    if type(fasta) == str:
        fasta = SP.make_fasta_dict(fasta)

    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name + '.bedgraph', 'w') as fout:
        for ix, r in peak_seq_df.iterrows():
            if r['strand'] == '+':
                position2 = r['position'] + 1
                height = r['height']
            elif r['strand'] == '-':
                position2 = r['position'] - 1
                height = r['height'] * -1
            # The newline is the last joined field, so it follows a tab.
            fields = [r['chromosome'], r['position'], position2, height, '\n']
            fout.write('\t'.join([str(x) for x in fields]))

    print("Completed")
    return peak_seq_df
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    """Score base composition per position of a sample against all introns.

    For every position of the 5' and 3' splice site windows a chi-square
    contingency test compares the sample's base totals with the totals of
    all annotated splice sites (from generate_all_ss_seqs).  The score is
    -log10(p); positions 2 and 3 of the 5' window and 4 and 5 of the 3'
    window are not tested and get a fixed score of 1.  Both score profiles
    are drawn as bar plots sharing one y range, and the figure is shown.

    Args:
        seq5_list: 5' splice site sequences of the sample.
        seq3_list: 3' splice site sequences of the sample.
        organism: organism name passed to SP.find_organism_files.
        title: title of the upper panel.

    Returns:
        The matplotlib figure.
    """
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    def neg_log_p(sample, population, untested):
        # One score per column; untested columns get the fixed value 1.
        scores = []
        for n in range(sample.shape[1]):
            if n in untested:
                scores.append(1)
                continue
            conting = np.array([sample[:, n], population[:, n]])
            chi2, p, dof, expected = stats.chi2_contingency(conting)
            scores.append(np.log10(p) * -1)
        return scores

    p5 = neg_log_p(samp_5p, pop_5p, (2, 3))
    p3 = neg_log_p(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4, 4))
    width = 0.7
    tallest = max(p5 + p3)
    max_y = tallest + 0.1 * tallest

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    ax[0].plot([0, 8], [2, 2], '--', color='0.7')
    ax[0].set_xlim([0, len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0, max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0, 8], [2, 2], '--', color='0.7')
    ax[1].set_xlim([0, len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0, max_y])

    # Both panels take their tick positions from the 3' index array.
    tick_positions = ind3 + width / 2
    ax[0].set_xticks(tick_positions)
    ax[1].set_xticks(tick_positions)
    ax[0].set_xticklabels(np.arange(-2, 6))
    ax[1].set_xticklabels(np.arange(-5, 3))

    fig.tight_layout()
    plt.show()
    return fig
def read_sorted_bedgraphs(directory, transcript_dict):
    """Load plus/minus '_CNAGsort.bedgraph' files found in a directory.

    Files whose name contains 'plus' fill slot 0 and files whose name
    contains 'minus' (but not 'plus') fill slot 1 of a two-item list keyed
    by the part of the file name before '_plus' / '_minus'.

    Each file is opened through its path inside `directory`, so the
    function does not depend on the current working directory.

    Args:
        directory: directory to scan (not recursive).
        transcript_dict: passed through to SP.read_CNAGsort_bedgraph2.

    Returns:
        dict mapping sample prefix to [plus_data, minus_data]; a slot stays
        None when no matching file was found.
    """
    stranded_bedgraphs = {}
    for fname in os.listdir(directory):
        if not fname.endswith("_CNAGsort.bedgraph"):
            continue
        if "plus" in fname:
            marker, slot = '_plus', 0
        elif 'minus' in fname:
            marker, slot = '_minus', 1
        else:
            continue

        sample = fname.split(marker)[0]
        pair = stranded_bedgraphs.setdefault(sample, [None, None])
        pair[slot] = SP.read_CNAGsort_bedgraph2(
            os.path.join(directory, fname), transcript_dict, organism='pombe')
    return stranded_bedgraphs
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Collect the 15 nt that follow the start of each intron.

    The introns come from the first available source: the given ss_dict, a
    junction BED file, a peak DataFrame (each peak whose 'type' does not
    contain 'prime' is treated as an intron start, paired with a coordinate
    50 bases away in strand direction), or the annotation itself.

    Unless junction_bed is given, an isoform suffix ('.1' for pombe, 'T0'
    otherwise) is appended to every transcript name before it is looked up
    in the transcript dictionary.

    Args:
        gff3_file: annotation path passed to the SP helpers.
        fasta_file: a dict, a path ending in 'json' (loaded as JSON) or any
            other path (loaded with make_fasta_dict).
        ss_dict: optional mapping of transcript to intron coordinate pairs.
        junction_bed: optional junction BED file.
        gene_list: passed to SP.list_splice_sites when the annotation is
            used as the source.
        peak_df: optional DataFrame with 'type', 'transcript', 'strand' and
            'position' columns.
        organism: organism name passed to the SP helpers.

    Returns:
        dict mapping '<transcript>-<chromosome>:<coordinate>' to sequence;
        '-' strand sequences are reverse-complemented.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, r in peak_df.iterrows():
                pairs = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    pairs.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    pairs.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        if junction_bed is None:
            transcript = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        label = transcript + '-' + chrom + ':'
        for intron in list(introns):
            site = intron[0]
            if strand == '+':
                seq_dict[label + str(site + 1)] = \
                    fasta_dict[chrom][site + 2:site + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][site - 16:site - 1]
                seq_dict[label + str(site)] = SP.reverse_complement(seq)
    return seq_dict
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Return the sequence from start through end (inclusive) on a strand.

    Args:
        chrom: chromosome name, a key of the sequence mapping.
        start: first index of the slice.
        end: last index of the slice (included).
        strand: '-' requests the reverse complement; any other value
            returns the slice as stored.
        fasta_dict: chromosome-to-sequence mapping, or a str path to a JSON
            file holding one.

    Returns:
        The requested sequence.
    """
    if type(fasta_dict) is str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)

    region = fasta_dict[chrom][start:end + 1]
    return SP.reverse_complement(region) if strand == '-' else region
def check_intron_position(transcript, position, gff3, organism):
    """Report whether the intron at `position` is the first and/or last one.

    The introns of `transcript` are taken from SP.list_splice_sites followed
    by SP.collapse_ss_dict.  The first intron whose first coordinate lies in
    [position - 3, position + 3) is the match.

    The match is compared with the last index of the intron collection.
    The two flags are independent, so the only intron of a single-intron
    transcript is reported as both first and last.

    NOTE(review): "first" and "last" refer to the iteration order of the
    collapsed intron collection; this assumes that order follows the
    transcript - confirm.

    Args:
        transcript: key into the collapsed splice-site table.
        position: coordinate to look for.
        gff3: annotation passed through to SP.list_splice_sites.
        organism: organism name passed through to SP.list_splice_sites.

    Returns:
        (first, last) booleans; both False when no intron matches.

    Raises:
        KeyError: if `transcript` is not in the splice-site table.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    first = False
    last = False
    introns = ss_dict[transcript]
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            first = n == 0
            last = n == len(introns) - 1
            break
    return first, last
def check_intron_position(transcript, position, gff3, organism):
    """Tell whether the intron starting near ``position`` is first and/or last.

    The introns of ``transcript`` are taken from ``SP.list_splice_sites`` /
    ``SP.collapse_ss_dict``. The first intron whose element 0 lies in
    ``[position - 3, position + 3)`` is the match.

    Parameters
    ----------
    transcript : str
        Key into the collapsed splice-site dictionary (KeyError if absent).
    position : int
        Coordinate to look for.
    gff3 : str
        Annotation file.
    organism : str or None
        Forwarded to ``SP.list_splice_sites``.

    Returns
    -------
    (bool, bool)
        ``(first, last)``; both False when nothing matches and both True when
        the transcript's only intron matches.

    Notes
    -----
    "First" and "last" refer to the iteration order of the intron collection
    returned by ``SP.collapse_ss_dict`` - presumably genomic order; confirm.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    introns = ss_dict[transcript]

    first = False
    last = False
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            first = n == 0
            # Compare against the number of introns. The previous code used
            # len(intron), i.e. the length of the coordinate tuple, and an
            # ``elif`` that could never flag a single intron as last.
            last = n == len(introns) - 1
            break
    return first, last
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Fetch ``start`` through ``end`` (inclusive slice end) from a chromosome.

    ``fasta_dict`` may be a ``{chromosome: sequence}`` mapping or the path of a
    JSON file that is loaded into one. Minus-strand requests are returned
    through ``SP.reverse_complement``; every other strand value gets the raw
    slice.
    """
    if isinstance(fasta_dict, str):
        with open(fasta_dict, 'r') as json_file:
            fasta_dict = json.load(json_file)
    region = fasta_dict[chrom][start:end + 1]
    if strand != '-':
        return region
    return SP.reverse_complement(region)
def gene_patches(tx, tx_dict, ax, arrow=False):
    """Draw a gene model for every isoform of ``tx`` onto ``ax``.

    Isoforms are the keys of ``tx_dict`` that contain ``tx`` as a substring.
    Each isoform gets either a thin rectangle spanning the transcript
    (``arrow is False``) or a strand-directed arrow, plus thicker rectangles
    for its exons, or for the CDS span when ``SP.tx_info`` reports no exons.
    Isoforms are stacked downwards in steps of 0.15 in y. The y-axis ticks
    are removed at the end.

    Returns
    -------
    str or None
        The strand of the last isoform drawn, or None when no key matches.
    """
    isoforms = [key for key in tx_dict if tx in key]
    if not isoforms:
        return None

    dark = dict(edgecolor='0.1', facecolor='0.1')
    for n, iso in enumerate(isoforms):
        (start, end, strand, CDS_start, CDS_end,
         exons, chrom) = SP.tx_info(iso, tx_dict)

        if arrow is False:
            ax.add_patch(patches.Rectangle((start, 0.8 - n * 0.15),
                                           end - start, 0.04, **dark))
        elif strand == '+':
            ax.arrow(start, 0.9, end - start - 0.02 * (end - start), 0,
                     linewidth=2, head_width=0.1,
                     head_length=0.02 * (end - start), fc='k', ec='k')
        elif strand == '-':
            ax.arrow(end, 0.9, start - end - 0.02 * (start - end), 0,
                     linewidth=2, head_width=0.1,
                     head_length=0.02 * (end - start), fc='k', ec='k')

        if exons is None:
            # No exon list: draw one box over the whole CDS span instead.
            ax.add_patch(patches.Rectangle((CDS_start, 0.75 - n * 0.15),
                                           CDS_end - CDS_start, 0.10, **dark))
            continue
        exon_boxes = [
            patches.Rectangle((exon_start, 0.775 - n * 0.15),
                              exon_stop - exon_start, 0.10, **dark)
            for exon_start, exon_stop in exons
        ]
        for box in exon_boxes:
            ax.add_patch(box)

    ax.get_yaxis().set_ticks([])
    return strand
def peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism,
                         base_dir, name):
    """Match peaks against branch points found in one or two branch BAM files.

    Branch points are listed with ``SP.list_branch_points`` and tabulated with
    ``SP.create_branch_df``. With exactly two BAM files the two tables are
    appended and the two ``*.bed`` files (named after the BAMs, with
    ``'_sorted.bam'`` replaced by ``'.bed'``) are concatenated into
    ``<name>_all_branches.bed`` in the working directory and then deleted.

    Branch rows whose ``genome coord`` also occurs in ``peak_df`` are kept,
    annotated with the peak ``type``, indexed by ``branch coord``, and written
    to ``<base_dir><name>_peaks_w_branch.csv`` and ``.pickle``.

    Returns
    -------
    pandas.DataFrame
        The branch rows that coincide with a peak.
    """
    branches = SP.list_branch_points(branch_bams[0], gff3, fa_dict,
                                     organism=organism)
    branch_df = SP.create_branch_df(branches, gff3, fa_dict, organism=organism)

    if len(branch_bams) == 2:
        branches2 = SP.list_branch_points(branch_bams[1], gff3, fa_dict,
                                          organism=organism)
        branch_df2 = SP.create_branch_df(branches2, gff3, fa_dict,
                                         organism=organism)
        branch_df = branch_df.append(branch_df2)

        bed1 = branch_bams[0].split('_sorted.bam')[0] + '.bed'
        bed2 = branch_bams[1].split('_sorted.bam')[0] + '.bed'

        # Concatenate in Python instead of a shell-string `cat a b > c`.
        # This survives spaces and shell metacharacters in the paths, and a
        # failed copy raises before the inputs below are deleted.
        import shutil
        with open("{0}_all_branches.bed".format(name), 'wb') as merged:
            for bed in (bed1, bed2):
                with open(bed, 'rb') as part:
                    shutil.copyfileobj(part, merged)
        os.remove(bed1)
        os.remove(bed2)

    # Compare peaks and branches
    peaks_w_branch = branch_df[branch_df['genome coord'].isin(
        peak_df['genome coord'])]
    peaks_w_branch = peaks_w_branch.merge(peak_df[['type', 'genome coord']],
                                          right_on='genome coord',
                                          left_on='genome coord',
                                          how='left')
    peaks_w_branch.index = peaks_w_branch['branch coord']

    print("\nPeaks with corresponding branches:")
    print(len(peaks_w_branch))
    print(str(len(set(peaks_w_branch['genome coord']))) + " unpredicted")

    peaks_w_branch.to_csv(base_dir + name + '_peaks_w_branch.csv')
    peaks_w_branch.to_pickle(base_dir + name + '_peaks_w_branch.pickle')
    return peaks_w_branch
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Score pyrimidine content upstream of annotated and alternative 3' sites.

    For each row, two 15 nt windows are cut from ``fa_dict`` next to element 1
    of ``'annotated intron coords'`` and of ``'junction coords'``: the 15 nt
    immediately before the coordinate and the 15 nt before that, both in
    transcript orientation. Each window is scored with ``percent_py``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``strand``, ``chromosome``, ``annotated intron coords`` and
        ``junction coords`` columns. It is modified in place.
    fa_dict : dict
        ``{chromosome: sequence}``.
    alt_column_name, from_branches
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four added ``'Py score ...'`` columns. The annotated
        scores are NaN when the annotated coordinate is None. All scores of a
        row are NaN when its strand is neither ``'+'`` nor ``'-'`` (such rows
        previously reused the sequences of the preceding row).
    """
    def _upstream_windows(chrom, strand, site):
        # Return the (-15:0, -30:-15) windows for one site, or None when the
        # strand is not recognised.
        if strand == '+':
            return (fa_dict[chrom][site - 15:site],
                    fa_dict[chrom][site - 30:site - 15])
        if strand == '-':
            return (SP.reverse_complement(fa_dict[chrom][site:site + 15]),
                    SP.reverse_complement(fa_dict[chrom][site + 15:site + 30]))
        return None

    def _scores(windows):
        # Score a window pair, or give NaNs when there is nothing to score.
        if windows is None:
            return np.nan, np.nan
        return percent_py(windows[0]), percent_py(windows[1])

    py_score1 = []
    py_score2 = []
    alt_py1 = []
    alt_py2 = []
    for _, r in df.iterrows():
        strand = r['strand']
        chrom = r['chromosome']
        coord = r['annotated intron coords'][1]
        alt_coord = r['junction coords'][1]

        alt_near, alt_far = _scores(_upstream_windows(chrom, strand, alt_coord))
        alt_py1.append(alt_near)
        alt_py2.append(alt_far)

        annotated = None
        if coord is not None:
            annotated = _upstream_windows(chrom, strand, coord)
        near, far = _scores(annotated)
        py_score1.append(near)
        py_score2.append(far)

    df['Py score annotated -15:0'] = py_score1
    df['Py score annotated -30:-15'] = py_score2
    df['Py score alternative -15:0'] = alt_py1
    df['Py score alternative -30:-15'] = alt_py2
    return df
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Score pyrimidine content upstream of annotated and alternative 3' sites.

    For each row, two 15 nt windows are cut from ``fa_dict`` next to element 1
    of ``'annotated intron coords'`` and of ``'junction coords'``: the 15 nt
    immediately before the coordinate and the 15 nt before that, both in
    transcript orientation. Each window is scored with ``percent_py``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``strand``, ``chromosome``, ``annotated intron coords`` and
        ``junction coords`` columns. It is modified in place.
    fa_dict : dict
        ``{chromosome: sequence}``.
    alt_column_name, from_branches
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four added ``'Py score ...'`` columns. The annotated
        scores are NaN when the annotated coordinate is None. All scores of a
        row are NaN when its strand is neither ``'+'`` nor ``'-'`` (such rows
        previously reused the sequences of the preceding row).
    """
    def _upstream_windows(chrom, strand, site):
        # Return the (-15:0, -30:-15) windows for one site, or None when the
        # strand is not recognised.
        if strand == '+':
            return (fa_dict[chrom][site - 15:site],
                    fa_dict[chrom][site - 30:site - 15])
        if strand == '-':
            return (SP.reverse_complement(fa_dict[chrom][site:site + 15]),
                    SP.reverse_complement(fa_dict[chrom][site + 15:site + 30]))
        return None

    def _scores(windows):
        # Score a window pair, or give NaNs when there is nothing to score.
        if windows is None:
            return np.nan, np.nan
        return percent_py(windows[0]), percent_py(windows[1])

    py_score1 = []
    py_score2 = []
    alt_py1 = []
    alt_py2 = []
    for _, r in df.iterrows():
        strand = r['strand']
        chrom = r['chromosome']
        coord = r['annotated intron coords'][1]
        alt_coord = r['junction coords'][1]

        alt_near, alt_far = _scores(_upstream_windows(chrom, strand, alt_coord))
        alt_py1.append(alt_near)
        alt_py2.append(alt_far)

        annotated = None
        if coord is not None:
            annotated = _upstream_windows(chrom, strand, coord)
        near, far = _scores(annotated)
        py_score1.append(near)
        py_score2.append(far)

    df['Py score annotated -15:0'] = py_score1
    df['Py score annotated -30:-15'] = py_score2
    df['Py score alternative -15:0'] = alt_py1
    df['Py score alternative -30:-15'] = alt_py2
    return df
def peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism,
                           base_dir, name):
    """Match peaks against exon-exon junctions from two junction BED files.

    The two junction tables are built with ``SP.build_junction_df``, merged
    with ``SP.combine_junctions``, compared to ``peak_df`` with
    ``SP.compare_peak_junc_df`` and scored with ``SP.score_peaks``. The result
    is indexed by ``genome coord``, sorted so that rows whose ``junction type``
    is ``'Annotated'`` come first, given an ``intron tuple`` column and written
    to ``<base_dir><name>_peaks_w_junc.csv`` and ``.pickle``.

    Returns
    -------
    pandas.DataFrame
        The peak/junction table that was written out.
    """
    # Load in junctions
    first_bed, second_bed = junc_beds[0], junc_beds[1]
    junctions_a = SP.build_junction_df(first_bed, gff3, fa_dict,
                                       organism=organism)
    junctions_b = SP.build_junction_df(second_bed, gff3, fa_dict,
                                       organism=organism)
    all_junctions = SP.combine_junctions(junctions_a, junctions_b)

    # Compare peaks and junctions
    table = SP.compare_peak_junc_df(peak_df, all_junctions, organism=organism)
    table = SP.score_peaks(table, gff3, fa_dict)

    # Reformat dataframe - add index, sort so that the annotated intron is
    # first in each cluster
    table.index = table['genome coord']
    is_annotated = table['junction type'] == 'Annotated'
    table['type index'] = np.where(is_annotated, 0, 1)
    table = table.sort_values('type index')
    # NOTE(review): the result of this groupby is discarded, so it does not
    # reduce the table to one row per coordinate. Kept as-is to preserve
    # behaviour; confirm whether an assignment was intended.
    table.groupby(table.index).first()
    table = table.drop(['index', 'type index'], axis=1)
    table['intron tuple'] = list(zip(table['transcript'].tolist(),
                                     table['annotated intron size'].tolist()))

    print("\nPeaks with corresponding exon-exon junctions:")
    print(len(table))
    not_prime = table[~table['type'].str.contains('prime')]
    print(str(len(set(not_prime['genome coord']))) + " unpredicted")

    out_prefix = base_dir + name
    table.to_csv(out_prefix + '_peaks_w_junc.csv')
    table.to_pickle(out_prefix + '_peaks_w_junc.pickle')
    return table
def peaks_only(config_file, untagged, organism):
    """Run peak calling and quantitation as described by a config file.

    Each config line is classified by the first rule that matches:

    * it contains ``untagged``: the untagged changepoint file (the last such
      line wins);
    * it contains ``'changepoint'`` or ``'peak'`` (case-insensitive): a tagged
      peak file; the first two are used;
    * it contains ``'quant'``: a ``file,quant,sample`` line giving a BAM file
      for quantitation, keyed by the sample name.

    Output goes next to the config file, prefixed with the part of its file
    name before ``'_config'``: ``_all_peaks.pickle``, ``_quantitation.pickle``
    and ``_quantitation.csv``, plus whatever ``SP.SP_pipeline_scatters``
    produces.

    Parameters
    ----------
    config_file : str
        Path of the config file ('/'-separated).
    untagged : str
        Substring identifying the untagged sample line.
    organism : str
        Passed to ``SP.find_organism_files``.

    Raises
    ------
    ValueError
        If the config has no untagged line or fewer than two peak lines.
    """
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Fail with a clear message instead of an UnboundLocalError / IndexError
    # further down.
    if CP_untagged is None:
        raise ValueError("No line containing '{0}' found in {1}".format(
            untagged, config_file))
    if len(CP_out) < 2:
        raise ValueError(
            "Expected at least two changepoint/peak lines in {0}, found {1}"
            .format(config_file, len(CP_out)))

    # Split on the last '/' so that a parent directory containing the sample
    # name cannot truncate the output directory.
    directory, sep, file_name = config_file.rpartition('/')
    name = file_name.split('_config')[0]
    base_dir = directory + sep
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3,
                                      fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)
    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def build_tss_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", window=220):
    """Build a window of +/- ``window`` around each transcript's start site.

    Transcripts come from ``SP.build_transcript_dict(gff3, organism='pombe')``.
    The window is centred on element 0 of the transcript record for '+' strand
    transcripts and on element 1 for '-' strand transcripts. Transcripts with
    any other strand value are left out.

    Returns
    -------
    dict
        ``{transcript: [centre - window, centre + window, strand, chromosome]}``
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
    # Which element of the transcript record is used as the centre, per strand.
    centre_index = {'+': 0, '-': 1}

    tss_dict = {}
    for tx, info in transcript_dict.items():
        strand = info[2]
        if strand not in centre_index:
            continue
        centre = info[centre_index[strand]]
        tss_dict[tx] = [centre - window, centre + window, strand, info[3]]
    return tss_dict
def find_3p_site(branch_df, gff3, organism=None):
    """Add the 3' splice site and derived distances to a branch table.

    For each row, the introns of the transcript (the ``transcript`` value
    minus its last two characters) are looked up in the collapsed splice-site
    dictionary. The first intron whose element 0 is within one position of
    the row's ``5p splice site`` supplies the 3' coordinate (its element 1);
    rows without a match get NaN.

    ``branch_df`` is modified in place and returned with three new columns:
    ``3p splice site``, ``intron size`` and ``Branch to 3p distance`` (the
    last two as absolute differences).
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for _, row in branch_df.iterrows():
        candidates = ss_dict[row['transcript'][:-2]]
        three_coord.append(next(
            (intron[1] for intron in candidates
             if row['5p splice site'] in range(intron[0] - 1, intron[0] + 2)),
            np.nan))

    branch_df['3p splice site'] = three_coord
    five_to_three = branch_df['5p splice site'] - branch_df['3p splice site']
    branch_df['intron size'] = five_to_three
    branch_df['intron size'] = branch_df['intron size'].apply(abs)
    branch_to_three = branch_df['branch site'] - branch_df['3p splice site']
    branch_df['Branch to 3p distance'] = branch_to_three
    branch_df['Branch to 3p distance'] = branch_df['Branch to 3p distance'].apply(abs)
    return branch_df
def write_transcript_fasta(transcript_dict, fasta_dict, prefix='transcripts', sense=True, spliced=False):
    """Write one FASTA record per transcript to ``<prefix>.fa``.

    Each transcript record is ``[start, end, strand, chromosome,
    CDS_start_list, CDS_end_list]``. With ``spliced=False`` the sequence is
    ``fasta_dict[chromosome][start - 1:end]``; with ``spliced=True`` it is the
    concatenation of the ``[CDS_start - 1:CDS_end]`` pieces in list order.
    Minus-strand sequences (per piece when spliced) go through
    ``SP.reverse_complement``. ``sense=False`` reverse-complements the final
    sequence once more.

    Returns
    -------
    dict
        ``{transcript: sequence}`` exactly as written to the file.
    """
    seq_dict = {}
    for transcript, values in transcript_dict.items():
        start, end = values[0], values[1]
        strand, chrom = values[2], values[3]
        CDS_start_list = values[4]
        CDS_end_list = values[5]
        chrom_seq = fasta_dict[chrom]

        if spliced is False:
            seq = chrom_seq[start - 1:end]
            if strand == '-':
                seq = SP.reverse_complement(seq)
        elif spliced is True:
            pieces = []
            for n, exon_start in enumerate(CDS_start_list):
                if strand == '+':
                    pieces.append(chrom_seq[exon_start - 1:CDS_end_list[n]])
                elif strand == '-':
                    exon = chrom_seq[exon_start - 1:CDS_end_list[n]]
                    pieces.append(SP.reverse_complement(exon))
            seq = ''.join(pieces)

        if sense is False:
            seq = SP.reverse_complement(seq)
        seq_dict[transcript] = seq

    with open('{}.fa'.format(prefix), 'w') as fout:
        for transcript, seq in seq_dict.items():
            fout.write('>' + transcript + '\n' + seq + '\n')
    return seq_dict
def build_tss_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        window=220):
    """Build a window of +/- ``window`` around each transcript's start site.

    Transcripts come from ``SP.build_transcript_dict(gff3, organism='pombe')``.
    The window is centred on element 0 of the transcript record for '+' strand
    transcripts and on element 1 for '-' strand transcripts. Transcripts with
    any other strand value are left out.

    Returns
    -------
    dict
        ``{transcript: [centre - window, centre + window, strand, chromosome]}``
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
    # Which element of the transcript record is used as the centre, per strand.
    centre_index = {'+': 0, '-': 1}

    tss_dict = {}
    for tx, info in transcript_dict.items():
        strand = info[2]
        if strand not in centre_index:
            continue
        centre = info[centre_index[strand]]
        tss_dict[tx] = [centre - window, centre + window, strand, info[3]]
    return tss_dict
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Tabulate branch points per 5' splice site into a DataFrame.

    ``branch_dict`` maps a transcript to a list of records
    ``('<chrom>:<pos>', branch_positions, depths)``. A branch position is kept
    when it lies more than 5 and at most 1000 positions from the 5' site and
    its depth is at least 5. ``distance`` is signed by the transcript strand
    and only rows with a positive distance survive. ``genome coord`` and
    ``branch coord`` are ``'<chrom>:<position>'`` strings. The table is then
    passed through ``add_seq`` and ``find_3p_site``.

    Returns
    -------
    pandas.DataFrame
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    transcripts, chroms, fives, branches = [], [], [], []
    depths, distances, strands = [], [], []
    for tx, five_sites in branch_dict.items():
        for five_site in five_sites:
            coord_fields = five_site[0].split(':')
            chrom = coord_fields[0]
            pos = int(coord_fields[1])
            for n, branch in enumerate(five_site[1]):
                if not (5 < abs(branch - pos) <= 1000
                        and five_site[2][n] >= 5):
                    continue
                chroms.append(chrom)
                fives.append(pos)
                transcripts.append(tx)
                branches.append(branch)
                depths.append(five_site[2][n])
                strand = tx_dict[tx][2]
                strands.append(strand)
                if strand == '+':
                    distances.append(branch - pos)
                elif strand == '-':
                    distances.append(pos - branch)

    branch_df = pd.DataFrame(index=range(len(fives)))
    for label, values in (('transcript', transcripts),
                          ('chromosome', chroms),
                          ('5p splice site', fives),
                          ('branch site', branches),
                          ('depth', depths),
                          ('distance', distances),
                          ('strand', strands)):
        branch_df[label] = values

    # Keep only branches downstream of the 5' site in transcript orientation.
    branch_df = branch_df[branch_df['distance'] > 0]
    for label, source in (('genome coord', '5p splice site'),
                          ('branch coord', 'branch site')):
        branch_df[label] = branch_df['chromosome'].str.cat(
            branch_df[source].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    return find_3p_site(branch_df, gff3, organism=organism)
def find_polyA_sites(transcript_dict, window=220):
    """Build a window of +/- ``window`` around each transcript's top polyA site.

    Per-transcript signal is read with ``SP.read_CNAGsort_bedgraph2`` from a
    fixed bedgraph path. For every transcript with at least one positive
    value, the position with the highest value (first after a descending
    sort) is the polyA site. Transcripts whose strand is neither '+' nor '-'
    are skipped.

    Returns
    -------
    dict
        ``{transcript: [site - window, site + window, strand, chromosome]}``
    """
    polyA_bg = SP.read_CNAGsort_bedgraph2(
        '/home/jordan/GENOMES/POMBE/polyA_sites_CNAGsort.bedgraph',
        transcript_dict, organism='pombe')

    pA_dict = {}
    for tx, signal in polyA_bg.iteritems():
        positive = signal[signal > 0]
        if len(positive) == 0:
            continue
        strand = transcript_dict[tx][2]
        if strand != '+' and strand != '-':
            continue
        # Both strands use the same rule: take the strongest position.
        ranked = positive.sort_values(ascending=False)
        pA_site = ranked.index[0]
        pA_dict[tx] = [pA_site - window, pA_site + window,
                       strand, transcript_dict[tx][3]]
    return pA_dict
def build_transcript_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        expand=False,
        convert_chroms=False):
    """Wrap ``SP.build_transcript_dict`` for pombe with optional adjustments.

    Parameters
    ----------
    gff3 : str
        Annotation file.
    expand : bool
        When True, every transcript is widened by 300 on each side (start
        clamped at 0, end clamped at the chromosome length when the chromosome
        is one of I/II/III or chr1/chr2/chr3), and empty CDS start/end lists
        are replaced by ``[start]`` / ``[end]``.
    convert_chroms : bool
        When True, chromosome names are mapped chr1/chr2/chr3/MT -> I/II/III/MT
        (KeyError for any other name).

    Returns
    -------
    dict
        ``{transcript: [start, end, strand, chromosome, CDS starts, CDS ends]}``
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    if convert_chroms is True:
        lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}
        renamed = {}
        for tx, (start, end, strand, chrom,
                 cds_start, cds_end) in transcript_dict.items():
            renamed[tx] = [start, end, strand, lat_rom[chrom],
                           cds_start, cds_end]
        transcript_dict = renamed

    if expand is not True:
        return transcript_dict

    chrom_lengths = {
        'I': 5818680,
        'II': 4744158,
        'III': 2598968,
        'chr1': 5818680,
        'chr2': 4744158,
        'chr3': 2598968
    }
    expanded_dict = {}
    for tx, info in transcript_dict.items():
        new_start = max(info[0] - 300, 0)
        new_end = info[1] + 300
        if info[3] in chrom_lengths:
            new_end = min(new_end, chrom_lengths[info[3]])
        # Transcripts without CDS coordinates fall back to their own bounds;
        # the record is updated in place, as before.
        if len(info[4]) == 0:
            info[4] = [info[0]]
        if len(info[5]) == 0:
            info[5] = [info[1]]
        expanded_dict[tx] = [new_start, new_end, info[2], info[3],
                             info[4], info[5]]
    return expanded_dict
def add_cdf_to_plot(ax, value_lists, label_list, color_list, ks_list, log2=False):
    """Plot one cumulative distribution per value list onto ``ax``.

    Each list is optionally log2-transformed, stripped of values whose string
    form is ``inf``, ``-inf`` or ``nan``, and passed to ``SP.cdf_values``; the
    resulting curve is drawn with the matching colour and label. The x-limits
    are set to the 1st and 99th percentile of all plotted values together.
    When ``ks_list`` is given, its first two entries (and the next two when it
    has exactly four) are annotated as p-values at the right edge.

    Returns
    -------
    The same ``ax`` object.
    """
    non_finite = ('inf', '-inf', 'nan')
    pooled = []
    for n, raw_values in enumerate(value_lists):
        if log2 is True:
            raw_values = [np.log2(x) for x in raw_values]
        values = [x for x in raw_values if str(x) not in non_finite]
        pooled.extend(values)

        cumulative, base = SP.cdf_values(values)
        ax.plot(base[1:], cumulative, c=color_list[n], linewidth=3.0,
                label=label_list[n])

    xmin = np.percentile(pooled, 1)
    xmax = np.percentile(pooled, 99)
    ax.set_xlim([xmin, xmax])
    for axis in ('x', 'y'):
        ax.tick_params(axis=axis, labelsize=12)

    if ks_list is not None:
        text = "p-values: \n" + ks_list[0] + ' \n' + ks_list[1] + ' '
        if len(ks_list) == 4:
            text += ' \n' + ks_list[2] + ' \n' + ks_list[3] + ' '
        ax.annotate(text, xy=(xmax, 0.0), horizontalalignment='right',
                    fontsize=12)
    return ax
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Tabulate branch points per 5' splice site into a DataFrame.

    ``branch_dict`` maps a transcript to a list of records
    ``('<chrom>:<pos>', branch_positions, depths)``. A branch position is kept
    when it lies more than 5 and at most 1000 positions from the 5' site and
    its depth is at least 5. ``distance`` is signed by the transcript strand
    and only rows with a positive distance survive. ``genome coord`` and
    ``branch coord`` are ``'<chrom>:<position>'`` strings. The table is then
    passed through ``add_seq`` and ``find_3p_site``.

    Returns
    -------
    pandas.DataFrame
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    transcripts, chroms, fives, branches = [], [], [], []
    depths, distances, strands = [], [], []
    for tx, five_sites in branch_dict.items():
        for five_site in five_sites:
            coord_fields = five_site[0].split(':')
            chrom = coord_fields[0]
            pos = int(coord_fields[1])
            for n, branch in enumerate(five_site[1]):
                if not (5 < abs(branch - pos) <= 1000
                        and five_site[2][n] >= 5):
                    continue
                chroms.append(chrom)
                fives.append(pos)
                transcripts.append(tx)
                branches.append(branch)
                depths.append(five_site[2][n])
                strand = tx_dict[tx][2]
                strands.append(strand)
                if strand == '+':
                    distances.append(branch - pos)
                elif strand == '-':
                    distances.append(pos - branch)

    branch_df = pd.DataFrame(index=range(len(fives)))
    for label, values in (('transcript', transcripts),
                          ('chromosome', chroms),
                          ('5p splice site', fives),
                          ('branch site', branches),
                          ('depth', depths),
                          ('distance', distances),
                          ('strand', strands)):
        branch_df[label] = values

    # Keep only branches downstream of the 5' site in transcript orientation.
    branch_df = branch_df[branch_df['distance'] > 0]
    for label, source in (('genome coord', '5p splice site'),
                          ('branch coord', 'branch site')):
        branch_df[label] = branch_df['chromosome'].str.cat(
            branch_df[source].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    return find_3p_site(branch_df, gff3, organism=organism)
def by_pos_plots(df, metrics=['Intermediate Level', 'Precursor']):
    """Score splice-site bases position by position for changed introns.

    ``df`` has two-level columns. For each direction ('Up', 'Down') and each
    metric, the rows whose ``('All', '<metric> change')`` equals the direction
    are selected. When more than five rows qualify, their per-position base
    columns (second level containing 'Base 5' / 'Base 3') are concatenated
    into one string per row and handed to ``SP.position_wise_scores2`` with
    the organism fixed to ``'crypto'``. The number of 5' sequences and the
    metric/direction label are printed.
    """
    col5 = [col for col in df.columns if 'Base 5' in col[1]]
    col3 = [col for col in df.columns if 'Base 3' in col[1]]

    for direction in ('Up', 'Down'):
        for metric in metrics:
            subset = df[df[('All', metric + ' change')] == direction]
            if len(subset) <= 5:
                continue
            for n, column in enumerate(col5):
                s5 = subset[column] if n == 0 else s5.str.cat(subset[column])
            print(len(s5))
            for n, column in enumerate(col3):
                s3 = subset[column] if n == 0 else s3.str.cat(subset[column])
            print(metric + ' ' + direction)
            SP.position_wise_scores2(s5, s3, 'crypto')
def position_wise_scores(seq_5p, seq_3p, organism):
    """Look up a PSSM value for every base of every 5' and 3' sequence.

    The matrices come from ``generate_consensus_matrix(..., PSSM=True)`` for
    the files of ``organism``. Rows of each matrix are addressed by base
    (A=0, C=1, T=2, G=3) and columns by position. ``None`` entries in the
    input lists are dropped first; all sequences of one list are assumed to
    be as long as the first one.

    For each side a 2-row summary (per-position median, and half the
    max-min range) is printed but not returned.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-sequence, per-position values for the 5' and 3' sequences.
    """
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}

    def _fill_and_summarise(seqs, pssm, per_seq, summary):
        # Fill per_seq with PSSM lookups, then print the per-position summary.
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                per_seq[row, col] = pssm[base_dict[base], col]
        for col in range(len(summary[0])):
            column = per_seq[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column) - min(column)) / 2.
        print(summary)

    seq_5p = [s for s in seq_5p if s is not None]
    seq_3p = [s for s in seq_3p if s is not None]

    # All four arrays are allocated before any scoring, as in the original.
    score_5prime = np.empty([2, len(seq_5p[0])])
    score_3prime = np.empty([2, len(seq_3p[0])])
    all_5p = np.empty([len(seq_5p), len(seq_5p[0])])
    all_3p = np.empty([len(seq_3p), len(seq_3p[0])])

    _fill_and_summarise(seq_5p, PSSM_5p, all_5p, score_5prime)
    _fill_and_summarise(seq_3p, PSSM_3p, all_3p, score_3prime)
    return all_5p, all_3p
def position_wise_scores(seq_5p, seq_3p, organism):
    """Look up a PSSM value for every base of every 5' and 3' sequence.

    The matrices come from ``generate_consensus_matrix(..., PSSM=True)`` for
    the files of ``organism``. Rows of each matrix are addressed by base
    (A=0, C=1, T=2, G=3) and columns by position. ``None`` entries in the
    input lists are dropped first; all sequences of one list are assumed to
    be as long as the first one.

    For each side a 2-row summary (per-position median, and half the
    max-min range) is printed but not returned.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-sequence, per-position values for the 5' and 3' sequences.
    """
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}

    def _fill_and_summarise(seqs, pssm, per_seq, summary):
        # Fill per_seq with PSSM lookups, then print the per-position summary.
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                per_seq[row, col] = pssm[base_dict[base], col]
        for col in range(len(summary[0])):
            column = per_seq[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column) - min(column)) / 2.
        print(summary)

    seq_5p = [s for s in seq_5p if s is not None]
    seq_3p = [s for s in seq_3p if s is not None]

    # All four arrays are allocated before any scoring, as in the original.
    score_5prime = np.empty([2, len(seq_5p[0])])
    score_3prime = np.empty([2, len(seq_3p[0])])
    all_5p = np.empty([len(seq_5p), len(seq_5p[0])])
    all_3p = np.empty([len(seq_3p), len(seq_3p[0])])

    _fill_and_summarise(seq_5p, PSSM_5p, all_5p, score_5prime)
    _fill_and_summarise(seq_3p, PSSM_3p, all_3p, score_3prime)
    return all_5p, all_3p
def make_transcript_df(gff3):
    '''Tabulate every annotated transcript of a gff3 file.

    Parameters
    ----------
    gff3 : str
        Annotation file. If its name contains "pombe" (case-insensitive) the
        transcript dictionary is built with organism='pombe', otherwise with
        organism=None.

    Returns
    ------
    df : pandas.DataFrame
        One row per transcript, sorted by transcript name, with columns
        start, end, strand, chromosome, "CDS start" (smallest CDS start) and
        "CDS end" (largest CDS end); the CDS columns are NaN for transcripts
        without CDS coordinates.'''
    organism = 'pombe' if 'pombe' in gff3.lower() else None

    # Get transcript dictionary, ordered by transcript name
    ordered = sorted(SP.build_transcript_dict(gff3, organism=organism).items(),
                     key=lambda item: item[0])
    names = [name for name, _ in ordered]
    records = [record for _, record in ordered]

    # Convert to dataframe: one tuple per record field
    tx_df = pd.DataFrame(index=names,
                         columns=['start', 'end', 'strand', 'chromosome'])
    fields = list(zip(*records))
    for n, col in enumerate(tx_df.columns):
        tx_df.loc[:, col] = fields[n]

    # Add CDS starts and ends
    tx_df.loc[:, 'CDS start'] = [min(starts) if len(starts) > 0 else np.nan
                                 for starts in fields[4]]
    tx_df.loc[:, 'CDS end'] = [max(ends) if len(ends) > 0 else np.nan
                               for ends in fields[5]]
    return tx_df
def log2_Zscore_df(df, wt, mut, metrics=['Intermediate Level', 'Precursor'], Z=2, by_pos_scores=False):
    """Classify introns as 'Up', 'Down' or 'Other' in mutant vs. wild type.

    Parameters
    ----------
    df : pandas.DataFrame or str
        Quantitation table with two-level columns, or the path of a pickle
        holding one. A DataFrame argument is deep-copied, not modified.
    wt, mut : str
        First-level column names of the wild-type and mutant samples.
    metrics : list of str
        Second-level column names (substring match) to analyse.
    Z : number
        Z-score threshold for calling a change.
    by_pos_scores : bool
        When True, ``SP.by_pos_plots`` is called on the result.

    Returns
    -------
    pandas.DataFrame or None
        The annotated copy, or None when the number of wild-type and mutant
        replicate columns differs for a metric.

    Notes
    -----
    One figure per metric is shown with ``plt.show()``.
    ``s_log2`` and ``s_log2_ratio_Zscore`` are defined elsewhere - presumably
    a log2 transform and a Z-score of the log2 ratio; confirm.
    """
    if type(df) == str:
        new_df = pd.read_pickle(df)
    else:
        new_df = copy.deepcopy(df)
    print len(new_df)

    # Keep rows whose mutant columns ending in '-A' sum to at least 10.
    mutA = [x for x in new_df.columns if (x[0] == mut) and (x[1][-2:] == '-A')]
    #wtA = [x for x in new_df.columns if (x[0] == wt) and (x[1][-2:] == '-A')]
    new_df = new_df[new_df[mutA].sum(axis=1) >= 10]
    print len(new_df)

    for metric in metrics:
        # Replicate columns for this metric, excluding averaged columns.
        columns = [x for x in new_df.columns if (metric in x[1]) and ('avg' not in x[1])]
        wt_cols = [x for x in columns if (x[0] == wt) and ('avg' not in x[1])]
        mut_cols = [x for x in columns if (x[0] == mut) and ('avg' not in x[1])]
        for column in columns:
            new_df[(column[0], column[1]+' log2')] = s_log2(new_df[column])
        if len(wt_cols) != len(mut_cols):
            print "Number of WT reps must match number of mutant reps!"
            print wt_cols
            print mut_cols
            return None

        # Pool all replicates into one wt and one mutant Series; the index of
        # each replicate is suffixed with '-<replicate number>'.
        for n, wt_col in enumerate(wt_cols):
            new_df[('All',metric+' log2 ratio'+str(n+1))] = s_log2(new_df[mut_cols[n]]/new_df[wt_col])
            new_index = [x+'-'+str(n) for x in new_df.index]
            # NOTE(review): these assign a new index to the object returned by
            # new_df[...]; whether that object aliases new_df's stored column
            # depends on the pandas version - confirm.
            wt_s = new_df[wt_col]
            wt_s.index = new_index
            mut_s = new_df[mut_cols[n]]
            mut_s.index = new_index
            if n == 0:
                wt_s_for_Z = wt_s
                mut_s_for_Z = mut_s
            else:
                wt_s_for_Z = wt_s_for_Z.append(wt_s)
                mut_s_for_Z = mut_s_for_Z.append(mut_s)

        Zlist = s_log2_ratio_Zscore(wt_s_for_Z.dropna(), mut_s_for_Z.dropna())

        # An intron is 'Up' (or 'Down'/'Other') only if every replicate agrees.
        # Replicates are told apart by the last character of the index, and
        # x[:-2] strips the '-<n>' suffix, so a single-digit replicate number
        # is assumed.
        for n, wt_col in enumerate(wt_cols):
            n_up = Zlist[(Zlist.index.str[-1] == str(n)) & (Zlist >= Z)]
            n_up = [x[:-2] for x in n_up.index]
            n_down = Zlist[(Zlist.index.str[-1] == str(n)) & (Zlist <= -1*Z)]
            n_down = [x[:-2] for x in n_down.index]
            n_other = Zlist[(Zlist.index.str[-1] == str(n)) & (Zlist < Z) & (Zlist > -1*Z)]
            n_other = [x[:-2] for x in n_other.index]
            if n == 0:
                up = set(n_up)
                down = set(n_down)
                other = set(n_other)
            else:
                up = up.intersection(n_up)
                up = up.difference(n_down).difference(n_other)
                down = down.intersection(n_down)
                down = down.difference(n_up).difference(n_other)
                other = other.intersection(n_other)
                other = other.difference(n_up).difference(n_down)
        print len(up)
        print len(down)

        new_df[('All',metric+' change')] = None
        new_df.loc[up, ('All',metric+' change')] = 'Up'
        new_df.loc[down, ('All',metric+' change')] = 'Down'
        new_df.loc[other, ('All', metric+' change')] = 'Other'

        # Top row: wt vs. mutant scatter per replicate. The grid has two
        # columns, so ax[0][n] supports at most two replicates.
        # Bottom row: cumulative distributions of intron size and 5' score.
        plot_df = copy.deepcopy(new_df)
        fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(8,8))
        groups = {'Other':'0.8','Up':'tomato','Down':'cornflowerblue'}
        for group in ['Other','Up','Down']:
            gr_df = plot_df[plot_df[('All',metric+' change')] == group]
            # Groups with fewer than 15 introns are not plotted.
            if len(gr_df) >= 15:
                for n, wt_col in enumerate(wt_cols):
                    ax[0][n].scatter(s_log2(gr_df[wt_col]), s_log2(gr_df[mut_cols[n]]), color=groups[group], alpha=0.9, label=group, s=20)
                    ax[0][n].set_xlabel(wt_col[0]+' log2 '+metric, fontsize=12)
                    ax[0][n].set_ylabel(mut_cols[n][0]+' log2 '+metric, fontsize=12)
                    ax[0][n].set_title('Replicate '+str(n+1), fontsize=14)
                    ax[0][n], limits = SP.draw_diagonal(ax[0][n])
                    ax[0][n].legend(fontsize=12)
                sns.kdeplot(gr_df[('Peaks','intron size')], ax=ax[1][0], bw=2, cumulative=True, linewidth=3, color=groups[group], label=group)
                ax[1][0].set_xlim([30, 400])
                sns.kdeplot(gr_df[('Peaks','5p score')], ax=ax[1][1], bw=2, cumulative=True, linewidth=3, color=groups[group], label=group)
        ax[1][0].set_xlabel('Intron size (nt)')
        ax[1][0].set_ylabel('Fraction of introns')
        ax[1][1].set_xlabel('5prime splice site score')
        ax[1][1].set_ylabel('Fraction of introns')
        ax[1][1].set_xlim([np.percentile(plot_df[('Peaks','5p score')], 0.5), np.percentile(plot_df[('Peaks','5p score')], 99.9)+5])
        fig.tight_layout()
        plt.show()
        plt.clf()

    if by_pos_scores is True:
        SP.by_pos_plots(new_df, metrics=metrics)

    # A peak counts as "predicted" when its type contains 'prime'.
    new_df[('Peaks','predicted')] = True
    new_df.loc[~new_df[('Peaks','type')].str.contains('prime'), ('Peaks','predicted')] = False
    return new_df
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    """Build the intron table used for quantitation from a peak table.

    Peaks of type '3prime' and peaks whose 'looks like' value is 'AG' are
    dropped. For each remaining peak coordinate a 5' and a 3' site are
    determined, and the intron size, an alternative-3'-site flag, the
    splice-site sequences and their PSSM scores are recorded. The table is
    then passed through ``SP.backfill_splice_sites`` and
    ``SP.find_score_branches_ppy`` (the latter with a fixed branch file path).

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Reads the columns 'type', 'looks like', 'chromosome', 'position',
        'index', 'height', 'transcript' and 'strand'.
    gff3 : str
        Annotation file.
    fa_dict : dict
        ``{chromosome: sequence}``.
    organism : str, optional
        Forwarded to the ``SP`` helpers.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'genome coord' ('<chromosome>:<position>').
    """
    # count1/count2 (and seq5/seq3 below) are never read in this function.
    count1 = 0
    count2 = 0
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    quant_df = peak_df[(peak_df['type'] != '3prime')
                       & (peak_df['looks like'] != 'AG')]
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []
    seq5 = []
    seq3 = []

    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Of all peaks at this coordinate, use the tallest one.
            coord_df = coord_df.sort_values('height', ascending=False).ix[0]
            introns = ss_dict[coord_df['transcript']]
            if 'prime' in coord_df['type']:
                # Peak type contains 'prime': take both sites from the first
                # intron whose element 0 lies in [position - 5, position + 5).
                peak_range = range(coord_df['position'] - 5,
                                   coord_df['position'] + 5)
                for intron in introns:
                    if intron[0] in peak_range:
                        five_site = intron[0]
                        three_site = intron[1]
                        break
                if len(quant_df[(quant_df['transcript'] == coord_df['transcript'])
                                & (quant_df['type'] == 'AG')]) > 0:
                    alt3 = True
            else:
                # NOTE(review): `in` applied to a pandas Series tests the index
                # labels, not the values, so this condition may never be true
                # for the 'genome coord' index set above - confirm the intent
                # (possibly "any peak of type 'AG' in this transcript").
                if 'AG' in quant_df[quant_df['transcript'] ==
                                    coord_df['transcript']]['type']:
                    five_site = coord_df['position']
                    three_df = quant_df[
                        (quant_df['transcript'] == coord_df['transcript'])
                        & (quant_df['type'] == 'AG')]
                    three_df = three_df.sort_values('height', ascending=False)
                    three_site = three_df.ix[0]['position']

        if three_site is not None:
            new_index.append(coord)
            size = abs(three_site - five_site) / 1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)
            # 8 nt windows around each site; minus-strand windows are
            # reverse-complemented.
            if coord_df['strand'] == '+':
                s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
            elif coord_df['strand'] == '-':
                s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                s5 = SP.reverse_complement(s5)
                s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                s3 = SP.reverse_complement(s3)
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position', 'type'
    ]]
    # The column lists are assigned positionally, so they must be in the same
    # row order as the filtered table above.
    for column, data in column_dict.iteritems():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)

    #for n in range(len(new_quant_df['seq5'].iloc[0])):
    #    new_quant_df['Base 5-'+str(n)] = [x[n] for x in new_quant_df['seq5']]
    #for n in range(len(new_quant_df['seq3'].iloc[0])):
    #    new_quant_df['Base 3-'+str(n)] = [x[n] for x in new_quant_df['seq3']]
    #new_quant_df = new_quant_df.drop(['seq5','seq3'], axis=1)

    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)
    return new_quant_df
def main():
    '''Quantitate every sample listed in a config file against a peak table.

    Command line: <config file> <quantitation csv> <organism>

    Each config line is: bam_file,genotype,sample
    e.g. CM763-A_sorted.bam,WT,A1

    The csv must contain all expected peak columns; otherwise the expected and
    found columns are printed and the function returns None. Results for each
    genotype are joined onto the peak columns (first column level 'Peaks') and
    written to <prefix>_quant_df.csv / .pickle, where <prefix> is the config
    path up to '_config'. Scatter plots are then made with
    SP.SP_quant_scatters.'''
    config_path = sys.argv[1]

    # {genotype: {sample: bam file}}
    bam_dict = {}
    with open(config_path, 'r') as config:
        for line in config:
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()
            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = config_path.split('_config')[0]
    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = [
        '5p score', 'exon size (us)', 'exon size (ds)',
        'introns in transcript', 'type', 'transcript size', 'intron size',
        'chromosome', 'position', 'alt splicing', '3p score', 'transcript',
        'intron position', 'strand', 'peak'
    ]
    columns += ['Base 5-' + str(i) for i in range(8)]
    columns += ['Base 3-' + str(i) for i in range(8)]
    columns += ['branch score', 'branch to 3p distance', 'percent pPy']
    columns += ['branch-' + str(i) for i in range(5)]

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    final_df = copy.deepcopy(quant_df)
    final_df.columns = pd.MultiIndex.from_product([['Peaks'],
                                                   final_df.columns])

    for genotype, samples in bam_dict.items():
        # Whole cell extract samples count as present when more than one
        # sample name contains "W".
        W = len([sample for sample in samples if "W" in sample]) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        added = [col for col in new_df.columns if col not in columns]
        new_df = new_df[added]
        new_df.columns = pd.MultiIndex.from_product([[genotype],
                                                     new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix + '_quant_df.csv')
    final_df.to_pickle(prefix + '_quant_df.pickle')

    # NOTE(review): W here is the value from the last genotype processed.
    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    """Count reads per transcript and per intron in each BAM file.

    For every transcript in ``df['transcript']`` (suffixed with '.1' for pombe,
    'T0' otherwise) the reads overlapping the transcript are fetched. Only
    reads on the opposite strand flag are counted: reverse reads for '+'
    transcripts, forward reads for '-' transcripts. A counted read is also
    assigned to an intron when its ``reference_end`` ('+') or
    ``reference_start`` ('-') falls inside the intron interval derived from
    the row's ``position`` and ``intron size``.

    Parameters
    ----------
    bam_files : iterable of str
        BAM paths, opened with ``pysam.Samfile``.
    df : pandas.DataFrame
        Needs 'transcript', 'position' and 'intron size' columns.
    gff3 : str
        Annotation file.
    organism : str, optional
        'pombe' also maps chromosome names chr1/chr2/chr3 to I/II/III.

    Returns
    -------
    dict
        ``{bam path: DataFrame}`` indexed like ``df`` with columns 'total'
        (counted reads per kb of transcript) and 'intron' (intron read density
        divided by transcript read density). Both are NaN, and the row label
        is printed, when a division by zero occurs.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III'}

    bams = {}
    all_reads = {}
    try:
        for bam_file in bam_files:
            bams[bam_file] = pysam.Samfile(bam_file)

        for bam, reader in bams.items():
            all_reads[bam] = pd.DataFrame(index=df.index,
                                          columns=['total', 'intron'])
            for tx in set(df['transcript']):
                tx_df = df[df['transcript'] == tx]
                if organism == 'pombe':
                    tx = tx + '.1'
                else:
                    tx = tx + 'T0'
                (start, end, strand, CDS_start, CDS_end,
                 exons, chrom) = SP.tx_info(tx, tx_dict)
                if organism == 'pombe':
                    chrom = lat_rom[chrom]
                tx_iter = reader.fetch(chrom, start, end)

                # {row label: [interval start, interval end (exclusive), count]}
                # Bounds are compared directly; the previous code built a
                # range list per intron and scanned it for every read.
                intron_ranges = {}
                for ix, r in tx_df.iterrows():
                    if strand == '+':
                        intron_start = int(r['position'])
                        intron_end = int(r['position'] + r['intron size']) + 1
                    elif strand == '-':
                        intron_start = int(r['position'] - r['intron size'])
                        intron_end = int(r['position']) + 1
                    intron_ranges[ix] = [intron_start, intron_end, 0]

                reads = 0
                for read in tx_iter:
                    if read.is_reverse and strand == '+':
                        anchor = read.reference_end
                    elif not read.is_reverse and strand == '-':
                        anchor = read.reference_start
                    else:
                        continue
                    reads += 1
                    if anchor is None:
                        continue
                    for interval in intron_ranges.values():
                        if interval[0] <= anchor < interval[1]:
                            interval[2] += 1

                tx_length = float(end - start)
                for ix in intron_ranges:
                    try:
                        all_reads[bam].loc[ix, 'total'] = (
                            reads / tx_length * 1000)
                        all_reads[bam].loc[ix, 'intron'] = (
                            (intron_ranges[ix][2] /
                             float(tx_df.loc[ix, 'intron size'])) /
                            (reads / tx_length))
                    except ZeroDivisionError:
                        all_reads[bam].loc[ix, 'total'] = np.nan
                        all_reads[bam].loc[ix, 'intron'] = np.nan
                        print(ix)
    finally:
        # Release the BAM handles, which were previously left open.
        for reader in bams.values():
            reader.close()
    return all_reads
def log2_Zscore_df(df, wt, mut, metrics=None, Z=2, by_pos_scores=False):
    """Label rows as Up/Down/Other between two genotypes and plot them.

    For each metric the replicate columns of ``wt`` and ``mut`` are paired
    in order, log2 columns and log2 ratio columns are added, and the pooled
    replicate values are passed to ``s_log2_ratio_Zscore``.  A row is
    labelled 'Up', 'Down' or 'Other' in ``('All', metric + ' change')``
    only when every replicate places it in that same class for the
    threshold ``Z``.  One 2x2 figure is shown per metric.

    Parameters
    ----------
    df : pandas.DataFrame or str
        Frame with two-level (tuple) column labels and string row labels,
        or a path handed to ``pd.read_pickle``.
    wt, mut :
        First-level column labels of the two genotypes being compared.
    metrics : list of str, optional
        Substrings matched against second-level column labels.  Defaults
        to ``['Intermediate Level', 'Precursor']``.
    Z : number
        Threshold applied to the values returned by
        ``s_log2_ratio_Zscore``.
    by_pos_scores : bool
        When ``True``, ``SP.by_pos_plots`` is also called.

    Returns
    -------
    pandas.DataFrame or None
        The annotated copy, or ``None`` (after printing a message) when a
        metric has no replicate columns or the replicate counts differ.

    Notes
    -----
    Replicates are tagged with a single trailing character and drawn on a
    two-column axes grid, so at most two replicate pairs can be plotted.
    """
    if metrics is None:
        metrics = ['Intermediate Level', 'Precursor']

    if isinstance(df, str):
        # NOTE: read_pickle unpickles the file; only load trusted paths.
        new_df = pd.read_pickle(df)
    else:
        new_df = copy.deepcopy(df)
    print(len(new_df))

    # Keep rows whose mutant '-A' columns sum to at least 10.
    mutA = [x for x in new_df.columns
            if (x[0] == mut) and (x[1][-2:] == '-A')]
    new_df = new_df[new_df[mutA].sum(axis=1) >= 10]
    print(len(new_df))

    for metric in metrics:
        columns = [x for x in new_df.columns
                   if (metric in x[1]) and ('avg' not in x[1])]
        wt_cols = [x for x in columns if x[0] == wt]
        mut_cols = [x for x in columns if x[0] == mut]

        for column in columns:
            new_df[(column[0], column[1] + ' log2')] = s_log2(new_df[column])

        if len(wt_cols) != len(mut_cols):
            print("Number of WT reps must match number of mutant reps!")
            print(wt_cols)
            print(mut_cols)
            return None
        if not wt_cols:
            # Without replicates nothing below would be (re)bound, so stale
            # results from the previous metric could be reused.
            print("No replicate columns found for metric " + metric)
            return None

        # Pool all replicates, tagging each row label with '-<replicate>'.
        wt_parts = []
        mut_parts = []
        for n, wt_col in enumerate(wt_cols):
            mut_col = mut_cols[n]
            new_df[('All', metric + ' log2 ratio' + str(n + 1))] = s_log2(
                new_df[mut_col] / new_df[wt_col])
            new_index = [x + '-' + str(n) for x in new_df.index]
            # Relabel copies so the frame's own columns keep their index.
            wt_s = new_df[wt_col].copy()
            wt_s.index = new_index
            mut_s = new_df[mut_col].copy()
            mut_s.index = new_index
            wt_parts.append(wt_s)
            mut_parts.append(mut_s)
        wt_s_for_Z = pd.concat(wt_parts)
        mut_s_for_Z = pd.concat(mut_parts)

        Zlist = s_log2_ratio_Zscore(wt_s_for_Z.dropna(),
                                    mut_s_for_Z.dropna())

        # A row keeps a class only if all replicates agree on it.
        for n in range(len(wt_cols)):
            in_rep = Zlist.index.str[-1] == str(n)
            n_up = [x[:-2] for x in Zlist[in_rep & (Zlist >= Z)].index]
            n_down = [x[:-2]
                      for x in Zlist[in_rep & (Zlist <= -1 * Z)].index]
            n_other = [
                x[:-2]
                for x in Zlist[in_rep & (Zlist < Z) & (Zlist > -1 * Z)].index
            ]
            if n == 0:
                up = set(n_up)
                down = set(n_down)
                other = set(n_other)
            else:
                up = up.intersection(n_up)
                up = up.difference(n_down).difference(n_other)
                down = down.intersection(n_down)
                down = down.difference(n_up).difference(n_other)
                other = other.intersection(n_other)
                other = other.difference(n_up).difference(n_down)
        print(len(up))
        print(len(down))

        change_col = ('All', metric + ' change')
        new_df[change_col] = None
        # Lists, not sets: newer pandas rejects set indexers in .loc.
        new_df.loc[list(up), change_col] = 'Up'
        new_df.loc[list(down), change_col] = 'Down'
        new_df.loc[list(other), change_col] = 'Other'

        plot_df = copy.deepcopy(new_df)
        fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(8, 8))
        groups = {'Other': '0.8', 'Up': 'tomato', 'Down': 'cornflowerblue'}
        # NOTE(review): the nesting of this plotting section was rebuilt
        # from a whitespace-collapsed source; confirm that the KDE calls
        # belong under the group-size check.
        for group in ['Other', 'Up', 'Down']:
            gr_df = plot_df[plot_df[change_col] == group]
            if len(gr_df) >= 15:
                for n, wt_col in enumerate(wt_cols):
                    ax[0][n].scatter(s_log2(gr_df[wt_col]),
                                     s_log2(gr_df[mut_cols[n]]),
                                     color=groups[group], alpha=0.9,
                                     label=group, s=20)
                    ax[0][n].set_xlabel(wt_col[0] + ' log2 ' + metric,
                                        fontsize=12)
                    ax[0][n].set_ylabel(mut_cols[n][0] + ' log2 ' + metric,
                                        fontsize=12)
                    ax[0][n].set_title('Replicate ' + str(n + 1),
                                       fontsize=14)
                    ax[0][n], _ = SP.draw_diagonal(ax[0][n])
                    ax[0][n].legend(fontsize=12)
                sns.kdeplot(gr_df[('Peaks', 'intron size')], ax=ax[1][0],
                            bw=2, cumulative=True, linewidth=3,
                            color=groups[group], label=group)
                ax[1][0].set_xlim([30, 400])
                sns.kdeplot(gr_df[('Peaks', '5p score')], ax=ax[1][1],
                            bw=2, cumulative=True, linewidth=3,
                            color=groups[group], label=group)

        ax[1][0].set_xlabel('Intron size (nt)')
        ax[1][0].set_ylabel('Fraction of introns')
        ax[1][1].set_xlabel('5prime splice site score')
        ax[1][1].set_ylabel('Fraction of introns')
        ax[1][1].set_xlim([
            np.percentile(plot_df[('Peaks', '5p score')], 0.5),
            np.percentile(plot_df[('Peaks', '5p score')], 99.9) + 5
        ])
        fig.tight_layout()
        plt.show()
        plt.clf()

    if by_pos_scores is True:
        SP.by_pos_plots(new_df, metrics=metrics)

    # Flag rows whose ('Peaks', 'type') value contains 'prime'.
    new_df[('Peaks', 'predicted')] = True
    new_df.loc[~new_df[('Peaks', 'type')].str.contains('prime'),
               ('Peaks', 'predicted')] = False
    return new_df