def write_intron_fasta(transcript_dict, fasta_dict, prefix='introns', sense=True):
    """Write the sequence lying between consecutive CDS segments as FASTA.

    Each ``transcript_dict`` value is read positionally: ``[2]`` strand,
    ``[3]`` chromosome, ``[4]`` list of CDS starts, ``[5]`` list of CDS ends.
    A transcript with ``k`` CDS segments produces ``k - 1`` records named
    ``<transcript>_<n>``.  For '-' transcripts the lists are walked from
    their last element and the slice is reverse complemented.  With
    ``sense=False`` every record is reverse complemented once more.

    The records are written to ``<prefix>.fa`` and also returned as a
    ``{name: sequence}`` dict.
    """
    records = {}
    for tx_name, info in transcript_dict.items():
        strand, chrom = info[2], info[3]
        cds_starts, cds_ends = info[4], info[5]
        n_segments = len(cds_starts)
        for n in range(n_segments - 1):
            if strand == '+':
                seq = fasta_dict[chrom][cds_ends[n]:cds_starts[n + 1] - 1]
            elif strand == '-':
                # Count segments down from the end of the lists.
                k = n_segments - n - 1
                seq = fasta_dict[chrom][cds_ends[k]:cds_starts[k - 1] - 1]
                seq = SP.reverse_complement(seq)
            if sense is False:
                seq = SP.reverse_complement(seq)
            records[tx_name + '_' + str(n)] = seq

    with open('{}.fa'.format(prefix), 'w') as handle:
        for name, seq in records.items():
            handle.write('>' + name + '\n')
            handle.write(seq + '\n')
    return records
def add_seq(branch_df, fa_dict):
    """Attach 5' splice site and branch sequences to ``branch_df``.

    For every row a 16-nt window centred on ``'5p splice site'`` and one
    centred on ``'branch site'`` is cut from ``fa_dict[row['chromosome']]``
    (reverse complemented for '-' rows).  Each window is then trimmed:

    * 5' site: if ``GT`` starts inside ``window[4:11]`` the result is the
      8 nt from two bases before that ``GT``; otherwise ``window[4:12]``.
    * branch: the first of ``AG``, ``AA``, ``GA`` (in that priority) that
      starts inside ``window[4:11]`` anchors the 8 nt from four bases before
      it; otherwise ``window[4:13]`` is kept.

    The columns ``'5p seq'`` and ``'Branch seq'`` are added to the frame that
    was passed in.  The returned frame contains only rows whose branch
    sequence has one of the three dinucleotides at positions 4-5.
    """
    receptors = ['AG', 'AA', 'GA']
    five_seqs = []
    branch_seqs = []
    for _, row in branch_df.iterrows():
        chrom_seq = fa_dict[row['chromosome']]
        five_pos = row['5p splice site']
        branch_pos = row['branch site']
        five = chrom_seq[five_pos - 8:five_pos + 8]
        branch = chrom_seq[branch_pos - 8:branch_pos + 8]
        if row['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # Locate the motif inside the same [4:11] window that is tested.
        # Searching the whole string could hit an earlier copy (index < 4),
        # which made the slice start negative and the result empty/wrong.
        gt_at = five.find('GT', 4, 11)
        if gt_at != -1:
            five = five[gt_at - 2:gt_at + 6]
        else:
            five = five[4:12]

        for motif in receptors:
            hit = branch.find(motif, 4, 11)
            if hit != -1:
                branch = branch[hit - 4:hit + 4]
                break
        else:
            # No receptor dinucleotide in the window; this row is dropped by
            # the filter below unless positions 4-5 happen to match.
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect 8-nt windows around every splice site listed for ``gff3``.

    Splice sites come from ``SP.list_splice_sites`` collapsed with
    ``SP.collapse_ss_dict``.  Strand and chromosome are looked up in
    ``SP.build_transcript_dict`` under the key ``<transcript>.1`` when
    ``organism == 'pombe'`` and ``<transcript>T0`` otherwise.  Windows taken
    from '-' transcripts are reverse complemented.

    Returns ``(all_seq5, all_seq3)``: two parallel lists of 5' and 3' site
    windows, one entry per intron.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    splice_sites, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(splice_sites)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.items():
        tx_info = transcript_dict[transcript + suffix]
        strand, chrom = tx_info[2], tx_info[3]
        for intron in introns:
            five_site, three_site = intron[0], intron[1]

            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
            all_seq5.append(seq5)

            if strand == '+':
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def add_seq(branch_df, fa_dict):
    """Attach 5' splice site and branch sequences to ``branch_df``.

    For every row a 16-nt window centred on ``'5p splice site'`` and one
    centred on ``'branch site'`` is cut from ``fa_dict[row['chromosome']]``
    (reverse complemented for '-' rows).  Each window is then trimmed:

    * 5' site: if ``GT`` starts inside ``window[4:11]`` the result is the
      8 nt from two bases before that ``GT``; otherwise ``window[4:12]``.
    * branch: the first of ``AG``, ``AA``, ``GA`` (in that priority) that
      starts inside ``window[4:11]`` anchors the 8 nt from four bases before
      it; otherwise ``window[4:13]`` is kept.

    The columns ``'5p seq'`` and ``'Branch seq'`` are added to the frame that
    was passed in.  The returned frame contains only rows whose branch
    sequence has one of the three dinucleotides at positions 4-5.
    """
    receptors = ['AG', 'AA', 'GA']
    five_seqs = []
    branch_seqs = []
    for _, row in branch_df.iterrows():
        chrom_seq = fa_dict[row['chromosome']]
        five_pos = row['5p splice site']
        branch_pos = row['branch site']
        five = chrom_seq[five_pos - 8:five_pos + 8]
        branch = chrom_seq[branch_pos - 8:branch_pos + 8]
        if row['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # Locate the motif inside the same [4:11] window that is tested.
        # Searching the whole string could hit an earlier copy (index < 4),
        # which made the slice start negative and the result empty/wrong.
        gt_at = five.find('GT', 4, 11)
        if gt_at != -1:
            five = five[gt_at - 2:gt_at + 6]
        else:
            five = five[4:12]

        for motif in receptors:
            hit = branch.find(motif, 4, 11)
            if hit != -1:
                branch = branch[hit - 4:hit + 4]
                break
        else:
            # No receptor dinucleotide in the window; this row is dropped by
            # the filter below unless positions 4-5 happen to match.
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect 8-nt windows around every splice site listed for ``gff3``.

    Splice sites come from ``SP.list_splice_sites`` collapsed with
    ``SP.collapse_ss_dict``.  Strand and chromosome are looked up in
    ``SP.build_transcript_dict`` under the key ``<transcript>.1`` when
    ``organism == 'pombe'`` and ``<transcript>T0`` otherwise.  Windows taken
    from '-' transcripts are reverse complemented.

    Returns ``(all_seq5, all_seq3)``: two parallel lists of 5' and 3' site
    windows, one entry per intron.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    splice_sites, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(splice_sites)

    suffix = '.1' if organism == 'pombe' else 'T0'
    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.items():
        tx_info = transcript_dict[transcript + suffix]
        strand, chrom = tx_info[2], tx_info[3]
        for intron in introns:
            five_site, three_site = intron[0], intron[1]

            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
            all_seq5.append(seq5)

            if strand == '+':
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def write_intergenic_fasta(transcript_dict, fasta_dict, bps_us=0, bps_ds=0,
                           all_intergenic=True, prefix='intergenic_transcripts'):
    """Write sequence flanking or lying between transcripts to ``<prefix>.fa``.

    ``transcript_dict`` values are read positionally: ``[0]`` start,
    ``[1]`` end, ``[2]`` strand, ``[3]`` chromosome.

    * ``all_intergenic is False``: for each transcript emit ``bps_us`` bases
      upstream and/or ``bps_ds`` bases downstream (relative to the
      transcript's strand), each as a ``_sense`` and an ``_antisense``
      record.  Nothing is emitted for a side whose length is 0.
    * ``all_intergenic is True``: for each chromosome in ``fasta_dict`` order
      its transcripts by start and emit the gap between each neighbouring
      pair as ``<tx>_<next>_plus`` and its reverse complement as
      ``<tx>_<next>_minus``.  Pairs without a gap are reported on stdout.

    Returns the ``{record name: sequence}`` dict that was written.
    """
    seq_dict = {}
    if all_intergenic is False:
        for transcript, values in transcript_dict.items():
            start, end, strand, chrom = values[0], values[1], values[2], values[3]
            if strand not in ('+', '-'):
                # Without a strand there is no upstream/downstream; skipping
                # avoids reusing the previous transcript's sequence.
                continue
            if bps_us > 0:
                if strand == '+':
                    # Clamp at 0: a negative slice start would wrap around to
                    # the end of the chromosome string.
                    seq_us_sense = fasta_dict[chrom][max(0, start - bps_us):start]
                else:
                    seq_us_sense = SP.reverse_complement(
                        fasta_dict[chrom][end:end + bps_us])
                seq_dict[transcript + '_us_sense'] = seq_us_sense
                seq_dict[transcript + '_us_antisense'] = SP.reverse_complement(seq_us_sense)
            if bps_ds > 0:
                if strand == '+':
                    seq_ds_sense = fasta_dict[chrom][end:bps_ds + end]
                else:
                    seq_ds_sense = SP.reverse_complement(
                        fasta_dict[chrom][max(0, start - bps_ds):start])
                seq_dict[transcript + '_ds_sense'] = seq_ds_sense
                seq_dict[transcript + '_ds_antisense'] = SP.reverse_complement(seq_ds_sense)
    elif all_intergenic is True:
        for chrom in fasta_dict:
            on_chrom = {tx: values for tx, values in transcript_dict.items()
                        if values[3] == chrom}
            # sorted() copes with chromosomes that carry no transcripts,
            # which previously crashed when sorting an empty DataFrame.
            ordered = sorted(on_chrom, key=lambda tx: on_chrom[tx][0])
            for transcript, next_transcript in zip(ordered, ordered[1:]):
                transcript_end = on_chrom[transcript][1]
                next_start = on_chrom[next_transcript][0]
                if next_start > transcript_end:
                    seq_plus = fasta_dict[chrom][transcript_end:next_start]
                    pair = transcript + '_' + next_transcript
                    seq_dict[pair + '_plus'] = seq_plus
                    seq_dict[pair + '_minus'] = SP.reverse_complement(seq_plus)
                else:
                    print('Overlapping transcripts:')
                    print(transcript)
                    print(next_transcript)

    with open('{}.fa'.format(prefix), 'w') as fout:
        for name, seq in seq_dict.items():
            fout.write('>' + name + '\n')
            fout.write(seq + '\n')
    return seq_dict
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Cut a 20-nt window around each coordinate and label splice-site-like ones.

    ``coord_dict`` maps a transcript to a pair of coordinate collections.
    Windows from the first collection are labelled ``"5'"`` when positions
    10-11 read ``GT`` or ``GC``; windows from the second are labelled
    ``"3'"`` when positions 8-9 read ``AG``.  Anything else is ``'other'``.
    Windows for '-' transcripts are reverse complemented before the check.

    ``organism`` is set to ``'pombe'`` when that word occurs in
    ``gff3_file``.  ``fasta_file`` may be a path (handed to
    ``make_fasta_dict``) or an already loaded mapping.

    Returns ``{transcript: [(sequence, label), ...]}``.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # (index into coord_sets, window slice to inspect, accepted motifs, label)
    site_specs = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.items():
        labelled = seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]
        for set_ix, motif_window, motifs, label in site_specs:
            for coord in coord_sets[set_ix]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])
                seq_type = label if sequence[motif_window] in motifs else 'other'
                labelled.append((sequence, seq_type))
    return seq_dict
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Cut a 20-nt window around each coordinate and label splice-site-like ones.

    ``coord_dict`` maps a transcript to a pair of coordinate collections.
    Windows from the first collection are labelled ``"5'"`` when positions
    10-11 read ``GT`` or ``GC``; windows from the second are labelled
    ``"3'"`` when positions 8-9 read ``AG``.  Anything else is ``'other'``.
    Windows for '-' transcripts are reverse complemented before the check.

    ``organism`` is set to ``'pombe'`` when that word occurs in
    ``gff3_file``.  ``fasta_file`` may be a path (handed to
    ``make_fasta_dict``) or an already loaded mapping.

    Returns ``{transcript: [(sequence, label), ...]}``.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # (index into coord_sets, window slice to inspect, accepted motifs, label)
    site_specs = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.items():
        labelled = seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]
        for set_ix, motif_window, motifs, label in site_specs:
            for coord in coord_sets[set_ix]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])
                seq_type = label if sequence[motif_window] in motifs else 'other'
                labelled.append((sequence, seq_type))
    return seq_dict
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Return ``fasta_dict[chrom][start:end + 1]``, reverse complemented on '-'.

    ``fasta_dict`` may be the mapping itself or the path of a JSON file that
    holds it; a path is loaded with ``json.load`` on every call.
    """
    if type(fasta_dict) is str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)
    region = fasta_dict[chrom][start:end + 1]
    if strand == '-':
        return SP.reverse_complement(region)
    return region
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Return ``fasta_dict[chrom][start:end + 1]``, reverse complemented on '-'.

    ``fasta_dict`` may be the mapping itself or the path of a JSON file that
    holds it; a path is loaded with ``json.load`` on every call.
    """
    if type(fasta_dict) is str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)
    region = fasta_dict[chrom][start:end + 1]
    if strand == '-':
        return SP.reverse_complement(region)
    return region
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Score the two 15-nt windows preceding annotated and observed 3' sites.

    For each row the second element of ``'annotated intron coords'`` and of
    ``'junction coords'`` is used as the 3' coordinate.  The 15 nt directly
    before it and the 15 nt before those (read on the row's strand) are
    passed to ``percent_py``.  Rows whose annotated coordinate is ``None``
    get NaN for both annotated scores.

    Four columns are added to ``df`` in place and ``df`` is returned:
    ``'Py score annotated -15:0'``, ``'Py score annotated -30:-15'``,
    ``'Py score alternative -15:0'`` and ``'Py score alternative -30:-15'``.

    ``alt_column_name`` and ``from_branches`` are accepted but not used.
    """
    annotated_near, annotated_far = [], []
    alternative_near, alternative_far = [], []
    for _, row in df.iterrows():
        strand = row['strand']
        chrom = row['chromosome']
        coord = row['annotated intron coords'][1]
        alt_coord = row['junction coords'][1]
        has_annotation = coord is not None

        if strand == '+':
            if has_annotation:
                seq1 = fa_dict[chrom][coord - 15:coord]
                seq2 = fa_dict[chrom][coord - 30:coord - 15]
            alt1 = fa_dict[chrom][alt_coord - 15:alt_coord]
            alt2 = fa_dict[chrom][alt_coord - 30:alt_coord - 15]
        if strand == '-':
            if has_annotation:
                seq1 = SP.reverse_complement(fa_dict[chrom][coord:coord + 15])
                seq2 = SP.reverse_complement(fa_dict[chrom][coord + 15:coord + 30])
            alt1 = SP.reverse_complement(fa_dict[chrom][alt_coord:alt_coord + 15])
            alt2 = SP.reverse_complement(fa_dict[chrom][alt_coord + 15:alt_coord + 30])

        alternative_near.append(percent_py(alt1))
        alternative_far.append(percent_py(alt2))
        if has_annotation:
            annotated_near.append(percent_py(seq1))
            annotated_far.append(percent_py(seq2))
        else:
            annotated_near.append(np.nan)
            annotated_far.append(np.nan)

    df['Py score annotated -15:0'] = annotated_near
    df['Py score annotated -30:-15'] = annotated_far
    df['Py score alternative -15:0'] = alternative_near
    df['Py score alternative -30:-15'] = alternative_far
    return df
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Score the two 15-nt windows preceding annotated and observed 3' sites.

    For each row the second element of ``'annotated intron coords'`` and of
    ``'junction coords'`` is used as the 3' coordinate.  The 15 nt directly
    before it and the 15 nt before those (read on the row's strand) are
    passed to ``percent_py``.  Rows whose annotated coordinate is ``None``
    get NaN for both annotated scores.

    Four columns are added to ``df`` in place and ``df`` is returned:
    ``'Py score annotated -15:0'``, ``'Py score annotated -30:-15'``,
    ``'Py score alternative -15:0'`` and ``'Py score alternative -30:-15'``.

    ``alt_column_name`` and ``from_branches`` are accepted but not used.
    """
    annotated_near, annotated_far = [], []
    alternative_near, alternative_far = [], []
    for _, row in df.iterrows():
        strand = row['strand']
        chrom = row['chromosome']
        coord = row['annotated intron coords'][1]
        alt_coord = row['junction coords'][1]
        has_annotation = coord is not None

        if strand == '+':
            if has_annotation:
                seq1 = fa_dict[chrom][coord - 15:coord]
                seq2 = fa_dict[chrom][coord - 30:coord - 15]
            alt1 = fa_dict[chrom][alt_coord - 15:alt_coord]
            alt2 = fa_dict[chrom][alt_coord - 30:alt_coord - 15]
        if strand == '-':
            if has_annotation:
                seq1 = SP.reverse_complement(fa_dict[chrom][coord:coord + 15])
                seq2 = SP.reverse_complement(fa_dict[chrom][coord + 15:coord + 30])
            alt1 = SP.reverse_complement(fa_dict[chrom][alt_coord:alt_coord + 15])
            alt2 = SP.reverse_complement(fa_dict[chrom][alt_coord + 15:alt_coord + 30])

        alternative_near.append(percent_py(alt1))
        alternative_far.append(percent_py(alt2))
        if has_annotation:
            annotated_near.append(percent_py(seq1))
            annotated_far.append(percent_py(seq2))
        else:
            annotated_near.append(np.nan)
            annotated_far.append(np.nan)

    df['Py score annotated -15:0'] = annotated_near
    df['Py score annotated -30:-15'] = annotated_far
    df['Py score alternative -15:0'] = alternative_near
    df['Py score alternative -30:-15'] = alternative_far
    return df
def write_transcript_fasta(transcript_dict, fasta_dict, prefix='transcripts', sense=True, spliced=False):
    """Write one FASTA record per transcript to ``<prefix>.fa``.

    ``transcript_dict`` values are read positionally: ``[0]`` start,
    ``[1]`` end, ``[2]`` strand, ``[3]`` chromosome, ``[4]`` CDS starts,
    ``[5]`` CDS ends.

    * ``spliced is False``: the record is ``chromosome[start - 1:end]``,
      reverse complemented for '-' transcripts.
    * ``spliced is True``: the record is the concatenation, in list order,
      of ``chromosome[cds_start - 1:cds_end]`` for every CDS segment, each
      segment reverse complemented individually for '-' transcripts.

    ``sense=False`` reverse complements the finished record.  Returns the
    ``{transcript: sequence}`` dict that was written.
    """
    records = {}
    for tx_name, info in transcript_dict.items():
        start, end, strand, chrom = info[0], info[1], info[2], info[3]
        cds_starts, cds_ends = info[4], info[5]

        if spliced is False:
            seq = fasta_dict[chrom][start - 1:end]
            if strand == '-':
                seq = SP.reverse_complement(seq)
        elif spliced is True:
            segments = []
            for n in range(len(cds_starts)):
                if strand == '+':
                    segments.append(fasta_dict[chrom][cds_starts[n] - 1:cds_ends[n]])
                elif strand == '-':
                    segment = fasta_dict[chrom][cds_starts[n] - 1:cds_ends[n]]
                    segments.append(SP.reverse_complement(segment))
            seq = ''.join(segments)

        if sense is False:
            seq = SP.reverse_complement(seq)
        records[tx_name] = seq

    with open('{}.fa'.format(prefix), 'w') as handle:
        for name, seq in records.items():
            handle.write('>' + name + '\n')
            handle.write(seq + '\n')
    return records
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Return the 15 nt that follow each 5' site, keyed by transcript and position.

    The splice-site source is chosen in this order:

    1. ``ss_dict`` if given;
    2. ``SP.build_junction_dict`` on ``junction_bed``;
    3. ``peak_df`` rows whose ``'type'`` does not contain ``'prime'``, each
       turned into ``(position, position +/- 50)`` by strand;
    4. ``SP.list_splice_sites`` collapsed with ``SP.collapse_ss_dict``.

    ``fasta_file`` may be a dict, a path ending in ``json`` or any other
    path (handed to ``make_fasta_dict``).  Unless ``junction_bed`` was given,
    ``.1`` (pombe) or ``T0`` is appended to each transcript name before it is
    looked up.  Keys of the result look like ``<transcript>-<chrom>:<pos>``;
    '-' strand sequences are reverse complemented.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, row in peak_df.iterrows():
                sites = ss_dict.setdefault(row['transcript'], [])
                if row['strand'] == '+':
                    sites.append((row['position'], row['position'] + 50))
                elif row['strand'] == '-':
                    sites.append((row['position'], row['position'] - 50))
        else:
            ss_dict, _ = SP.list_splice_sites(gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.items():
        if junction_bed is None:
            transcript = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in list(introns):
            five_site = intron[0]
            if strand == '+':
                seq_dict[transcript + '-' + chrom + ':' + str(five_site + 1)] = \
                    fasta_dict[chrom][five_site + 2:five_site + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][five_site - 16:five_site - 1]
                seq_dict[transcript + '-' + chrom + ':' + str(five_site)] = \
                    SP.reverse_complement(seq)
    return seq_dict
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Build one row per annotated intron for every transcript present in ``df``.

    For each transcript in ``df['transcript']`` all introns returned by
    ``SP.list_splice_sites`` / ``SP.collapse_ss_dict`` are visited in
    transcript order (ascending 5' coordinate on '+', descending on '-').
    An intron whose 5' site lies within ``range(peak-5, peak+5)`` of a
    ``df['position']`` value copies its scores, type, sequences and
    'alt splicing' flag from that row of ``df`` (looked up by the label
    ``'<chromosome>:<position>'``) and gets ``peak = True``.  Any other
    intron is filled in from the genome: 8-nt windows around both sites are
    cut from ``fa_dict`` and scored with ``SP.simple_score_junction``,
    ``type`` is ``'5prime'`` and ``peak = False``.

    'transcript size' and both 'exon size' columns are divided by 1000;
    'intron size' is not.

    Returns a new DataFrame indexed by ``'<chromosome>:<position>'``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    # Every list below must receive exactly one value per intron visited.
    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[], 'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[], 'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]}
    new_index = []
    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order introns 5'->3' along the transcript.
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0], reverse=True)
        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak)-5,int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break
            column_dict['transcript'].append(tx)
            # Key used in tx_dict: '<tx>.1' for pombe, '<tx>T0' otherwise.
            if organism == 'pombe':
                iso = tx+'.1'
            else:
                iso = tx+'T0'
            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)
            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        # A negative length means the next listed intron starts
                        # before this one ends; fall back to the one after it,
                        # or NaN when there is none.
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        # Single intron: downstream exon runs to the transcript end.
                        ds_length = (tx_dict[iso][1] - three_site)/1000.
                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                column_dict['exon size (ds)'].append(ds_length)
            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)
            if in_df is True:
                # Reuse the values already computed for the matching peak.
                # NOTE(review): relies on df being indexed by '<chrom>:<int position>' -- confirm.
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])
            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)
                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]
                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)
                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)
    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data
    return new_df
def _best_branch_match(intron_seq, branch_dict, PSSM):
    """Return ``(index, sequence, score)`` of the best branch motif in an intron.

    Every motif of ``branch_dict`` that occurs in ``intron_seq`` is a
    candidate at its first occurrence; the highest score wins and ties go to
    the motif listed first.  Without any candidate, the 5-mer around the last
    'A' that is at least three bases before the intron end is scored instead.
    """
    matches = [(intron_seq.index(branch), branch, score)
               for branch, score in branch_dict.items()
               if branch in intron_seq]
    if matches:
        return max(matches, key=lambda match: match[2])
    # Find the closest A
    best_ix = intron_seq[:-3].rfind('A')
    seq = intron_seq[best_ix - 3:best_ix + 2]
    return (best_ix, seq, score_branch(seq, PSSM))


def find_score_branches_ppy(quant_df, peak_branch_df, fa_dict):
    """Add branch score, branch-to-3' distance and downstream pyrimidine content.

    ``peak_branch_df`` is either the path of a text file with one branch
    sequence per line, or a DataFrame of observed branches (columns used:
    ``'Branch seq'``, ``'genome coord'``, ``'depth'``, ``'branch site'``,
    ``'5p splice site'``, ``'Branch to 3p distance'``).

    Branch 5-mers seen more than once are scored with ``score_branch`` and
    searched for, most frequent first, in every intron of ``quant_df``
    (intron = ``'intron size'`` bases from ``'position'`` on the row's
    strand).  When a DataFrame is given and it holds a branch whose
    ``'genome coord'`` equals the row label, the deepest such branch is used
    instead of the motif search.

    Columns added to ``quant_df`` in place: ``'branch score'``,
    ``'branch to 3p distance'`` (motif hits: bases / 1000), ``'percent pPy'``
    and ``'branch-0'`` .. ``'branch-4'`` (one base each; ``N`` when no full
    5-mer was found).  Progress counts are printed.  Returns ``quant_df``.
    """
    PSSM = branch_PSSM(peak_branch_df, fa_dict)
    from_file = isinstance(peak_branch_df, str)
    if from_file:
        with open(peak_branch_df) as f:
            branches = [line.strip() for line in f]
    else:
        branches = list(peak_branch_df['Branch seq'].str[2:7])

    # Sort branches by abundance so that the most common ones are first in
    # the search; motifs seen only once are ignored.
    branch_dict = collections.OrderedDict()
    for branch, count in collections.Counter(branches).most_common():
        if count > 1:
            branch_dict[branch] = score_branch(branch, PSSM)

    branch_3_dist = []
    branch_score = []
    branch_seqs = []
    perc_py = []
    for ix, r in quant_df.iterrows():
        position = r['position']
        size = r['intron size']
        if r['strand'] == '+':
            intron_seq = fa_dict[r['chromosome']][int(position):int(position + size)]
        elif r['strand'] == '-':
            intron_seq = fa_dict[r['chromosome']][int(position - size - 1):int(position - 1)]
            intron_seq = SP.reverse_complement(intron_seq)

        observed = None
        if not from_file:
            # Compare against the column values; ``ix in series`` would test
            # the index labels of peak_branch_df instead.
            hits = peak_branch_df[peak_branch_df['genome coord'] == ix]
            if len(hits) > 0:
                observed = hits.sort_values('depth', ascending=False).iloc[0]

        if observed is not None:
            # Offset of the branch inside the intron, measured from the 5' site.
            branch_ix = int(abs(observed['5p splice site'] - observed['branch site']))
            seq = observed['Branch seq'][2:7]
            branch_seqs.append(seq)
            branch_score.append(score_branch(seq, PSSM))
            branch_3_dist.append(observed['Branch to 3p distance'])
            downstream = intron_seq[branch_ix + 5:]
            if 'N' in downstream:
                print(ix)
                print(intron_seq)
            perc_py.append(percent_py(downstream))
            continue

        best_branch = _best_branch_match(intron_seq, branch_dict, PSSM)
        branch_3_dist.append((len(intron_seq) - best_branch[0] - 4) / 1000.)
        branch_score.append(best_branch[2])
        branch_seqs.append(best_branch[1])
        if len(intron_seq) - best_branch[0] - 5 > 1:
            downstream = intron_seq[best_branch[0] + 5:]
            if 'N' in downstream:
                print(ix)
                print(intron_seq)
            perc_py.append(percent_py(downstream))
        else:
            # Too little sequence after the branch to score.
            perc_py.append(np.nan)

    print(len(quant_df))
    print(len(branch_score))
    quant_df['branch score'] = branch_score
    quant_df['branch to 3p distance'] = branch_3_dist
    quant_df['percent pPy'] = perc_py
    branch_seqs = ['NNNNN' if len(x) < 5 else x for x in branch_seqs]
    print(len(branch_seqs))
    print(len(quant_df))
    if branch_seqs:
        for n in range(len(branch_seqs[0])):
            quant_df['branch-' + str(n)] = [x[n] for x in branch_seqs]
    print(str(len(quant_df) - len(quant_df['branch score'].dropna()))
          + ' introns without identifiable branches')
    return quant_df
def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    """Tabulate junctions from ``junction_bed`` with their splice-site sequences.

    Junctions are obtained from ``build_junction_dict``; each one becomes a
    row whose first cell (``'intron tuple'``) is the dictionary key and whose
    remaining ten cells are the junction's own values.  For every row 8-nt
    windows around ``start`` and ``end`` (and around the annotated intron
    boundaries when present) are cut from ``fasta`` on the row's strand.
    Rows with a strand other than '+' or '-' get ``'NNNNNNNN'`` for all four
    sequences.

    ``fasta`` may be a path (loaded with ``SP.make_fasta_dict``) or a mapping
    of chromosome to sequence.

    Added columns: ``sequence1``, ``sequence2``, ``seq type1``, ``seq type2``,
    ``annotated sequence1``, ``annotated sequence2`` and ``transcript`` (the
    first element of ``'intron tuple'``).
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    junction_count = sum(len(junctions) for junctions in junction_dict.values())
    # Pre-allocated object columns keep ``None`` cells as ``None`` (tested below).
    junction_df = pd.DataFrame(index=range(junction_count), columns=[
        'intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
        'type', 'size', 'annotated intron size', 'annotated intron start',
        'annotated intron end'
    ])
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            # .loc replaces the removed .ix; labels are the integers 0..count-1.
            junction_df.loc[n] = [tx] + junction
            n += 1

    unknown = 'NNNNNNNN'
    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        ann_start = row['annotated intron start']
        ann_end = row['annotated intron end']
        if row['strand'] == '+':
            curr1 = fasta[chrom][(row['start'] - 1):(row['start'] + 7)]
            curr2 = fasta[chrom][(row['end'] - 5):(row['end'] + 3)]
            if ann_start is None:
                ann1 = ann2 = None
            else:
                ann1 = fasta[chrom][(ann_start - 1):(ann_start + 7)]
                ann2 = fasta[chrom][(ann_end - 5):(ann_end + 3)]
        elif row['strand'] == '-':
            curr1 = SP.reverse_complement(fasta[chrom][(row['start'] - 6):(row['start'] + 2)])
            curr2 = SP.reverse_complement(fasta[chrom][(row['end'] - 2):(row['end'] + 6)])
            if ann_start is None:
                ann1 = ann2 = None
            else:
                ann1 = SP.reverse_complement(fasta[chrom][ann_start - 6:ann_start + 2])
                ann2 = SP.reverse_complement(fasta[chrom][ann_end - 2:ann_end + 6])
        else:
            # Define curr1/curr2 too, so the type columns below never pick up
            # the previous row's sequence (or fail on the very first row).
            curr1 = curr2 = unknown
            ann1 = ann2 = unknown
        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx
    return junc_seq_df
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build 4x8 position matrices for the 5' and 3' splice sites in ``gff3``.

    Rows are A, C, T, G (in that order); columns are the eight positions of
    the window cut around each site, read on the transcript's strand.

    * ``PSSM is False``: each cell is the base count divided by the number
      of introns.
    * ``PSSM is True``: zero counts are raised to 1, then each cell becomes
      ``log2((count / introns) / nuc_prob[row])`` with ``nuc_prob`` taken
      from ``gc_content(fasta_dict)``.

    The annotation is treated as pombe when ``'pombe'`` occurs in the
    lower-cased ``gff3`` name.  NumPy's global float print format is set to
    one decimal as a side effect.

    Returns ``(pos_matrix_5prime, pos_matrix_3prime)``.
    """
    # Populate gene dictionary and splice sites
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, _ = SP.list_splice_sites(gff3, organism='pombe')
    else:
        organism = None
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, _ = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    row_of = {"A": 0, "C": 1, "T": 2, "G": 3}
    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])
    suffix = '.1' if organism == 'pombe' else 'T0'
    n_introns = 0
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        for intron in introns:
            n_introns += 1
            if strand == '+':
                seq = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
            elif strand == '-':
                seq = SP.reverse_complement(fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
            for col, base in enumerate(seq):
                pos_matrix_5prime[row_of[base], col] += 1

            if strand == '+':
                seq = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq = SP.reverse_complement(fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])
            for col, base in enumerate(seq):
                pos_matrix_3prime[row_of[base], col] += 1

    np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

    total = float(n_introns)
    for a in range(4):
        for b in range(8):
            if PSSM is False:
                pos_matrix_5prime[a, b] = pos_matrix_5prime[a, b] / total
                pos_matrix_3prime[a, b] = pos_matrix_3prime[a, b] / total
            if PSSM is True:
                # Pseudocount so that log2 never sees zero.
                if pos_matrix_5prime[a, b] == 0:
                    pos_matrix_5prime[a, b] += 1
                if pos_matrix_3prime[a, b] == 0:
                    pos_matrix_3prime[a, b] += 1
                pos_matrix_5prime[a, b] = np.log2((pos_matrix_5prime[a, b] / total) / nuc_prob[a])
                pos_matrix_3prime[a, b] = np.log2((pos_matrix_3prime[a, b] / total) / nuc_prob[a])
    return (pos_matrix_5prime, pos_matrix_3prime)
def get_junction_sequence(df, gff3_file, fasta_file):
    """Annotate junction rows with gene, flanking sequences and intron position.

    ``df`` must provide the columns ``chr``, ``coord_1``, ``coord_2``,
    ``strand`` (all read with ``.strip()``) and ``contained in``.  The frame
    is sorted by ``chr`` and gains the columns ``Gene``, ``intron``
    (``'First'``, ``'Middle'``, ``'Last'`` or ``'Only'``), ``sequence1``,
    ``sequence2`` and ``intron sequence``.

    ``fasta_file`` may be a path (handed to ``make_fasta_dict``) or a mapping
    of chromosome to sequence.  Rows whose ``contained in`` is an empty
    string are removed, and the index is reset before returning.
    """
    df = df.sort_values('chr', axis=0)
    # transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)
    # splice_dict[transcript] = [[5' sites], [3' sites]]  (not used below)
    splice_dict, flag = SP.list_splice_sites(gff3_file)
    # fasta_dict[chr] = sequence
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file
    # Group transcript names by chromosome for the per-row lookup below.
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.iteritems():
        chromosome = coords[3]
        if chromosome in transcript_by_chr:
            transcript_by_chr[chromosome].append(transcript)
        else:
            transcript_by_chr[chromosome] = []
            transcript_by_chr[chromosome].append(transcript)
    # Defaults, overwritten row by row.
    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'
    n = 0
    # NOTE(review): df was re-sorted above without resetting the index, and
    # df[col][n] / df.loc[n, ...] look rows up by label -- presumably the
    # caller passes a 0..len-1 integer index; confirm.
    for n in range(len(df)):
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()
        transcripts = transcript_by_chr[chrom]
        # The last same-strand transcript that spans the junction wins.
        for transcript in transcripts:
            tx_strand = transcript_dict[transcript][2]
            start = transcript_dict[transcript][0]
            stop = transcript_dict[transcript][1]
            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n,'Gene'] = transcript
        # sequence1 is taken at the junction's 5' end and sequence2 at its 3'
        # end on the row's strand, so the two coordinates swap roles on '-'.
        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1-3):(coord1+5)]
            sequence2 = fasta_dict[chrom][(coord2-6):(coord2+2)]
            all_seq = fasta_dict[chrom][(coord1-1):coord2]
        elif strand == '-':
            sequence1 = fasta_dict[chrom][(coord2-6):(coord2+2)]
            sequence1 = SP.reverse_complement(sequence1)
            sequence2 = fasta_dict[chrom][(coord1-3):(coord1+5)]
            sequence2 = SP.reverse_complement(sequence2)
            all_seq = fasta_dict[chrom][(coord1-1):coord2]
            all_seq = SP.reverse_complement(all_seq)
        df.loc[n,'sequence1'] = sequence1
        df.loc[n,'sequence2'] = sequence2
        df.loc[n,'intron sequence'] = all_seq
    # NOTE(review): at this point `transcripts` holds only the transcripts of
    # the chromosome of the last row processed above -- confirm that labelling
    # introns for just that chromosome is intended.
    for transcript in transcripts:
        if transcript in df['Gene'].tolist():
            tx_df = df[df['Gene'] == transcript]
            s = tx_df['coord_1']
            min_idx = s.idxmin()
            first = int(s.min())
            max_idx = s.idxmax()
            last = int(s.max())
            if first == last:
                df.loc[min_idx,'intron'] = 'Only'
            else:
                # Lowest coordinate is the first intron on '+', the last on '-'.
                if transcript_dict[transcript][2] == '+':
                    df.loc[min_idx,'intron'] = 'First'
                    df.loc[max_idx,'intron'] = 'Last'
                elif transcript_dict[transcript][2] == '-':
                    df.loc[min_idx,'intron'] = 'Last'
                    df.loc[max_idx,'intron'] = 'First'
            # Junctions within 10 of the outermost ones get the same label.
            # NOTE(review): coord_1 is treated as a string earlier (.strip());
            # if it is still a string here these range tests cannot match and
            # min()/max() compare lexicographically -- confirm the dtype.
            for index, coord_1 in s.iteritems():
                if df['intron'][index] == 'Middle':
                    if coord_1 in range(first-10, first+10):
                        df_idx = s[s == coord_1].index[0]
                        if transcript_dict[transcript][2] == '+':
                            df.loc[df_idx, 'intron'] = 'First'
                        elif transcript_dict[transcript][2] == '-':
                            df.loc[df_idx, 'intron'] = 'Last'
                    elif coord_1 in range(last-10, last+10):
                        df_idx = s[s == coord_1].index[0]
                        if transcript_dict[transcript][2] == '+':
                            df.loc[df_idx, 'intron'] = 'Last'
                        elif transcript_dict[transcript][2] == '-':
                            df.loc[df_idx, 'intron'] = 'First'
    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a per-intron table (size, sequences, scores) from called peaks.

    Peaks whose 'type' is '3prime' or whose 'looks like' is 'AG' are
    dropped. Each remaining peak is paired with a 3' site in one of two
    ways:

    * 'type' contains 'prime': the first annotated intron of the peak's
      transcript whose 5' site lies in [position - 5, position + 5);
    * otherwise: the tallest peak of type 'AG' on the same transcript.

    Peaks without a 3' site are skipped. The table is then completed with
    SP.backfill_splice_sites and SP.find_score_branches_ppy.

    Args:
        peak_df: DataFrame with the columns 'type', 'looks like',
            'chromosome', 'position', 'transcript', 'strand', 'height' and
            'index' (the latter is dropped).
        gff3: annotation passed through to the SP helpers.
        fa_dict: mapping from chromosome name to sequence string.
        organism: forwarded to the SP helpers.
        branch_file: branch list handed to SP.find_score_branches_ppy.
            The default is the path that used to be hard-coded here.

    Returns:
        Whatever SP.find_score_branches_ppy returns for the backfilled
        table.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    # Copy so the column added below is never written into a view of peak_df.
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Several peaks may share a coordinate; the tallest represents it.
        # The index holds 'chrom:pos' strings, so the positional .iloc[0]
        # is what the former .ix[0] resolved to.
        peak = quant_df[quant_df.index == coord].sort_values(
            'height', ascending=False).iloc[0]
        same_tx = quant_df[quant_df['transcript'] == peak['transcript']]
        ag_peaks = same_tx[same_tx['type'] == 'AG']
        introns = ss_dict[peak['transcript']]

        five_site = None
        three_site = None
        alt3 = False
        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The previous test, `'AG' in <Series>`, looked at the index
            # labels instead of the values and therefore never matched.
            five_site = peak['position']
            three_site = ag_peaks.sort_values(
                'height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        chrom_seq = fa_dict[peak['chromosome']]
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)

        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict,
                                            pssm, organism=organism)
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)
    return new_quant_df
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch calls into one intron table.

    Args:
        junc_df: junction DataFrame indexed by genome coordinate, with the
            columns 'type', 'looks like', 'annotated intron size',
            'junction size', 'junction sequence1', 'junction sequence2',
            'annotated 3p score', 'annotated 5p score', 'genome coord',
            'transcript', 'chromosome', 'position' and 'strand'.
        branch_df: branch DataFrame with the columns 'genome coord',
            'depth', 'transcript', 'chromosome', '5p splice site',
            '3p splice site', 'branch site', 'strand', 'type',
            'intron size' and '5p seq'.
        gff3: annotation passed to the SP helpers.
        fa_dict: mapping from chromosome name to sequence string.
        organism: forwarded to backfill_splice_sites.

    Returns:
        (new_quant_df, lariat_df): the backfilled intron table with one
        'Base 5-n' / 'Base 3-n' column per sequence position, and the
        junction rows whose 'type' is '3prime' or whose 'looks like' is
        'AG', de-duplicated on 'genome coord'.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime') &
                       (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    # list(...) keeps the set's iteration order while avoiding passing an
    # unordered set as an index.
    new_quant_df = pd.DataFrame(index=list(set(quant_df.index)),
                                columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # More than one junction from the same peak counts as alternative
        # splicing.
        alt_splice.append(len(coord_df) > 1)

        # The index holds coordinate labels, so the positional .iloc[0] is
        # what the former .ix[0] resolved to.
        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'],
                                              pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])
        seq5.append(top['junction sequence1'])
        seq3.append(top['junction sequence2'])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True, left_index=True)

    # Add branch-supported sites that have no junction row.
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort picks the LOWEST depth as "best";
        # find_score_branches_ppy sorts descending -- confirm intent.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {
            # Drop the two-character isoform suffix.
            'transcript': best['transcript'][:-2],
            'chromosome': best['chromosome'],
            'position': best['5p splice site'],
            'strand': best['strand'],
            'type': best['type'],
            'intron size': best['intron size'],
            'alt splicing': len(coord_df) > 1,
            '5p score': np.nan,
            '3p score': np.nan,
            'seq5': '',
            'seq3': ''
        }

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five_site = int(best['5p splice site'])
            if best['strand'] == '+':
                coord_dict['seq5'] = chrom_seq[(five_site - 1):(five_site + 7)]
            elif best['strand'] == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five_site - 6):(five_site + 2)])

        if str(best['3p splice site']) != 'nan':
            three_site = best['3p splice site']
        else:
            # No 3' site recorded: search the 100 nt after the branch for
            # an AG trinucleotide, in the listed order of preference.
            if best['strand'] == '+':
                after_branch = chrom_seq[
                    best['branch site']:best['branch site'] + 100]
            elif best['strand'] == '-':
                after_branch = SP.reverse_complement(chrom_seq[
                    best['branch site'] - 100:best['branch site']])
            else:
                continue

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                hit = after_branch.find(subs)
                if hit != -1:
                    offset = hit + 3
                    break
            if offset is None:
                # Previously this reused a stale offset from an earlier
                # site (or raised NameError); without a candidate the
                # site cannot be placed, so it is left out.
                continue

            if best['strand'] == '-':
                three_site = best['branch site'] - offset
            else:
                three_site = best['branch site'] + offset
            # NOTE(review): recomputing the size only for inferred 3' sites
            # reflects the reconstructed nesting -- confirm upstream.
            coord_dict['intron size'] = abs(coord_dict['position'] -
                                            three_site)

        if best['strand'] == '+':
            coord_dict['seq3'] = chrom_seq[int(three_site - 5):
                                           int(three_site) + 3]
        elif best['strand'] == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict['3p score'] = \
            SP.simple_score_junction(coord_dict['seq5'], coord_dict['seq3'],
                                     pssm)

        coord_s = pd.Series(coord_dict, name=coord)
        new_quant_df = new_quant_df.append(coord_s)

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                         organism=organism)

    # One column per sequence position, then drop the raw strings.
    for n in range(len(new_quant_df['seq5'].iloc[0])):
        new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
    for n in range(len(new_quant_df['seq3'].iloc[0])):
        new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') |
                        (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(
        ['genome coord', 'annotated intron size'], ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[[
        'transcript', 'chromosome', 'position', 'strand', 'type'
    ]]

    return new_quant_df, lariat_df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Expand a peak table to every annotated intron of its transcripts.

    For each transcript named in ``df`` all annotated introns are listed in
    transcript order (5' sites ascending on '+', descending on '-'). An
    intron whose 5' site lies in [peak - 5, peak + 5) of one of the
    transcript's peaks copies that peak's scores, sequences, type and
    alt-splicing flag. Any other intron is scored from the genome sequence
    and marked ``peak=False``.

    Args:
        df: DataFrame indexed by 'chrom:position' with the columns
            'transcript', 'strand', 'chromosome', 'position', '3p score',
            '5p score', 'alt splicing', 'type', 'seq5' and 'seq3'.
        gff3: annotation passed to the SP helpers.
        fa_dict: mapping from chromosome name to sequence string.
        PSSM: scoring matrices handed to SP.simple_score_junction.
        organism: 'pombe' selects the '.1' isoform suffix, anything else
            'T0'; also forwarded to the SP helpers.

    Returns:
        A new DataFrame with one row per intron, indexed by
        'chrom:position'. Exon and transcript sizes are divided by 1000;
        'intron size' is left undivided.

    Raises:
        ValueError: if a transcript's strand is neither '+' nor '-'.
            Such rows previously produced misaligned columns or a
            NameError.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []
    iso_suffix = '.1' if organism == 'pombe' else 'T0'

    for tx in set(df['transcript']):
        # Per-transcript lookups, hoisted out of the intron loop.
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        tx_peaks = tx_rows['position']
        if strand not in ('+', '-'):
            raise ValueError('unsupported strand %r for transcript %s'
                             % (strand, tx))

        splice_sites = sorted(list(ss_dict[tx]), key=lambda x: x[0],
                              reverse=(strand == '-'))
        n_introns = len(splice_sites)
        iso = tx + iso_suffix

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            tx_start = tx_dict[iso][0]
            tx_end = tx_dict[iso][1]

            # Check if already in dataframe
            in_df = False
            for peak in tx_peaks:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_introns)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_end - tx_start) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    us_length = (five_site - tx_start) / 1000.
                else:
                    us_length = (tx_end - five_site) / 1000.

                if n_introns > 1:
                    if strand == '+':
                        ds_length = (splice_sites[n + 1][0] - three_site) / 1000.
                    else:
                        ds_length = (three_site - splice_sites[n + 1][0]) / 1000.
                    # A negative length means the next intron starts inside
                    # this one; fall back to the intron after it, or NaN
                    # when there is none.
                    try:
                        if ds_length < 0:
                            if strand == '+':
                                ds_length = (splice_sites[n + 2][0] -
                                             three_site) / 1000.
                            else:
                                ds_length = (three_site -
                                             splice_sites[n + 2][0]) / 1000.
                    except IndexError:
                        ds_length = np.nan
                elif strand == '+':
                    ds_length = (tx_end - three_site) / 1000.
                else:
                    ds_length = (three_site - tx_start) / 1000.
            elif n == n_introns - 1:
                column_dict['intron position'].append('Last')
                us_length = (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.
                if strand == '+':
                    ds_length = (tx_end - three_site) / 1000.
                else:
                    ds_length = (three_site - tx_start) / 1000.
            else:
                column_dict['intron position'].append('Middle')
                us_length = (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.
                ds_length = abs(three_site - splice_sites[n + 1][0]) / 1000.
            column_dict['exon size (us)'].append(us_length)
            column_dict['exon size (ds)'].append(ds_length)

            if in_df:
                # Reuse what was measured for the matching peak.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index, '3p score'])
                column_dict['5p score'].append(df.loc[peak_index, '5p score'])
                column_dict['alt splicing'].append(
                    df.loc[peak_index, 'alt splicing'])
                column_dict['type'].append(df.loc[peak_index, 'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index, 'seq5'])
                column_dict['seq3'].append(df.loc[peak_index, 'seq3'])
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    position = five_site + 1
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site + 3)]
                else:
                    position = five_site - 1
                    sequence1 = SP.reverse_complement(
                        fa_dict[chrom][(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        fa_dict[chrom][(three_site - 2):(three_site + 6)])
                column_dict['position'].append(position)
                new_index.append(chrom + ':' + str(position))
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=list(column_dict), index=new_index)
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df
def _best_branch_match(intron_seq, branch_dict, PSSM):
    """Return (offset, sequence, score) of the best branch candidate.

    Every known branch sequence present in ``intron_seq`` is a candidate at
    its first occurrence; the highest score wins and ties keep the order of
    ``branch_dict``. Without any candidate, the 5-mer around the last 'A'
    before the final three bases is scored instead.
    """
    matches = [(intron_seq.index(branch), branch, score)
               for branch, score in branch_dict.items()
               if branch in intron_seq]
    if matches:
        return max(matches, key=lambda m: m[2])

    # Find the closest A
    best_ix = intron_seq[:-3].rfind('A')
    seq = intron_seq[best_ix - 3:best_ix + 2]
    return (best_ix, seq, score_branch(seq, PSSM))


def find_score_branches_ppy(quant_df, peak_branch_df, fa_dict):
    """Add branch-point and polypyrimidine columns to ``quant_df``.

    Args:
        quant_df: intron table with the columns 'strand', 'chromosome',
            'position' and 'intron size'. It is modified in place.
        peak_branch_df: either a path (str) to a text file with one branch
            sequence per line, or a DataFrame with the columns
            'Branch seq', 'genome coord', 'depth', 'branch site',
            '5p splice site' and 'Branch to 3p distance'.
        fa_dict: mapping from chromosome name to sequence string.

    Returns:
        ``quant_df`` with 'branch score', 'branch to 3p distance',
        'percent pPy' and one 'branch-n' column per branch position.

    Fixes over the previous version: rows are read with
    ``.iloc[0][column]`` (``.iloc[0, 'column']`` is invalid); the measured
    branch offset is used as a scalar; the measured-branch lookup compares
    'genome coord' values rather than index labels; and an empty branch
    list or empty ``quant_df`` no longer raises.
    """
    PSSM = branch_PSSM(peak_branch_df, fa_dict)
    from_file = isinstance(peak_branch_df, str)

    if from_file:
        with open(peak_branch_df) as f:
            branches = [line.strip() for line in f]
    else:
        branches = list(peak_branch_df['Branch seq'].str[2:7])

    # Sort branches by abundance so that the most common ones are first in
    # the search; sequences seen only once are ignored.
    counts = collections.Counter(branches)
    br_abund = sorted(((branch, count) for branch, count in counts.items()
                       if count > 1),
                      key=lambda x: x[1], reverse=True)
    branch_dict = collections.OrderedDict()
    for branch, _ in br_abund:
        branch_dict[branch] = score_branch(branch, PSSM)

    branch_3_dist = []
    branch_score = []
    branch_seqs = []
    perc_py = []

    for ix, r in quant_df.iterrows():
        position = r['position']
        size = r['intron size']
        if r['strand'] == '+':
            intron_seq = fa_dict[r['chromosome']][
                int(position):int(position + size)]
        elif r['strand'] == '-':
            intron_seq = fa_dict[r['chromosome']][
                int(position - size - 1):int(position - 1)]
            intron_seq = SP.reverse_complement(intron_seq)

        measured = None
        if not from_file:
            hits = peak_branch_df[peak_branch_df['genome coord'] == ix]
            if len(hits) > 0:
                measured = hits.sort_values('depth', ascending=False).iloc[0]

        if measured is not None:
            # A branch was observed for this intron: use the deepest one.
            offset = int(abs(measured['5p splice site'] -
                             measured['branch site']))
            seq = measured['Branch seq'][2:7]
            branch_seqs.append(seq)
            branch_score.append(score_branch(seq, PSSM))
            branch_3_dist.append(measured['Branch to 3p distance'])
            # NOTE(review): the +5 skip mirrors the sequence-search path,
            # where the offset is the start of the 5-mer -- confirm it is
            # also right for an offset measured to the branch site.
            tail = intron_seq[offset + 5:]
            if 'N' in tail:
                print(ix)
                print(intron_seq)
            perc_py.append(percent_py(tail))
        else:
            offset, seq, score = _best_branch_match(intron_seq, branch_dict,
                                                    PSSM)
            branch_3_dist.append((len(intron_seq) - offset - 4) / 1000.)
            branch_score.append(score)
            branch_seqs.append(seq)
            if len(intron_seq) - offset - 5 > 1:
                tail = intron_seq[offset + 5:]
                if 'N' in tail:
                    print(ix)
                    print(intron_seq)
                perc_py.append(percent_py(tail))
            else:
                perc_py.append(np.nan)

    print(len(quant_df))
    print(len(branch_score))

    quant_df['branch score'] = branch_score
    quant_df['branch to 3p distance'] = branch_3_dist
    quant_df['percent pPy'] = perc_py

    # Truncated sequences become all-N so every position can be indexed.
    branch_seqs = ['NNNNN' if len(x) < 5 else x for x in branch_seqs]
    print(len(branch_seqs))
    print(len(quant_df))

    if branch_seqs:
        for n in range(len(branch_seqs[0])):
            quant_df['branch-' + str(n)] = [x[n] for x in branch_seqs]

    n_missing = len(quant_df) - len(quant_df['branch score'].dropna())
    print(str(n_missing) + ' introns without identifiable branches')
    return quant_df
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None,
                       gene_list=None, peak_df=None, organism=None):
    """Collect the 15-nt sequence just inside each intron's 5' end.

    The splice-site source is chosen in this order: an explicit
    ``ss_dict``; junctions from ``junction_bed``; pseudo-introns of length
    50 built from the non-'prime' rows of ``peak_df``; otherwise the
    annotated splice sites of ``gff3_file``.

    Args:
        gff3_file: annotation passed to the SP helpers.
        fasta_file: a dict (or dict subclass) from chromosome name to
            sequence, a path ending in 'json' holding such a dict, or any
            other path, which is loaded with make_fasta_dict.
        ss_dict: optional mapping from transcript to (5' site, 3' site)
            pairs.
        junction_bed: optional junction file for SP.build_junction_dict.
            When given, transcript names are used without adding an
            isoform suffix.
        gene_list: forwarded to SP.list_splice_sites.
        peak_df: optional DataFrame with the columns 'type', 'transcript',
            'strand' and 'position'.
        organism: 'pombe' selects the '.1' isoform suffix, anything else
            'T0'; also forwarded to the SP helpers.

    Returns:
        dict mapping '<transcript>-<chrom>:<coordinate>' to the sequence,
        reverse-complemented on the '-' strand.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # isinstance also accepts dict subclasses such as OrderedDict, which the
    # former exact type test sent down the path branch.
    if isinstance(fasta_file, dict):
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.items():
        if junction_bed is None:
            transcript = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]

        for intron in introns:
            five_site = intron[0]
            if strand == '+':
                key = transcript + '-' + chrom + ':' + str(five_site + 1)
                seq_dict[key] = fasta_dict[chrom][five_site + 2:five_site + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][five_site - 16:five_site - 1]
                key = transcript + '-' + chrom + ':' + str(five_site)
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch calls into one intron table.

    Args:
        junc_df: junction DataFrame indexed by genome coordinate, with the
            columns 'type', 'looks like', 'annotated intron size',
            'junction size', 'junction sequence1', 'junction sequence2',
            'annotated 3p score', 'annotated 5p score', 'genome coord',
            'transcript', 'chromosome', 'position' and 'strand'.
        branch_df: branch DataFrame with the columns 'genome coord',
            'depth', 'transcript', 'chromosome', '5p splice site',
            '3p splice site', 'branch site', 'strand', 'type',
            'intron size' and '5p seq'.
        gff3: annotation passed to the SP helpers.
        fa_dict: mapping from chromosome name to sequence string.
        organism: forwarded to backfill_splice_sites.

    Returns:
        (new_quant_df, lariat_df): the backfilled intron table with one
        'Base 5-n' / 'Base 3-n' column per sequence position, and the
        junction rows whose 'type' is '3prime' or whose 'looks like' is
        'AG', de-duplicated on 'genome coord'.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime') &
                       (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    # list(...) keeps the set's iteration order while avoiding passing an
    # unordered set as an index.
    new_quant_df = pd.DataFrame(index=list(set(quant_df.index)),
                                columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # More than one junction from the same peak counts as alternative
        # splicing.
        alt_splice.append(len(coord_df) > 1)

        # The index holds coordinate labels, so the positional .iloc[0] is
        # what the former .ix[0] resolved to.
        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'],
                                              pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])
        seq5.append(top['junction sequence1'])
        seq3.append(top['junction sequence2'])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True, left_index=True)

    # Add branch-supported sites that have no junction row.
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort picks the LOWEST depth as "best";
        # find_score_branches_ppy sorts descending -- confirm intent.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {
            # Drop the two-character isoform suffix.
            'transcript': best['transcript'][:-2],
            'chromosome': best['chromosome'],
            'position': best['5p splice site'],
            'strand': best['strand'],
            'type': best['type'],
            'intron size': best['intron size'],
            'alt splicing': len(coord_df) > 1,
            '5p score': np.nan,
            '3p score': np.nan,
            'seq5': '',
            'seq3': ''
        }

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five_site = int(best['5p splice site'])
            if best['strand'] == '+':
                coord_dict['seq5'] = chrom_seq[(five_site - 1):(five_site + 7)]
            elif best['strand'] == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five_site - 6):(five_site + 2)])

        if str(best['3p splice site']) != 'nan':
            three_site = best['3p splice site']
        else:
            # No 3' site recorded: search the 100 nt after the branch for
            # an AG trinucleotide, in the listed order of preference.
            if best['strand'] == '+':
                after_branch = chrom_seq[
                    best['branch site']:best['branch site'] + 100]
            elif best['strand'] == '-':
                after_branch = SP.reverse_complement(chrom_seq[
                    best['branch site'] - 100:best['branch site']])
            else:
                continue

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                hit = after_branch.find(subs)
                if hit != -1:
                    offset = hit + 3
                    break
            if offset is None:
                # Previously this reused a stale offset from an earlier
                # site (or raised NameError); without a candidate the
                # site cannot be placed, so it is left out.
                continue

            if best['strand'] == '-':
                three_site = best['branch site'] - offset
            else:
                three_site = best['branch site'] + offset
            # NOTE(review): recomputing the size only for inferred 3' sites
            # reflects the reconstructed nesting -- confirm upstream.
            coord_dict['intron size'] = abs(coord_dict['position'] -
                                            three_site)

        if best['strand'] == '+':
            coord_dict['seq3'] = chrom_seq[int(three_site - 5):
                                           int(three_site) + 3]
        elif best['strand'] == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict['3p score'] = \
            SP.simple_score_junction(coord_dict['seq5'], coord_dict['seq3'],
                                     pssm)

        coord_s = pd.Series(coord_dict, name=coord)
        new_quant_df = new_quant_df.append(coord_s)

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                         organism=organism)

    # One column per sequence position, then drop the raw strings.
    for n in range(len(new_quant_df['seq5'].iloc[0])):
        new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
    for n in range(len(new_quant_df['seq3'].iloc[0])):
        new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') |
                        (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(
        ['genome coord', 'annotated intron size'], ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[[
        'transcript', 'chromosome', 'position', 'strand', 'type'
    ]]

    return new_quant_df, lariat_df
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build 4x8 base-composition matrices for 5' and 3' splice sites.

    Rows are A, C, T, G (in that order); columns are the 8 positions of
    the window taken around each annotated splice site, read on the
    transcript's strand.

    Args:
        gff3: annotation path; if it contains 'pombe' (case-insensitive)
            the SP helpers are called with organism='pombe' and isoforms
            are named '<transcript>.1', otherwise '<transcript>T0'.
        fasta_dict: mapping from chromosome name to sequence string.
        PSSM: when false, return base frequencies (count / number of
            introns). When true, zero counts are first replaced by 1 and
            the result is log2(frequency / background), with the
            background taken from gc_content(fasta_dict) and indexed by
            row. Any truthy value is now accepted; the previous
            ``is True`` test silently skipped normalisation for values
            such as ``numpy.True_``.

    Returns:
        (pos_matrix_5prime, pos_matrix_3prime) as numpy arrays.

    Side effects:
        Sets numpy's global float print format to one decimal place.

    Raises:
        KeyError: for a base other than A, C, T or G inside a window.
    """
    #Populate gene dictionary and build genome
    if 'pombe' in gff3.lower():
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
        suffix = '.1'
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
        suffix = 'T0'
    ss_dict = SP.collapse_ss_dict(ss)

    nuc_prob = gc_content(fasta_dict)
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}

    #First generate a consensus matrix for 5' and 3' splice site, where 1st
    #row is A counts, second row is C, third row is T, fourth row is G.
    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])

    n_introns = 0
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            n_introns += 1
            if strand == '+':
                seq5 = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
                seq3 = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])

            for col, base in enumerate(seq5):
                pos_matrix_5prime[base_dict[base], col] += 1
            for col, base in enumerate(seq3):
                pos_matrix_3prime[base_dict[base], col] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind': float_formatter})

    total = float(n_introns)
    if PSSM:
        # One background probability per row, broadcast across columns.
        background = np.array([nuc_prob[row] for row in range(4)],
                              dtype=float).reshape(4, 1)
        for matrix in (pos_matrix_5prime, pos_matrix_3prime):
            matrix[matrix == 0] = 1  # pseudocount so log2 stays finite
            matrix[:] = np.log2((matrix / total) / background)
    else:
        pos_matrix_5prime /= total
        pos_matrix_3prime /= total

    return (pos_matrix_5prime, pos_matrix_3prime)
def get_junction_sequence(df, gff3_file, fasta_file):
    """Annotate junction rows with a gene, flanking sequences and intron rank.

    Args:
        df: DataFrame providing the columns 'chr', 'coord_1', 'coord_2' and
            'strand' (``.strip()`` is called on each value, so they are
            presumably strings -- TODO confirm against the caller) and a
            'contained in' column that is read at the very end.
        gff3_file: annotation handed to SP.build_transcript_dict and
            SP.list_splice_sites.
        fasta_file: a path (str), loaded with make_fasta_dict, or an
            already built mapping indexed by chromosome name.

    Returns:
        The frame sorted by 'chr' with the added columns 'Gene', 'intron',
        'sequence1', 'sequence2' and 'intron sequence', filtered on
        'contained in' and passed through reset_index().

    NOTE(review): the block nesting of this function was restored from
    source whose indentation had been lost. The placement of the sequence
    extraction and of the intron-labelling loop should be confirmed against
    the upstream file.
    """
    df = df.sort_values('chr', axis=0)

    #transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    #splice_dict[transcript] = [[5'sites][3'sites]]
    # NOTE(review): splice_dict and flag are not used again in this function.
    splice_dict, flag = SP.list_splice_sites(gff3_file)

    #fasta_dict[chr] = sequence
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # Group transcript names by chromosome (element 3 of each entry).
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.iteritems():
        chromosome = coords[3]
        if chromosome in transcript_by_chr:
            transcript_by_chr[chromosome].append(transcript)
        else:
            transcript_by_chr[chromosome] = []
            transcript_by_chr[chromosome].append(transcript)

    # Placeholder values; rows that are never updated below keep them.
    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    n = 0
    for n in range(len(df)):
        # Rows are addressed by index label n, so this presumably relies on
        # df carrying the labels 0..len(df)-1 -- TODO confirm.
        coord1 = int(df['coord_1'][n].strip())
        coord2 = int(df['coord_2'][n].strip())
        chrom = df['chr'][n].strip()
        strand = df['strand'][n].strip()
        transcripts = transcript_by_chr[chrom]

        # A same-strand transcript that spans the junction names the gene;
        # when several match, the last one visited wins.
        for transcript in transcripts:
            tx_strand = transcript_dict[transcript][2]
            start = transcript_dict[transcript][0]
            stop = transcript_dict[transcript][1]

            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n, 'Gene'] = transcript

        # 8-character windows around both coordinates plus the whole span.
        # On '-' the two windows swap roles and everything is
        # reverse-complemented.
        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
        elif strand == '-':
            sequence1 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            sequence1 = SP.reverse_complement(sequence1)
            sequence2 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = SP.reverse_complement(sequence2)
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
            all_seq = SP.reverse_complement(all_seq)

        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

        # Label each gene's junctions as 'Only', 'First' or 'Last' from the
        # smallest and largest coord_1; strand decides which end is first.
        for transcript in transcripts:
            if transcript in df['Gene'].tolist():
                tx_df = df[df['Gene'] == transcript]
                s = tx_df['coord_1']
                # NOTE(review): coord_1 is stripped as a string above, yet
                # min/max are taken here and compared with ints below --
                # confirm the column dtype.
                min_idx = s.idxmin()
                first = int(s.min())
                #print transcript_dict[transcript][2]
                #print first
                max_idx = s.idxmax()
                last = int(s.max())
                #print last

                if first == last:
                    df.loc[min_idx, 'intron'] = 'Only'
                else:
                    if transcript_dict[transcript][2] == '+':
                        df.loc[min_idx, 'intron'] = 'First'
                        df.loc[max_idx, 'intron'] = 'Last'
                    elif transcript_dict[transcript][2] == '-':
                        df.loc[min_idx, 'intron'] = 'Last'
                        df.loc[max_idx, 'intron'] = 'First'

                # Rows still marked 'Middle' whose coord_1 falls in
                # [first - 10, first + 10) or [last - 10, last + 10) get the
                # label of the corresponding end.
                for index, coord_1 in s.iteritems():
                    if df['intron'][index] == 'Middle':
                        if coord_1 in range(first - 10, first + 10):
                            df_idx = s[s == coord_1].index[0]
                            if transcript_dict[transcript][2] == '+':
                                df.loc[df_idx, 'intron'] = 'First'
                            elif transcript_dict[transcript][2] == '-':
                                df.loc[df_idx, 'intron'] = 'Last'
                        elif coord_1 in range(last - 10, last + 10):
                            df_idx = s[s == coord_1].index[0]
                            if transcript_dict[transcript][2] == '+':
                                df.loc[df_idx, 'intron'] = 'Last'
                            elif transcript_dict[transcript][2] == '-':
                                df.loc[df_idx, 'intron'] = 'First'

    # NOTE(review): 'contained in' is not created in this function, so it
    # must already exist on the input frame -- confirm.
    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a per-intron table (size, sequences, scores) from called peaks.

    Peaks whose 'type' is '3prime' or whose 'looks like' is 'AG' are
    dropped. Each remaining peak is paired with a 3' site in one of two
    ways:

    * 'type' contains 'prime': the first annotated intron of the peak's
      transcript whose 5' site lies in [position - 5, position + 5);
    * otherwise: the tallest peak of type 'AG' on the same transcript.

    Peaks without a 3' site are skipped. The table is then completed with
    SP.backfill_splice_sites and SP.find_score_branches_ppy.

    Args:
        peak_df: DataFrame with the columns 'type', 'looks like',
            'chromosome', 'position', 'transcript', 'strand', 'height' and
            'index' (the latter is dropped).
        gff3: annotation passed through to the SP helpers.
        fa_dict: mapping from chromosome name to sequence string.
        organism: forwarded to the SP helpers.
        branch_file: branch list handed to SP.find_score_branches_ppy.
            The default is the path that used to be hard-coded here.

    Returns:
        Whatever SP.find_score_branches_ppy returns for the backfilled
        table.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    # Copy so the column added below is never written into a view of peak_df.
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Several peaks may share a coordinate; the tallest represents it.
        # The index holds 'chrom:pos' strings, so the positional .iloc[0]
        # is what the former .ix[0] resolved to.
        peak = quant_df[quant_df.index == coord].sort_values(
            'height', ascending=False).iloc[0]
        same_tx = quant_df[quant_df['transcript'] == peak['transcript']]
        ag_peaks = same_tx[same_tx['type'] == 'AG']
        introns = ss_dict[peak['transcript']]

        five_site = None
        three_site = None
        alt3 = False
        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The previous test, `'AG' in <Series>`, looked at the index
            # labels instead of the values and therefore never matched.
            five_site = peak['position']
            three_site = ag_peaks.sort_values(
                'height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        chrom_seq = fa_dict[peak['chromosome']]
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)

        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict,
                                            pssm, organism=organism)
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)
    return new_quant_df
def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    """Tabulate junctions with the sequence around each splice site.

    One row is produced per junction returned by ``build_junction_dict``.
    For every row the 8-nt windows around the observed 5' and 3' sites (and
    around the annotated intron boundaries, when present) are sliced from
    ``fasta``; minus-strand windows are reverse-complemented.

    Parameters
    ----------
    junction_bed, gff3_file : forwarded to ``build_junction_dict`` /
        ``SP.build_transcript_dict``.
    fasta : str or mapping
        Either a path (loaded with ``SP.make_fasta_dict``) or an already
        loaded mapping of chromosome name -> sliceable sequence.
    organism : optional, forwarded to the helpers.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with the junction columns plus 'sequence1',
        'sequence2', 'seq type1', 'seq type2', 'annotated sequence1',
        'annotated sequence2' and 'transcript'.
    """
    placeholder = 'NNNNNNNN'
    columns = ['intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
               'type', 'size', 'annotated intron size',
               'annotated intron start', 'annotated intron end']

    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file,
                                        transcript_dict, organism=organism)

    # Build the frame in one go instead of assigning row by row. dtype=object
    # matches the original empty-frame dtype and keeps None / int cells as-is.
    rows = [[tx] + junction
            for tx, junctions in junction_dict.items()
            for junction in junctions]
    junction_df = pd.DataFrame(rows, columns=columns, dtype=object)

    def site_windows(chrom_seq, five_site, three_site, strand):
        """Return the (5' window, 3' window) pair for a stranded intron."""
        if strand == '+':
            return (chrom_seq[(five_site - 1):(five_site + 7)],
                    chrom_seq[(three_site - 5):(three_site + 3)])
        return (SP.reverse_complement(chrom_seq[(five_site - 6):(five_site + 2)]),
                SP.reverse_complement(chrom_seq[(three_site - 2):(three_site + 6)]))

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []

    for _, row in junction_df.iterrows():
        # presumably the key is a (transcript, ...) tuple -- confirm against
        # build_junction_dict
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        if strand in ('+', '-'):
            chrom_seq = fasta[chrom]
            curr1, curr2 = site_windows(chrom_seq, row['start'], row['end'],
                                        strand)
            ann_start = row['annotated intron start']
            # isnull covers both None and NaN for a missing annotation.
            if pd.isnull(ann_start):
                ann1 = ann2 = None
            else:
                ann1, ann2 = site_windows(chrom_seq, ann_start,
                                          row['annotated intron end'], strand)
        else:
            # Unknown strand: use placeholders. curr1/curr2 must be set too,
            # otherwise the 'seq type' columns below would pick up the
            # previous row's sequences.
            curr1 = curr2 = ann1 = ann2 = placeholder

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        junction_type = row['type']
        if junction_type == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif junction_type == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx
    return junc_seq_df