Example #1
0
def write_intron_fasta(transcript_dict, fasta_dict, prefix='introns', sense=True):
    """Collect the sequence between consecutive CDS segments and write it as FASTA.

    Args:
        transcript_dict: mapping of transcript name to an indexable record.
            Items 2-5 are read as strand, chromosome name, list of CDS
            starts and list of CDS ends.
        fasta_dict: mapping of chromosome name to its sequence string.
        prefix: the records are written to '<prefix>.fa'.
        sense: when exactly False, every sequence is passed through
            SP.reverse_complement once more before it is stored.

    Returns:
        dict mapping '<transcript>_<n>' to the extracted sequence, where n
        counts the gaps of one transcript starting at 0.

    Raises:
        ValueError: a transcript with at least two CDS segments carries a
            strand other than '+' or '-'.
    """
    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, values in transcript_dict.items():
        strand = values[2]
        chrom = values[3]
        CDS_start_list = values[4]
        CDS_end_list = values[5]

        for n in range(len(CDS_start_list) - 1):
            if strand == '+':
                seq = fasta_dict[chrom][CDS_end_list[n]:CDS_start_list[n + 1] - 1]
            elif strand == '-':
                # Walk the CDS lists from the back for minus-strand records.
                intron = len(CDS_start_list) - n - 1
                seq = fasta_dict[chrom][CDS_end_list[intron]:CDS_start_list[intron - 1] - 1]
                seq = SP.reverse_complement(seq)
            else:
                # Previously this silently reused the sequence of the
                # preceding intron (or failed on an unbound local).
                raise ValueError(
                    "transcript {!r}: strand must be '+' or '-', got {!r}".format(
                        transcript, strand))

            if sense is False:
                seq = SP.reverse_complement(seq)

            seq_dict[transcript + '_' + str(n)] = seq

    with open('{}.fa'.format(prefix), 'w') as fout:
        for name, seq in seq_dict.items():
            fout.write('>' + name + '\n')
            fout.write(seq + '\n')
    return seq_dict
Example #2
0
def add_seq(branch_df, fa_dict):
    """Attach 5' splice-site and branch-site sequence windows to branch_df.

    For every row a 16-character window centred on '5p splice site' and on
    'branch site' is cut from fa_dict[row['chromosome']] (and passed through
    SP.reverse_complement when row['strand'] is '-').  The windows are then
    re-centred on the first 'GT' (5' site) or the first of 'AG', 'AA', 'GA'
    (branch) that starts at offset 4 or later and lies inside window[4:11].

    Args:
        branch_df: DataFrame with columns 'chromosome', 'strand',
            '5p splice site' and 'branch site'.  The columns '5p seq' and
            'Branch seq' are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.

    Returns:
        The rows of branch_df whose 'Branch seq' has 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        chrom_seq = fa_dict[r['chromosome']]
        five = chrom_seq[r['5p splice site'] - 8:r['5p splice site'] + 8]
        branch = chrom_seq[r['branch site'] - 8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        if 'GT' in five[4:11]:
            # Search from offset 4: an earlier 'GT' outside the tested
            # window must not be picked up (it produced an empty slice).
            pos = five.index('GT', 4)
            five = five[pos - 2:pos + 6]
        else:
            five = five[4:12]

        # Same priority order as before: AG, then AA, then GA.
        for motif in ('AG', 'AA', 'GA'):
            if motif in branch[4:11]:
                pos = branch.index(motif, 4)
                branch = branch[pos - 4:pos + 4]
                break
        else:
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs

    receptors = ['AG', 'AA', 'GA']
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
Example #3
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Return 8-character windows around both ends of every listed intron.

    The transcript table and splice-site lists come from
    SP.build_transcript_dict, SP.list_splice_sites and SP.collapse_ss_dict.
    Each transcript name is suffixed with '.1' (organism == 'pombe') or
    'T0' (otherwise) before it is looked up in the transcript table.

    Args:
        gff3: annotation handed to the SP helpers.
        fasta_dict: mapping of chromosome name to sequence string.
        organism: forwarded to the SP helpers; also selects the suffix.

    Returns:
        (all_seq5, all_seq3): two parallel lists of sequence windows, taken
        around intron[0] and intron[1] respectively.  Minus-strand windows
        are passed through SP.reverse_complement.
    """
    tx_table = SP.build_transcript_dict(gff3, organism=organism)
    sites, _ = SP.list_splice_sites(gff3, organism=organism)
    collapsed = SP.collapse_ss_dict(sites)

    suffix = '.1' if organism == 'pombe' else 'T0'

    five_windows = []
    three_windows = []
    for name, intron_list in collapsed.iteritems():
        isoform = name + suffix
        strand = tx_table[isoform][2]
        chrom = tx_table[isoform][3]

        for intron in intron_list:
            # Both windows depend on the same strand test, so they are cut
            # together and appended afterwards.
            if strand == '+':
                seq5 = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
                seq3 = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])

            five_windows.append(seq5)
            three_windows.append(seq3)
    return five_windows, three_windows
Example #4
0
def add_seq(branch_df, fa_dict):
    """Attach 5' splice-site and branch-site sequence windows to branch_df.

    For every row a 16-character window centred on '5p splice site' and on
    'branch site' is cut from fa_dict[row['chromosome']] (and passed through
    SP.reverse_complement when row['strand'] is '-').  The windows are then
    re-centred on the first 'GT' (5' site) or the first of 'AG', 'AA', 'GA'
    (branch) that starts at offset 4 or later and lies inside window[4:11].

    Args:
        branch_df: DataFrame with columns 'chromosome', 'strand',
            '5p splice site' and 'branch site'.  The columns '5p seq' and
            'Branch seq' are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.

    Returns:
        The rows of branch_df whose 'Branch seq' has 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        chrom_seq = fa_dict[r['chromosome']]
        five = chrom_seq[r['5p splice site'] - 8:r['5p splice site'] + 8]
        branch = chrom_seq[r['branch site'] - 8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        if 'GT' in five[4:11]:
            # Search from offset 4: an earlier 'GT' outside the tested
            # window must not be picked up (it produced an empty slice).
            pos = five.index('GT', 4)
            five = five[pos - 2:pos + 6]
        else:
            five = five[4:12]

        # Same priority order as before: AG, then AA, then GA.
        for motif in ('AG', 'AA', 'GA'):
            if motif in branch[4:11]:
                pos = branch.index(motif, 4)
                branch = branch[pos - 4:pos + 4]
                break
        else:
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs

    receptors = ['AG', 'AA', 'GA']
    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
Example #5
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Cut an 8-character window at each end of every intron in the annotation.

    Splice sites are obtained through SP.list_splice_sites followed by
    SP.collapse_ss_dict; strand and chromosome come from
    SP.build_transcript_dict, looked up under the transcript name plus
    '.1' when organism is 'pombe' and plus 'T0' otherwise.

    Args:
        gff3: annotation passed on to the SP helpers.
        fasta_dict: mapping of chromosome name to sequence string.
        organism: passed on to the SP helpers; also picks the name suffix.

    Returns:
        A pair of lists (windows at intron[0], windows at intron[1]) in
        matching order.  Minus-strand windows go through
        SP.reverse_complement.
    """
    annotations = SP.build_transcript_dict(gff3, organism=organism)
    raw_sites, _flag = SP.list_splice_sites(gff3, organism=organism)
    introns_by_tx = SP.collapse_ss_dict(raw_sites)

    donors = []
    acceptors = []
    for tx_name, tx_introns in introns_by_tx.iteritems():
        if organism != 'pombe':
            key = tx_name + 'T0'
        else:
            key = tx_name + '.1'
        strand = annotations[key][2]
        chrom = annotations[key][3]

        for intron in tx_introns:
            # Window around the first coordinate of the intron.
            if strand == '+':
                lo, hi = intron[0] - 1, intron[0] + 7
                seq5 = fasta_dict[chrom][lo:hi]
            elif strand == '-':
                lo, hi = intron[0] - 6, intron[0] + 2
                seq5 = SP.reverse_complement(fasta_dict[chrom][lo:hi])
            donors.append(seq5)

            # Window around the second coordinate of the intron.
            if strand == '+':
                lo, hi = intron[1] - 5, intron[1] + 3
                seq3 = fasta_dict[chrom][lo:hi]
            elif strand == '-':
                lo, hi = intron[1] - 2, intron[1] + 6
                seq3 = SP.reverse_complement(fasta_dict[chrom][lo:hi])
            acceptors.append(seq3)

    return donors, acceptors
Example #6
0
def write_intergenic_fasta(transcript_dict, fasta_dict, bps_us=0, bps_ds=0, all_intergenic=True, prefix='intergenic_transcripts'):
    """Write sequence flanking or lying between transcripts to '<prefix>.fa'.

    Args:
        transcript_dict: mapping of transcript name to an indexable record;
            items 0-3 are read as start, end, strand and chromosome name.
        fasta_dict: mapping of chromosome name to sequence string.
        bps_us: flank length taken before the transcript (flank mode only).
        bps_ds: flank length taken after the transcript (flank mode only).
        all_intergenic: when true, the gap between each pair of neighbouring
            transcripts (ordered by start) on every chromosome of fasta_dict
            is extracted and bps_us/bps_ds are ignored.  When false, only
            the requested flanks of each transcript are extracted.
        prefix: the records are written to '<prefix>.fa'.

    Returns:
        dict of record name to sequence.  Flank mode uses the names
        '<tx>_us_sense', '<tx>_us_antisense', '<tx>_ds_sense' and
        '<tx>_ds_antisense'; gap mode uses '<tx>_<next>_plus' and
        '<tx>_<next>_minus'.

    Raises:
        ValueError: flank mode was asked for a flank of a transcript whose
            strand is neither '+' nor '-'.
    """
    def flank_before(chrom, position, length):
        # Clamp at 0: a negative slice start would wrap around to the end
        # of the chromosome instead of stopping at its beginning.
        return fasta_dict[chrom][max(position - length, 0):position]

    def flank_after(chrom, position, length):
        return fasta_dict[chrom][position:position + length]

    def bad_strand(transcript, strand):
        return ValueError(
            "transcript {!r}: strand must be '+' or '-', got {!r}".format(
                transcript, strand))

    seq_dict = {}
    if not all_intergenic:
        for transcript, values in transcript_dict.items():
            start = values[0]
            end = values[1]
            strand = values[2]
            chrom = values[3]

            if bps_us > 0:
                if strand == '+':
                    seq_us_sense = flank_before(chrom, start, bps_us)
                elif strand == '-':
                    seq_us_sense = SP.reverse_complement(flank_after(chrom, end, bps_us))
                else:
                    raise bad_strand(transcript, strand)
                seq_dict[transcript + '_us_sense'] = seq_us_sense
                seq_dict[transcript + '_us_antisense'] = SP.reverse_complement(seq_us_sense)

            if bps_ds > 0:
                if strand == '+':
                    seq_ds_sense = flank_after(chrom, end, bps_ds)
                elif strand == '-':
                    seq_ds_sense = SP.reverse_complement(flank_before(chrom, start, bps_ds))
                else:
                    raise bad_strand(transcript, strand)
                seq_dict[transcript + '_ds_sense'] = seq_ds_sense
                seq_dict[transcript + '_ds_antisense'] = SP.reverse_complement(seq_ds_sense)
    else:
        for chrom in fasta_dict:
            # (start, end, name) of every transcript on this chromosome,
            # ordered by start.  A chromosome without transcripts simply
            # yields an empty list (it used to raise KeyError while sorting
            # an empty DataFrame).
            on_chrom = sorted(
                ((values[0], values[1], name)
                 for name, values in transcript_dict.items()
                 if values[3] == chrom),
                key=lambda record: record[0])

            for current, following in zip(on_chrom, on_chrom[1:]):
                transcript_end, transcript = current[1], current[2]
                next_start, next_transcript = following[0], following[2]
                if next_start > transcript_end:
                    seq_plus = fasta_dict[chrom][transcript_end:next_start]
                    seq_dict[transcript + '_' + next_transcript + '_plus'] = seq_plus
                    seq_dict[transcript + '_' + next_transcript + '_minus'] = SP.reverse_complement(seq_plus)
                else:
                    # Single-argument print(...) behaves the same on
                    # Python 2 and Python 3.
                    print('Overlapping transcripts:')
                    print(transcript)
                    print(next_transcript)

    with open('{}.fa'.format(prefix), 'w') as fout:
        for name, seq in seq_dict.items():
            fout.write('>' + name + '\n')
            fout.write(seq + '\n')

    return seq_dict
Example #7
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Cut a 20-character window around every coordinate and label it.

    Args:
        coord_dict: mapping of transcript name to a pair-like object whose
            item 0 and item 1 are iterables of coordinates.
        gff3_file: annotation path handed to SP.build_transcript_dict; the
            organism is 'pombe' when that word occurs in the path and None
            otherwise.
        fasta_file: a str is converted with make_fasta_dict; anything else
            is used directly as the chromosome -> sequence mapping.

    Returns:
        dict mapping each transcript to a list of (sequence, seq_type)
        tuples.  Item-0 coordinates are labelled "5'" when
        sequence[10:12] is 'GT' or 'GC'; item-1 coordinates are labelled
        "3'" when sequence[8:10] is 'AG'; everything else is 'other'.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    fasta_dict = make_fasta_dict(fasta_file) if type(fasta_file) is str else fasta_file

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        entries = seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        #if chrom in rom_lat: chrom = rom_lat[chrom]
        strand = transcript_dict[transcript][2]

        # (coordinates, offset of the tested dinucleotide, accepted
        # dinucleotides, label given on a match)
        passes = (
            (coord_sets[0], 10, ('GT', 'GC'), "5'"),
            (coord_sets[1], 8, ('AG',), "3'"),
        )
        for coords, offset, motifs, label in passes:
            for coord in coords:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])

                seq_type = label if sequence[offset:offset + 2] in motifs else 'other'
                entries.append((sequence, seq_type))

    return seq_dict
Example #8
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Return labelled 20-character windows for the coordinates of each transcript.

    Args:
        coord_dict: mapping of transcript name to an indexable whose
            entries 0 and 1 are iterables of coordinates.
        gff3_file: annotation path given to SP.build_transcript_dict;
            organism is set to 'pombe' if the path contains that word,
            otherwise to None.
        fasta_file: when it is a str it is loaded through make_fasta_dict;
            otherwise it is taken to be the chromosome -> sequence mapping.

    Returns:
        dict of transcript -> list of (sequence, seq_type).  seq_type is
        "5'" for an entry-0 coordinate whose window has 'GT' or 'GC' at
        [10:12], "3'" for an entry-1 coordinate whose window has 'AG' at
        [8:10], and 'other' in every remaining case.
    """
    if 'pombe' in gff3_file:
        organism = 'pombe'
    else:
        organism = None

    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    fasta_dict = fasta_file
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)

    windows = {}
    for tx, sites in coord_dict.iteritems():
        found = []
        windows[tx] = found
        chrom = transcript_dict[tx][3]
        #if chrom in rom_lat: chrom = rom_lat[chrom]
        strand = transcript_dict[tx][2]

        # First coordinate set: candidates tested for GT/GC.
        for pos in sites[0]:
            if strand == "+":
                sequence = fasta_dict[chrom][(pos - 9):(pos + 11)]
            elif strand == "-":
                sequence = fasta_dict[chrom][(pos - 10):(pos + 10)]
                sequence = SP.reverse_complement(sequence)

            dinuc = sequence[10:12]
            if dinuc == 'GT' or dinuc == 'GC':
                found.append((sequence, "5'"))
            else:
                found.append((sequence, 'other'))

        # Second coordinate set: candidates tested for AG.
        for pos in sites[1]:
            if strand == "+":
                sequence = fasta_dict[chrom][(pos - 9):(pos + 11)]
            elif strand == "-":
                sequence = fasta_dict[chrom][(pos - 10):(pos + 10)]
                sequence = SP.reverse_complement(sequence)

            if sequence[8:10] == 'AG':
                found.append((sequence, "3'"))
            else:
                found.append((sequence, 'other'))

    #print str(counter5)+" 5' splice sites"
    #print str(counter3)+" 3' splice sites"

    return windows
Example #9
0
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Return fasta_dict[chrom][start:end+1], reverse complemented on '-'.

    Args:
        chrom: key into the chromosome mapping.
        start: first index of the slice.
        end: last index of the slice (inclusive).
        strand: '-' sends the slice through SP.reverse_complement; any
            other value returns it unchanged.
        fasta_dict: chromosome -> sequence mapping, or a str path to a JSON
            file holding such a mapping.
    """
    if type(fasta_dict) == str:
        with open(fasta_dict, 'r') as handle:
            fasta_dict = json.load(handle)

    region = fasta_dict[chrom][start:end + 1]
    return SP.reverse_complement(region) if strand == '-' else region
Example #10
0
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Slice one chromosome from start through end (inclusive).

    fasta_dict is either the chromosome -> sequence mapping itself or a
    str path to a JSON file containing it.  When strand is '-' the slice
    is handed to SP.reverse_complement before it is returned.
    """
    genome = fasta_dict
    if type(genome) == str:
        with open(genome, 'r') as json_file:
            genome = json.load(json_file)

    stop = end + 1
    piece = genome[chrom][start:stop]
    if strand != '-':
        return piece
    return SP.reverse_complement(piece)
Example #11
0
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Add percent_py scores of two 15-character windows per site to df.

    For each row the second entry of 'annotated intron coords' and of
    'junction coords' is used as a site.  On '+' the windows are
    [site-15:site] and [site-30:site-15]; on '-' they are [site:site+15]
    and [site+15:site+30], each passed through SP.reverse_complement.

    Args:
        df: DataFrame with columns 'strand', 'chromosome',
            'annotated intron coords' and 'junction coords'.  Four
            'Py score ...' columns are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.
        alt_column_name: not used by this function.
        from_branches: not used by this function.

    Returns:
        df, with the annotated scores set to NaN where the annotated site
        is None.

    Raises:
        ValueError: a row has a strand other than '+' or '-'.
    """
    def windows(chrom_seq, site, strand):
        # Return the (-15:0, -30:-15) pair of windows for one site.
        if strand == '+':
            return chrom_seq[site - 15:site], chrom_seq[site - 30:site - 15]
        if strand == '-':
            return (SP.reverse_complement(chrom_seq[site:site + 15]),
                    SP.reverse_complement(chrom_seq[site + 15:site + 30]))
        # Previously such a row silently reused the windows of the row
        # before it.
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    py_score1 = []
    py_score2 = []
    alt_py1 = []
    alt_py2 = []

    for _, r in df.iterrows():
        strand = r['strand']
        chrom_seq = fa_dict[r['chromosome']]
        coord = r['annotated intron coords'][1]
        alt_coord = r['junction coords'][1]

        alt1, alt2 = windows(chrom_seq, alt_coord, strand)
        alt_py1.append(percent_py(alt1))
        alt_py2.append(percent_py(alt2))

        if coord is not None:
            seq1, seq2 = windows(chrom_seq, coord, strand)
            py_score1.append(percent_py(seq1))
            py_score2.append(percent_py(seq2))
        else:
            # np.nan is the spelling available in every NumPy release;
            # the np.NaN alias was removed in NumPy 2.0.
            py_score1.append(np.nan)
            py_score2.append(np.nan)

    df['Py score annotated -15:0'] = py_score1
    df['Py score annotated -30:-15'] = py_score2
    df['Py score alternative -15:0'] = alt_py1
    df['Py score alternative -30:-15'] = alt_py2
    return df
Example #12
0
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Add percent_py scores of two 15-character windows per site to df.

    For each row the second entry of 'annotated intron coords' and of
    'junction coords' is used as a site.  On '+' the windows are
    [site-15:site] and [site-30:site-15]; on '-' they are [site:site+15]
    and [site+15:site+30], each passed through SP.reverse_complement.

    Args:
        df: DataFrame with columns 'strand', 'chromosome',
            'annotated intron coords' and 'junction coords'.  Four
            'Py score ...' columns are added to it in place.
        fa_dict: mapping of chromosome name to sequence string.
        alt_column_name: not used by this function.
        from_branches: not used by this function.

    Returns:
        df, with the annotated scores set to NaN where the annotated site
        is None.

    Raises:
        ValueError: a row has a strand other than '+' or '-'.
    """
    def windows(chrom_seq, site, strand):
        # Return the (-15:0, -30:-15) pair of windows for one site.
        if strand == '+':
            return chrom_seq[site - 15:site], chrom_seq[site - 30:site - 15]
        if strand == '-':
            return (SP.reverse_complement(chrom_seq[site:site + 15]),
                    SP.reverse_complement(chrom_seq[site + 15:site + 30]))
        # Previously such a row silently reused the windows of the row
        # before it.
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    py_score1 = []
    py_score2 = []
    alt_py1 = []
    alt_py2 = []

    for _, r in df.iterrows():
        strand = r['strand']
        chrom_seq = fa_dict[r['chromosome']]
        coord = r['annotated intron coords'][1]
        alt_coord = r['junction coords'][1]

        alt1, alt2 = windows(chrom_seq, alt_coord, strand)
        alt_py1.append(percent_py(alt1))
        alt_py2.append(percent_py(alt2))

        if coord is not None:
            seq1, seq2 = windows(chrom_seq, coord, strand)
            py_score1.append(percent_py(seq1))
            py_score2.append(percent_py(seq2))
        else:
            # np.nan is the spelling available in every NumPy release;
            # the np.NaN alias was removed in NumPy 2.0.
            py_score1.append(np.nan)
            py_score2.append(np.nan)

    df['Py score annotated -15:0'] = py_score1
    df['Py score annotated -30:-15'] = py_score2
    df['Py score alternative -15:0'] = alt_py1
    df['Py score alternative -30:-15'] = alt_py2
    return df
Example #13
0
def write_transcript_fasta(transcript_dict, fasta_dict, prefix='transcripts', sense=True, spliced=False):
    """Write one FASTA record per transcript to '<prefix>.fa'.

    Args:
        transcript_dict: mapping of transcript name to an indexable record;
            items 0-3 are read as start, end, strand and chromosome name
            and, in spliced mode, items 4-5 as the lists of CDS starts and
            CDS ends.
        fasta_dict: mapping of chromosome name to sequence string.
        prefix: the records are written to '<prefix>.fa'.
        sense: when exactly False, the finished sequence is passed through
            SP.reverse_complement once more.
        spliced: when false, the slice [start-1:end] is used (reverse
            complemented on '-').  When true, the slices
            [CDS_start-1:CDS_end] are concatenated in list order, each one
            reverse complemented on '-'.

    Returns:
        dict mapping transcript name to its sequence.
    """
    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, values in transcript_dict.items():
        start = values[0]
        end = values[1]
        strand = values[2]
        chrom = values[3]

        if not spliced:
            seq = fasta_dict[chrom][start - 1:end]
            if strand == '-':
                seq = SP.reverse_complement(seq)
        else:
            # The CDS lists are only needed (and only read) in this mode.
            CDS_start_list = values[4]
            CDS_end_list = values[5]
            pieces = []
            for cds_start, cds_end in zip(CDS_start_list, CDS_end_list):
                if strand == '+':
                    pieces.append(fasta_dict[chrom][cds_start - 1:cds_end])
                elif strand == '-':
                    pieces.append(SP.reverse_complement(
                        fasta_dict[chrom][cds_start - 1:cds_end]))
            seq = ''.join(pieces)

        if sense is False:
            seq = SP.reverse_complement(seq)

        seq_dict[transcript] = seq

    with open('{}.fa'.format(prefix), 'w') as fout:
        for name, seq in seq_dict.items():
            fout.write('>' + name + '\n')
            fout.write(seq + '\n')

    return seq_dict
Example #14
0
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Collect a 15-character sequence next to the first coordinate of each intron.

    The intron table is taken from the first available source, in this
    order: ss_dict as given, SP.build_junction_dict(junction_bed, ...),
    rows of peak_df whose 'type' does not contain 'prime', or
    SP.list_splice_sites followed by SP.collapse_ss_dict.

    Args:
        gff3_file: annotation handed to the SP helpers.
        fasta_file: a dict is used directly; a path ending in 'json' is
            loaded with json.load; any other value goes to make_fasta_dict.
        ss_dict: optional ready-made transcript -> introns mapping.
        junction_bed: optional input for SP.build_junction_dict.
        gene_list: forwarded to SP.list_splice_sites.
        peak_df: optional DataFrame with columns 'type', 'transcript',
            'strand' and 'position'.
        organism: forwarded to the SP helpers; when junction_bed is None it
            also selects the suffix ('.1' for 'pombe', else 'T0') appended
            to every transcript name.

    Returns:
        dict mapping '<transcript>-<chrom>:<coordinate>' to the sequence
        (passed through SP.reverse_complement on the '-' strand).
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, row in peak_df.iterrows():
                bucket = ss_dict.setdefault(row['transcript'], [])
                if row['strand'] == '+':
                    bucket.append((row['position'], row['position'] + 50))
                elif row['strand'] == '-':
                    bucket.append((row['position'], row['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        if junction_bed is None:
            transcript = transcript + ('.1' if organism == 'pombe' else 'T0')
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in list(introns):
            first = intron[0]
            if strand == '+':
                key = transcript + '-' + chrom + ':' + str(first + 1)
                seq_dict[key] = fasta_dict[chrom][first + 2:first + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][first - 16:first - 1]
                key = transcript + '-' + chrom + ':' + str(first)
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
Example #15
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Build a table with one row per listed intron of every transcript in df.

    For each transcript named in df['transcript'] the introns returned by
    SP.list_splice_sites / SP.collapse_ss_dict are walked in transcription
    order.  When a value of df['position'] lies within the window
    [five_site-4, five_site+5] the scores and sequences of that df row are
    copied; otherwise the row is filled from the genome sequence and scored
    with SP.simple_score_junction.

    Args:
        df: DataFrame with at least the columns 'transcript', 'strand',
            'chromosome', 'position', '3p score', '5p score',
            'alt splicing', 'type', 'seq5' and 'seq3'.  Rows are looked up
            with df.loc['<chrom>:<position>', ...]
            # NOTE(review): so the index is presumably made of such labels
            # - confirm against the caller.
        gff3: annotation handed to the SP helpers.
        fa_dict: mapping of chromosome name to sequence string.
        PSSM: passed unchanged to SP.simple_score_junction.
        organism: forwarded to the SP helpers; 'pombe' selects the isoform
            suffix '.1', anything else 'T0'.

    Returns:
        A new DataFrame indexed by '<chrom>:<position>' with one column per
        key of column_dict.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Parallel lists: every intron must append exactly one value to each key
    # (and one label to new_index), otherwise the final assignment fails.
    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[],
                   'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[],
                   'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]} 
    new_index = []
    
    for tx in set(df['transcript']):
        # Strand is taken from the first df row of this transcript.
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order the introns by their first coordinate, descending on '-'.
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0], reverse=True)
        
        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                # range() excludes its upper bound: peak-5 .. peak+4.
                if five_site in range(int(peak)-5,int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break
            
            column_dict['transcript'].append(tx)
            # tx_dict is keyed by isoform name, ss_dict by the bare name.
            if organism == 'pombe':
                iso = tx+'.1'
            else: iso = tx+'T0'
            
            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)   
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            # Sizes below are divided by 1000. (float division).
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        # A negative length means the next intron starts
                        # before this one ends; try the one after it.
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (tx_dict[iso][1] - three_site)/1000.
                    
                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                # NOTE(review): ds_length is only assigned for '+' and '-';
                # any other strand reaches this line with a stale or
                # unbound value.
                column_dict['exon size (ds)'].append(ds_length)
            
            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                
                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)

            if in_df is True:
                # Reuse the values already stored for the matching peak.
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])

            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)
                
                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]
                
                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)
                
                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)
    
    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data
    
    return new_df
Example #16
0
def find_score_branches_ppy(quant_df, peak_branch_df, fa_dict):
    """Annotate each row of quant_df with a branch sequence, score and pPy content.

    A set of candidate 5-mers is taken either from
    peak_branch_df['Branch seq'] (positions 2-6) or, when peak_branch_df is
    a str, from the lines of that file.  Candidates that occur more than
    once are scored with score_branch against the PSSM returned by
    branch_PSSM.  For every row of quant_df the intron sequence is cut from
    fa_dict and searched for these candidates; the best-scoring match (or,
    if none matches, the 5-mer around the last 'A' before the final three
    characters) supplies the branch values.

    Args:
        quant_df: DataFrame with columns 'strand', 'chromosome', 'position'
            and 'intron size'.  The columns 'branch score',
            'branch to 3p distance', 'percent pPy' and 'branch-<n>' are
            added to it in place.
        peak_branch_df: DataFrame with a 'Branch seq' column (further
            columns are read in the measured-branch path), or a str path
            to a file with one candidate per line.
        fa_dict: mapping of chromosome name to sequence string.

    Returns:
        quant_df with the added columns.
    """
    #branches = generate_all_branches()
    PSSM = branch_PSSM(peak_branch_df, fa_dict)
    
    branches = []
    if type(peak_branch_df) is not str:
        # NOTE(review): this loop only rebinds the local `seq`; nothing it
        # computes is stored, and `branches` is overwritten right after it.
        # `seq[:-2] != 'A'` compares a multi-character slice with 'A' -
        # possibly `seq[-2]` was intended.  Confirm before relying on it.
        for ix, branch in peak_branch_df['Branch seq'].iteritems():
            seq = branch[2:7]
            if seq[:-2] != 'A' and 'A' in seq:
                A_ix = branch.rfind('A')
                new_seq = branch[A_ix-3:A_ix+2]
                if len(new_seq) == 5:
                    seq = new_seq
        branches = peak_branch_df['Branch seq'].str[2:7]
    else:
        with open(peak_branch_df) as f:
            for line in f:
                branches.append(line.strip())
    
    # Sort branches by abundance so that the most common ones are first in the search
    br_abund = []
    for branch in set(branches):
        count = len([x for x in branches if x == branch])
        # Candidates seen only once are discarded.
        if count > 1:
            br_abund.append((branch, count))
    br_abund = sorted(br_abund, key=lambda x: x[1], reverse=True)
    # NOTE(review): indexing the result of zip() relies on Python 2, and an
    # empty br_abund raises IndexError here.
    branches = zip(*br_abund)[0]
    
    # candidate 5-mer -> score, kept in abundance order
    branch_dict = collections.OrderedDict()
    for branch in branches:
        branch_dict[branch] = score_branch(branch, PSSM)
    
    # Parallel per-row result lists; assigned to quant_df at the end.
    branch_3_dist = []
    branch_score = []
    branch_seqs = []
    perc_py = []
    for ix, r in quant_df.iterrows():
        # Cut the intron in transcript orientation.
        if r['strand'] == '+':
            intron_seq = fa_dict[r['chromosome']][int(r['position']):int(r['position']+r['intron size'])]
            three_site = r['position']+r['intron size']
        elif r['strand'] == '-':
            intron_seq = fa_dict[r['chromosome']][int(r['position']-r['intron size']-1):int(r['position']-1)]
            intron_seq = SP.reverse_complement(intron_seq)
            three_site = r['position']-r['intron size']
        
        if type(peak_branch_df) is not str:
            # NOTE(review): `in` applied to a pandas Series tests its index
            # labels rather than its values - confirm this is the intended
            # membership test.
            if ix in peak_branch_df['genome coord']:
                ix_df = peak_branch_df[peak_branch_df['genome coord'] == ix]
                ix_df = ix_df.sort_values('depth', ascending=False)
                # NOTE(review): .iloc is positional; the string column
                # labels used below look like they need .loc or
                # .iloc[0][...] - verify this path actually runs.
                best_branch = ix_df.iloc[0,'branch site']
                best_branch = abs(ix_df.iloc[0,'5p splice site']-best_branch)

                seq = ix_df.iloc[0,'Branch seq'][2:7]
                branch_seqs.append(seq)
                branch_score.append(score_branch(seq, PSSM))
                branch_3_dist.append(ix_df.iloc[0,'Branch to 3p distance'])

                # NOTE(review): best_branch is the result of abs(...) here,
                # yet it is indexed with [0] as in the tuple-based paths
                # below.
                if 'N' in intron_seq[best_branch[0]+5:]:
                        print ix
                        print intron_seq
                perc_py.append(percent_py(intron_seq[best_branch[0]+5:]))
                               
            else:
                # Collect (offset, 5-mer, score) for each candidate found
                # in the intron; index() reports the first occurrence.
                matches = []
                for branch in branch_dict:
                    if branch in intron_seq:
                        matches.append((intron_seq.index(branch), branch, branch_dict[branch]))

                if len(matches) == 0:
                    # Find the closest A
                    best_ix = intron_seq[:-3].rfind('A')
                    seq = intron_seq[best_ix-3:best_ix+2]
                    score = score_branch(seq, PSSM)
                    best_branch = (best_ix, seq, score)

                    #branch_3_dist.append(np.NaN)
                    #branch_score.append(np.NaN)
                    #branch_seqs.append('NNNNN')
                    #perc_py.append(percent_py(intron_seq[-30:]))
                elif len(matches) > 1:
                    # Highest score first.
                    matches = sorted(matches, key=lambda x: x[2], reverse=True)
                    best_branch = matches[0]
                else:
                    best_branch = matches[0]

                branch_3_dist.append((len(intron_seq)-best_branch[0]-4)/1000.)
                branch_score.append(best_branch[2])
                branch_seqs.append(best_branch[1])

                # Score the stretch after the 5-mer only if at least two
                # characters remain.
                if len(intron_seq)-best_branch[0]-5 > 1:
                    if 'N' in intron_seq[best_branch[0]+5:]:
                        print ix
                        print intron_seq
                    perc_py.append(percent_py(intron_seq[best_branch[0]+5:]))
                else:
                    perc_py.append(np.NaN)
        else:
            # Candidates came from a file: same search as the else-branch
            # above.
            matches = []
            for branch in branch_dict:
                if branch in intron_seq:
                    matches.append((intron_seq.index(branch), branch, branch_dict[branch]))

            if len(matches) == 0:
                # Find the closest A
                best_ix = intron_seq[:-3].rfind('A')
                seq = intron_seq[best_ix-3:best_ix+2]
                score = score_branch(seq, PSSM)
                best_branch = (best_ix, seq, score)

                #branch_3_dist.append(np.NaN)
                #branch_score.append(np.NaN)
                #branch_seqs.append('NNNNN')
                #perc_py.append(percent_py(intron_seq[-30:]))
            elif len(matches) > 1:
                matches = sorted(matches, key=lambda x: x[2], reverse=True)
                best_branch = matches[0]
            else:
                best_branch = matches[0]

            branch_3_dist.append((len(intron_seq)-best_branch[0]-4)/1000.)
            branch_score.append(best_branch[2])
            branch_seqs.append(best_branch[1])

            if len(intron_seq)-best_branch[0]-5 > 1:
                if 'N' in intron_seq[best_branch[0]+5:]:
                    print ix
                    print intron_seq
                perc_py.append(percent_py(intron_seq[best_branch[0]+5:]))
            else:
                perc_py.append(np.NaN)
    
    # Sanity output: both numbers should agree.
    print len(quant_df)
    print len(branch_score)
    
    quant_df['branch score'] = branch_score
    quant_df['branch to 3p distance'] = branch_3_dist
    quant_df['percent pPy'] = perc_py
    
    # Pad anything shorter than a 5-mer so the per-position split below works.
    branch_seqs = ['NNNNN' if len(x) < 5 else x for x in branch_seqs ]
    print len(branch_seqs)
    print len(quant_df)
    
    # One column per position of the branch sequence ('branch-0', ...);
    # the number of columns follows the length of the first entry.
    for n in range(len(branch_seqs[0])):
        pos = [x[n] for x in branch_seqs]
        quant_df['branch-'+str(n)] = pos
    
    print str(len(quant_df)-len(quant_df['branch score'].dropna()))+' introns without identifiable branches'
    return quant_df              
Example #17
0
def _junction_site_windows(fasta, chrom, five_site, three_site, strand):
    """Return the (5' site, 3' site) sequence windows for one junction.

    '+' strand windows are sliced directly from ``fasta[chrom]``; any other
    strand value is treated as '-' and the slices are reverse complemented,
    so callers must only pass '+' or '-'.
    """
    if strand == '+':
        return (fasta[chrom][(five_site - 1):(five_site + 7)],
                fasta[chrom][(three_site - 5):(three_site + 3)])
    return (SP.reverse_complement(fasta[chrom][(five_site - 6):(five_site + 2)]),
            SP.reverse_complement(fasta[chrom][(three_site - 2):(three_site + 6)]))


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    """Build a junction DataFrame annotated with splice-site sequences.

    Parameters
    ----------
    junction_bed : passed unchanged to ``build_junction_dict``.
    gff3_file : annotation file handed to ``SP.build_transcript_dict`` and
        ``build_junction_dict``.
    fasta : either a ``str`` (loaded with ``SP.make_fasta_dict``) or an
        object indexable as ``fasta[chrom][start:end]``.
    organism : forwarded to the transcript/junction builders.

    Returns
    -------
    pandas.DataFrame
        One row per junction with the columns filled from
        ``build_junction_dict`` plus 'sequence1', 'sequence2', 'seq type1',
        'seq type2', 'annotated sequence1', 'annotated sequence2' and
        'transcript'.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed,
                                        gff3_file,
                                        transcript_dict,
                                        organism=organism)
    junction_count = sum(len(junctions)
                         for junctions in junction_dict.values())

    junction_df = pd.DataFrame(index=range(junction_count),
                               columns=[
                                   'intron tuple', 'chromosome', 'start',
                                   'end', 'strand', 'depth', 'type', 'size',
                                   'annotated intron size',
                                   'annotated intron start',
                                   'annotated intron end'
                               ])
    n = 0
    for tx, junctions in junction_dict.items():
        for junction in junctions:
            # Label-based row assignment; the index is range(junction_count)
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        if strand in ('+', '-'):
            curr1, curr2 = _junction_site_windows(fasta, chrom, row['start'],
                                                  row['end'], strand)
            if row['annotated intron start'] is None:
                ann1 = ann2 = None
            else:
                ann1, ann2 = _junction_site_windows(
                    fasta, chrom, row['annotated intron start'],
                    row['annotated intron end'], strand)
        else:
            # Unknown strand: use placeholders for every window so the
            # dinucleotide lookups below never see the previous row's values.
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junction_df['sequence1'] = sequence1
    junction_df['sequence2'] = sequence2
    junction_df['seq type1'] = seq_type1
    junction_df['seq type2'] = seq_type2
    junction_df['annotated sequence1'] = ann_seq1
    junction_df['annotated sequence2'] = ann_seq2
    junction_df['transcript'] = df_tx

    return junction_df
Example #18
0
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Tabulate base usage in 8-column windows at annotated splice sites.

    The organism is chosen from the annotation file name: a name containing
    'pombe' (case-insensitive) uses the '.1' isoform suffix, anything else
    uses 'T0'.

    Parameters
    ----------
    gff3 : annotation file name passed to the ``SP`` parsers.
    fasta_dict : object indexable as ``fasta_dict[chrom][start:end]``; also
        passed to ``gc_content``.
    PSSM : when exactly ``False`` the matrices hold counts divided by the
        number of introns; when exactly ``True`` zero counts are replaced by
        1 and each cell becomes ``log2((count / introns) / nuc_prob[row])``.
        Any other value leaves raw counts.

    Returns
    -------
    tuple of two 4x8 numpy arrays (5' site, 3' site); rows are A, C, T, G.

    Side effect: sets numpy's global float print format to one decimal.
    """
    # Organism is inferred from the annotation file name
    is_pombe = 'pombe' in gff3.lower()
    if is_pombe:
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    isoform_suffix = '.1' if is_pombe else 'T0'
    row_of_base = {"A": 0, "C": 1, "T": 2, "G": 3}

    # Row order: A, C, T, G; one column per window position
    five_prime = np.zeros([4, 8])
    three_prime = np.zeros([4, 8])
    intron_total = 0

    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + isoform_suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            intron_total += 1

            # 5' splice site window
            if strand == '+':
                window = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
            elif strand == '-':
                window = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
            for col, base in enumerate(window):
                five_prime[row_of_base[base], col] += 1

            # 3' splice site window
            if strand == '+':
                window = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                window = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])
            for col, base in enumerate(window):
                three_prime[row_of_base[base], col] += 1

    np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

    denominator = float(intron_total)
    for row in range(4):
        if PSSM is False:
            five_prime[row] = five_prime[row] / denominator
            three_prime[row] = three_prime[row] / denominator
        if PSSM is True:
            # Pseudocount of one for unseen bases so log2 stays finite
            five_prime[row, five_prime[row] == 0] = 1
            three_prime[row, three_prime[row] == 0] = 1
            five_prime[row] = np.log2(
                (five_prime[row] / denominator) / nuc_prob[row])
            three_prime[row] = np.log2(
                (three_prime[row] / denominator) / nuc_prob[row])

    return (five_prime, three_prime)
Example #19
0
def get_junction_sequence(df, gff3_file, fasta_file):
    """Annotate junction rows with gene, flanking sequence and intron rank.

    Parameters
    ----------
    df : pandas.DataFrame with columns 'chr', 'coord_1', 'coord_2',
        'strand' and 'contained in'. Coordinates may be ints or numeric
        strings (surrounding whitespace is stripped).
    gff3_file : annotation file passed to ``SP.build_transcript_dict``.
    fasta_file : a ``str`` (loaded with ``make_fasta_dict``) or an object
        indexable as ``fasta[chrom][start:end]``.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by 'chr' with added columns 'Gene', 'intron'
        ('First', 'Middle', 'Last' or 'Only'), 'sequence1', 'sequence2' and
        'intron sequence'; rows whose 'contained in' is '' are dropped and
        the index is reset.
    """
    df = df.sort_values('chr', axis=0)

    # transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    # fasta_dict[chr] = sequence
    if isinstance(fasta_file, str):
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    transcript_by_chr = {}
    for transcript, coords in transcript_dict.items():
        transcript_by_chr.setdefault(coords[3], []).append(transcript)

    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    def to_int(value):
        # Coordinates may arrive as padded strings
        return int(str(value).strip())

    for n in df.index:
        coord1 = to_int(df.loc[n, 'coord_1'])
        coord2 = to_int(df.loc[n, 'coord_2'])
        chrom = df.loc[n, 'chr'].strip()
        strand = df.loc[n, 'strand'].strip()

        # A chromosome without annotated transcripts simply yields no match.
        # When several transcripts contain the junction, the last one wins.
        for transcript in transcript_by_chr.get(chrom, []):
            tx_start = transcript_dict[transcript][0]
            tx_stop = transcript_dict[transcript][1]
            tx_strand = transcript_dict[transcript][2]
            if strand == tx_strand and coord1 >= tx_start and coord2 <= tx_stop:
                df.loc[n, 'Gene'] = transcript

        if strand not in ('+', '-'):
            # Leave the default placeholders rather than reusing the
            # previous row's sequences.
            continue

        left = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
        right = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
        span = fasta_dict[chrom][(coord1 - 1):coord2]
        if strand == '+':
            sequence1, sequence2, all_seq = left, right, span
        else:
            # On the minus strand the higher coordinate comes first
            sequence1 = SP.reverse_complement(right)
            sequence2 = SP.reverse_complement(left)
            all_seq = SP.reverse_complement(span)

        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

    # Rank introns within every assigned transcript (not only those on the
    # chromosome of the last row processed above).
    for transcript in df['Gene'].unique():
        if transcript not in transcript_dict:
            continue  # e.g. the "Unknown" placeholder
        tx_strand = transcript_dict[transcript][2]
        tx_df = df[df['Gene'] == transcript]
        # Compare coordinates numerically, not as strings
        s = tx_df['coord_1'].map(to_int)
        min_idx = s.idxmin()
        first = int(s.min())
        max_idx = s.idxmax()
        last = int(s.max())

        if tx_strand == '+':
            low_label, high_label = 'First', 'Last'
        elif tx_strand == '-':
            low_label, high_label = 'Last', 'First'
        else:
            low_label = high_label = None

        if first == last:
            df.loc[min_idx, 'intron'] = 'Only'
        elif low_label is not None:
            df.loc[min_idx, 'intron'] = low_label
            df.loc[max_idx, 'intron'] = high_label

        if low_label is None:
            continue

        # Junctions within 10 nt of the outermost ones share their rank
        for index, coord_1 in zip(s.index, s.values):
            if df.loc[index, 'intron'] != 'Middle':
                continue
            if first - 10 <= coord_1 < first + 10:
                df.loc[index, 'intron'] = low_label
            elif last - 10 <= coord_1 < last + 10:
                df.loc[index, 'intron'] = high_label

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
Example #20
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a per-intron quantitation table from a peak DataFrame.

    Parameters
    ----------
    peak_df : pandas.DataFrame with columns 'type', 'looks like',
        'chromosome', 'position', 'height', 'transcript', 'strand' and
        'index' (the last one is dropped).
    gff3 : annotation file passed to the ``SP`` parsers.
    fa_dict : object indexable as ``fa_dict[chrom][start:end]``.
    organism : forwarded to ``SP.list_splice_sites`` and
        ``SP.backfill_splice_sites``.
    branch_file : passed to ``SP.find_score_branches_ppy`` as its second
        argument; defaults to the path that used to be hard-coded here.

    Returns
    -------
    The DataFrame returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    # Explicit copy so the column assignments below never target a slice
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Tallest peak at this coordinate represents it
        peak = quant_df[quant_df.index == coord].sort_values(
            'height', ascending=False).iloc[0]
        introns = ss_dict[peak['transcript']]
        same_tx = quant_df[quant_df['transcript'] == peak['transcript']]
        ag_peaks = same_tx[same_tx['type'] == 'AG']

        five_site = None
        three_site = None
        alt3 = False
        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            if len(ag_peaks) > 0:
                alt3 = True
        else:
            # NOTE(review): `in` on a Series tests the index labels (genome
            # coords here), not the 'type' values, so this looks like it is
            # never true. Behaviour kept as-is; confirm whether
            # `(same_tx['type'] == 'AG').any()` was intended.
            if 'AG' in same_tx['type']:
                five_site = peak['position']
                three_site = ag_peaks.sort_values(
                    'height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        chrom = peak['chromosome']
        if peak['strand'] == '+':
            s5 = fa_dict[chrom][five_site - 2:five_site + 6]
            s3 = fa_dict[chrom][three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(
                fa_dict[chrom][five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(
                fa_dict[chrom][three_site - 2:three_site + 6])
        else:
            # Unknown strand: skip instead of scoring the previous peak's
            # sequences again.
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]]
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df,
                                            gff3,
                                            fa_dict,
                                            pssm,
                                            organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)

    return new_quant_df
Example #21
0
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch data into per-intron and lariat tables.

    Parameters
    ----------
    junc_df : pandas.DataFrame of junctions. Reads columns 'type',
        'looks like', 'annotated intron size', 'junction size',
        'junction sequence1', 'junction sequence2', 'annotated 3p score',
        'annotated 5p score', 'genome coord', 'transcript', 'chromosome',
        'position' and 'strand'. Presumably indexed by genome coordinate
        (the index is merged against 'genome coord') -- confirm with caller.
    branch_df : pandas.DataFrame of branches. Reads columns 'genome coord',
        'depth', 'transcript', 'chromosome', '5p splice site',
        '3p splice site', 'branch site', 'strand', 'type', 'intron size'
        and '5p seq'.
    gff3, fa_dict, organism : forwarded to ``SP.generate_consensus_matrix``
        and ``backfill_splice_sites``; ``fa_dict`` is also sliced as
        ``fa_dict[chrom][start:end]``.

    Returns
    -------
    (new_quant_df, lariat_df)
        ``new_quant_df`` is the output of ``backfill_splice_sites`` with
        'seq5'/'seq3' expanded into 'Base 5-n'/'Base 3-n' columns;
        ``lariat_df`` holds one row per genome coord for junctions of type
        '3prime' or that look like 'AG'.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime')
                       & (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    # Unique coordinates in order of first appearance (a set is unordered
    # and is rejected as an index by recent pandas).
    new_quant_df = pd.DataFrame(index=quant_df.index.unique(),
                                columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # Several junctions from the same peak mean alternative splicing
        alt_splice.append(len(coord_df) > 1)

        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'], pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True,
        left_index=True)

    # Rows for branch-only coordinates are collected and concatenated once
    extra_rows = []
    extra_index = []
    for coord in branch_df['genome coord'].unique():
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort means "best" is the LOWEST depth row;
        # kept as-is, confirm whether descending was intended.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        strand = best['strand']
        chrom = best['chromosome']
        coord_dict = {
            'transcript': best['transcript'][:-2],
            'chromosome': chrom,
            'position': best['5p splice site'],
            'strand': strand,
            'type': best['type'],
            'intron size': best['intron size'],
            'alt splicing': len(coord_df) > 1,
            '5p score': np.NaN,
            '3p score': np.NaN,
            'seq5': '',
            'seq3': ''
        }

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        elif strand == '+':
            five = int(best['5p splice site'])
            coord_dict['seq5'] = fa_dict[chrom][(five - 1):(five + 7)]
        elif strand == '-':
            five = int(best['5p splice site'])
            coord_dict['seq5'] = SP.reverse_complement(
                fa_dict[chrom][(five - 6):(five + 2)])

        known_site = best['3p splice site']
        if pd.notnull(known_site) and str(known_site) != 'nan':
            three_site = known_site
        else:
            # No 3' site recorded: look for an NAG trinucleotide in the
            # 100 nt downstream of the branch site.
            branch_site = best['branch site']
            if strand == '+':
                after_branch = fa_dict[chrom][branch_site:branch_site + 100]
            elif strand == '-':
                after_branch = SP.reverse_complement(
                    fa_dict[chrom][branch_site - 100:branch_site])
            else:
                continue  # cannot orient the search without a strand

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                hit = after_branch.find(subs)
                if hit != -1:
                    offset = hit + 3
                    break
            if offset is None:
                # No candidate 3' site; skip rather than reuse a stale offset
                continue

            if strand == '+':
                three_site = branch_site + offset
            else:
                three_site = branch_site - offset
            coord_dict['intron size'] = abs(coord_dict['position'] -
                                            three_site)

        if strand == '+':
            coord_dict['seq3'] = fa_dict[chrom][int(three_site - 5):
                                                int(three_site) + 3]
        elif strand == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                fa_dict[chrom][int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict[
            '3p score'] = SP.simple_score_junction(coord_dict['seq5'],
                                                   coord_dict['seq3'], pssm)
        extra_rows.append(coord_dict)
        extra_index.append(coord)

    if extra_rows:
        extra_df = pd.DataFrame(extra_rows, index=extra_index)
        extra_df = extra_df.reindex(columns=new_quant_df.columns)
        new_quant_df = pd.concat([new_quant_df, extra_df])

    new_quant_df = backfill_splice_sites(new_quant_df,
                                         gff3,
                                         fa_dict,
                                         pssm,
                                         organism=organism)

    # One column per sequence position
    for n in range(len(new_quant_df['seq5'].iloc[0])):
        new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
    for n in range(len(new_quant_df['seq3'].iloc[0])):
        new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') |
                        (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(
        ['genome coord', 'annotated intron size'], ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[[
        'transcript', 'chromosome', 'position', 'strand', 'type'
    ]]

    return new_quant_df, lariat_df
Example #22
0
def _intron_context(n, splice_sites, strand, tx_start, tx_end):
    """Return (rank label, upstream exon size, downstream exon size) in kb.

    ``splice_sites`` must already be ordered 5'->3' for ``strand`` ('+' or
    '-'); ``n`` is the position of the intron within that list.
    """
    five_site, three_site = splice_sites[n]
    count = len(splice_sites)

    if n == 0:
        if strand == '+':
            upstream = (five_site - tx_start) / 1000.
            if count > 1:
                downstream = (splice_sites[1][0] - three_site) / 1000.
                if downstream < 0:
                    # Next intron overlaps this one: try the one after it
                    if count > 2:
                        downstream = (splice_sites[2][0] -
                                      three_site) / 1000.
                    else:
                        downstream = np.NaN
            else:
                downstream = (tx_end - three_site) / 1000.
        else:
            upstream = (tx_end - five_site) / 1000.
            if count > 1:
                downstream = (three_site - splice_sites[1][0]) / 1000.
                if downstream < 0:
                    if count > 2:
                        downstream = (three_site -
                                      splice_sites[2][0]) / 1000.
                    else:
                        downstream = np.NaN
            else:
                downstream = (three_site - tx_start) / 1000.
        return 'First', upstream, downstream

    upstream = (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.
    if n == count - 1:
        if strand == '+':
            downstream = (tx_end - three_site) / 1000.
        else:
            downstream = (three_site - tx_start) / 1000.
        return 'Last', upstream, downstream

    downstream = abs(three_site - splice_sites[n + 1][0]) / 1000.
    return 'Middle', upstream, downstream


def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Expand a peak table to every annotated intron of its transcripts.

    For each transcript in ``df`` every annotated (5' site, 3' site) pair is
    emitted. Introns whose 5' site lies within [peak - 5, peak + 5) of a
    peak in ``df`` reuse that peak's scores and sequences; the others are
    scored from ``fa_dict`` with ``SP.simple_score_junction``.

    Parameters
    ----------
    df : pandas.DataFrame with columns 'transcript', 'strand',
        'chromosome', 'position', '3p score', '5p score', 'alt splicing',
        'type', 'seq5' and 'seq3'; rows are looked up by the label
        '<chromosome>:<position>'.
    gff3 : annotation file passed to the ``SP`` parsers.
    fa_dict : object indexable as ``fa_dict[chrom][start:end]``.
    PSSM : scoring matrices passed to ``SP.simple_score_junction``.
    organism : 'pombe' selects the '.1' isoform suffix, anything else 'T0'.

    Returns
    -------
    pandas.DataFrame
        One row per annotated intron, indexed by '<chromosome>:<position>'.
        Transcripts whose strand is neither '+' nor '-' are skipped.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    columns = [
        'position', 'transcript', 'alt splicing', 'type', 'strand',
        'introns in transcript', 'intron size', 'chromosome', '5p score',
        '3p score', 'intron position', 'exon size (us)', 'exon size (ds)',
        'transcript size', 'peak', 'seq5', 'seq3'
    ]
    column_dict = dict((column, []) for column in columns)
    new_index = []
    iso_suffix = '.1' if organism == 'pombe' else 'T0'

    for tx in df['transcript'].unique():
        # Filter once per transcript instead of once per intron and peak
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        if strand not in ('+', '-'):
            # Orientation unknown: the intron order and windows below are
            # undefined, so leave this transcript out.
            continue
        chrom = tx_rows.iloc[0]['chromosome']
        peaks = tx_rows['position']
        iso = tx + iso_suffix

        # Order introns 5'->3' along the transcript
        splice_sites = sorted(ss_dict[tx],
                              key=lambda x: x[0],
                              reverse=(strand == '-'))

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            matched_peak = None
            for peak in peaks:
                if int(peak) - 5 <= five_site < int(peak) + 5:
                    matched_peak = peak
                    break

            tx_start = tx_dict[iso][0]
            tx_end = tx_dict[iso][1]
            label, upstream, downstream = _intron_context(
                n, splice_sites, strand, tx_start, tx_end)

            record = {
                'transcript': tx,
                'intron size': abs(three_site - five_site),
                'introns in transcript': len(splice_sites),
                'strand': strand,
                'chromosome': chrom,
                'transcript size': (tx_end - tx_start) / 1000.,
                'intron position': label,
                'exon size (us)': upstream,
                'exon size (ds)': downstream,
            }

            if matched_peak is not None:
                peak_index = chrom + ':' + str(int(matched_peak))
                record['position'] = matched_peak
                record['peak'] = True
                for column in ('3p score', '5p score', 'alt splicing',
                               'type', 'seq5', 'seq3'):
                    record[column] = df.loc[peak_index, column]
            else:
                record['alt splicing'] = False
                record['type'] = '5prime'
                record['peak'] = False

                # Get position, index and sequence for scoring
                if strand == '+':
                    position = five_site + 1
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):
                                               (three_site + 3)]
                else:
                    position = five_site - 1
                    sequence1 = SP.reverse_complement(
                        fa_dict[chrom][(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        fa_dict[chrom][(three_site - 2):(three_site + 6)])

                peak_index = chrom + ':' + str(position)
                record['position'] = position
                record['seq5'] = sequence1
                record['seq3'] = sequence2

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                record['3p score'] = score_3
                record['5p score'] = score_5

            # Append every column together so the lists stay aligned
            new_index.append(peak_index)
            for column in columns:
                column_dict[column].append(record[column])

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=columns, index=new_index)
    for column in columns:
        new_df[column] = column_dict[column]

    return new_df
Example #23
0
def _best_branch_by_search(intron_seq, branch_dict, PSSM):
    """Locate the best-scoring known branch motif inside ``intron_seq``.

    Every key of ``branch_dict`` that occurs in the intron is a candidate,
    positioned at its first occurrence. The candidate with the highest score
    wins; ties go to the motif listed first in ``branch_dict``. When no motif
    occurs, the five bases around the 'A' found by
    ``intron_seq[:-3].rfind('A')`` are scored with ``score_branch`` instead.

    Returns:
        Tuple ``(index, sequence, score)``.
    """
    matches = [(intron_seq.index(branch), branch, score)
               for branch, score in branch_dict.items()
               if branch in intron_seq]
    if matches:
        # sorted() is stable, so equal scores keep the order of branch_dict.
        return sorted(matches, key=lambda x: x[2], reverse=True)[0]

    # Find the closest A (rfind returns -1 if the intron contains none, in
    # which case the slice below is normally empty).
    best_ix = intron_seq[:-3].rfind('A')
    seq = intron_seq[best_ix - 3:best_ix + 2]
    return (best_ix, seq, score_branch(seq, PSSM))


def _downstream_percent_py(ix, intron_seq, branch_ix):
    """Return ``percent_py`` for the intron region after a branch motif.

    The region starts five bases past ``branch_ix``. ``np.nan`` is returned
    when fewer than two bases remain. Introns with an 'N' in the region are
    printed (row label, then sequence) before scoring.
    """
    if len(intron_seq) - branch_ix - 5 <= 1:
        return np.nan
    downstream = intron_seq[branch_ix + 5:]
    if 'N' in downstream:
        print(ix)
        print(intron_seq)
    return percent_py(downstream)


def find_score_branches_ppy(quant_df, peak_branch_df, fa_dict):
    """Annotate each intron in ``quant_df`` with branch-point information.

    Args:
        quant_df: DataFrame with 'strand', 'chromosome', 'position' and
            'intron size' columns. It is modified in place and returned.
        peak_branch_df: Either a DataFrame of mapped branches (columns
            'Branch seq', 'genome coord', 'depth', 'branch site',
            '5p splice site', 'Branch to 3p distance') or a ``str`` path to a
            text file with one branch motif per line.
        fa_dict: Mapping of chromosome name to sequence string.

    Returns:
        ``quant_df`` with added columns 'branch score',
        'branch to 3p distance', 'percent pPy' and one 'branch-<n>' column per
        base of the chosen branch sequence.
    """
    PSSM = branch_PSSM(peak_branch_df, fa_dict)

    if isinstance(peak_branch_df, str):
        with open(peak_branch_df) as f:
            branches = [line.strip() for line in f]
        mapped_coords = None
    else:
        branches = peak_branch_df['Branch seq'].str[2:7]
        # Membership must be tested against the column *values*; the ``in``
        # operator on a Series looks at its index labels instead.
        mapped_coords = set(peak_branch_df['genome coord'])

    # Sort branches by abundance so that the most common ones are first in the
    # search. Motifs seen only once are ignored.
    br_abund = []
    for branch in set(branches):
        count = sum(1 for x in branches if x == branch)
        if count > 1:
            br_abund.append((branch, count))
    br_abund.sort(key=lambda x: x[1], reverse=True)

    branch_dict = collections.OrderedDict()
    for branch, _ in br_abund:
        branch_dict[branch] = score_branch(branch, PSSM)

    branch_3_dist = []
    branch_score = []
    branch_seqs = []
    perc_py = []
    for ix, r in quant_df.iterrows():
        # NOTE(review): quant_from_peak_df stores 'intron size' divided by
        # 1000 before calling this function, while the slices below add it to
        # a base position - confirm the units callers pass in.
        if r['strand'] == '+':
            intron_seq = fa_dict[
                r['chromosome']][int(r['position']):int(r['position'] +
                                                        r['intron size'])]
        elif r['strand'] == '-':
            intron_seq = fa_dict[r['chromosome']][int(r['position'] -
                                                      r['intron size'] -
                                                      1):int(r['position'] -
                                                             1)]
            intron_seq = SP.reverse_complement(intron_seq)

        if mapped_coords is not None and ix in mapped_coords:
            # An experimentally mapped branch exists: use the deepest one.
            ix_df = peak_branch_df[peak_branch_df['genome coord'] == ix]
            top = ix_df.sort_values('depth', ascending=False).iloc[0]
            branch_ix = int(abs(top['5p splice site'] - top['branch site']))

            seq = top['Branch seq'][2:7]
            branch_seqs.append(seq)
            branch_score.append(score_branch(seq, PSSM))
            # NOTE(review): stored as-is, whereas the searched branches below
            # are divided by 1000 - confirm this column's units.
            branch_3_dist.append(top['Branch to 3p distance'])
            perc_py.append(_downstream_percent_py(ix, intron_seq, branch_ix))
        else:
            best_branch = _best_branch_by_search(intron_seq, branch_dict,
                                                 PSSM)
            branch_3_dist.append(
                (len(intron_seq) - best_branch[0] - 4) / 1000.)
            branch_score.append(best_branch[2])
            branch_seqs.append(best_branch[1])
            perc_py.append(
                _downstream_percent_py(ix, intron_seq, best_branch[0]))

    print(len(quant_df))
    print(len(branch_score))

    quant_df['branch score'] = branch_score
    quant_df['branch to 3p distance'] = branch_3_dist
    quant_df['percent pPy'] = perc_py

    # Sequences shorter than five bases cannot be split per position.
    branch_seqs = ['NNNNN' if len(x) < 5 else x for x in branch_seqs]
    print(len(branch_seqs))
    print(len(quant_df))

    if branch_seqs:
        for n in range(len(branch_seqs[0])):
            quant_df['branch-' + str(n)] = [x[n] for x in branch_seqs]

    print(str(len(quant_df) - len(quant_df['branch score'].dropna())) +
          ' introns without identifiable branches')
    return quant_df
Example #24
0
def collect_intron_seq(gff3_file,
                       fasta_file,
                       ss_dict=None,
                       junction_bed=None,
                       gene_list=None,
                       peak_df=None,
                       organism=None):
    """Collect a 15-base window near the first coordinate of every intron.

    The splice-site source is chosen in this order: an explicit ``ss_dict``,
    ``junction_bed`` (via ``SP.build_junction_dict``), ``peak_df`` (rows whose
    'type' does not contain 'prime'), and finally the GFF3 annotation itself.

    ``fasta_file`` may be a ready-made ``dict``, a path ending in 'json', or
    any other path, which is handed to ``make_fasta_dict``.

    Returns:
        Dict keyed by '<transcript>-<chrom>:<coordinate>' whose values are the
        extracted sequences (reverse-complemented on the '-' strand).
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # Resolve the genome into a chromosome -> sequence mapping.
    if type(fasta_file) is dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    # Resolve the splice sites unless the caller supplied them.
    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed,
                                             gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, row in peak_df.iterrows():
                sites = ss_dict.setdefault(row['transcript'], [])
                # The second coordinate is a 50-base placeholder.
                if row['strand'] == '+':
                    sites.append((row['position'], row['position'] + 50))
                elif row['strand'] == '-':
                    sites.append((row['position'], row['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(gff3_file,
                                                        gene_list=gene_list,
                                                        organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        # Without a junction BED the keys get an isoform suffix appended
        # before the transcript_dict lookup.
        if junction_bed is None:
            if organism == 'pombe':
                transcript = transcript + '.1'
            else:
                transcript = transcript + 'T0'
        introns = list(introns)
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]

        for intron in introns:
            site = intron[0]
            if strand == '+':
                window = fasta_dict[chrom][site + 2:site + 17]
                seq_dict[transcript + '-' + chrom + ':' +
                         str(site + 1)] = window
            elif strand == '-':
                window = fasta_dict[chrom][site - 16:site - 1]
                seq_dict[transcript + '-' + chrom + ':' +
                         str(site)] = SP.reverse_complement(window)
    return seq_dict
Example #25
0
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Build per-splice-site quantification and lariat tables.

    Args:
        junc_df: Junction DataFrame with (at least) the columns 'type',
            'looks like', 'genome coord', 'annotated intron size',
            'junction size', 'junction sequence1/2', 'annotated 3p/5p score',
            'transcript', 'chromosome', 'position' and 'strand'. Rows sharing
            an index label are treated as junctions from the same site.
        branch_df: Branch DataFrame with 'genome coord', 'depth',
            'transcript', 'chromosome', '5p splice site', '3p splice site',
            'branch site', 'strand', 'type', 'intron size' and '5p seq'.
        gff3: Annotation path passed to ``SP.generate_consensus_matrix`` and
            ``backfill_splice_sites``.
        fa_dict: Mapping of chromosome name to sequence string.
        organism: Forwarded to ``backfill_splice_sites``.

    Returns:
        Tuple ``(new_quant_df, lariat_df)``. ``new_quant_df`` has one row per
        site, with the junction sequences expanded into 'Base 5-<n>' and
        'Base 3-<n>' columns. ``lariat_df`` holds the rows of ``junc_df`` with
        type '3prime' or 'looks like' == 'AG', de-duplicated by genome coord.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime') & (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    new_quant_df = pd.DataFrame(index=set(quant_df.index), columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # Determine if multiple junctions come from this peak
        alt_splice.append(len(coord_df) > 1)

        # .iloc[0] picks the first row after sorting; the deprecated .ix[0]
        # only did so because the index labels are not integers.
        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size', ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size', ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            scores = SP.simple_score_junction(top['junction sequence1'], top['junction sequence2'], pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']], right_index=True, left_index=True)

    # Add sites that are only supported by branch reads.
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort makes ``best`` the *lowest*-depth row;
        # find_score_branches_ppy sorts 'depth' descending - confirm intent.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        chrom = best['chromosome']
        strand = best['strand']
        # NOTE(review): 'intron size' is taken as-is here, while the
        # junction-derived rows above are divided by 1000 - confirm units.
        coord_dict = {'transcript': best['transcript'][:-2],
                      'chromosome': chrom,
                      'position': best['5p splice site'],
                      'strand': strand,
                      'type': best['type'],
                      'intron size': best['intron size'],
                      'alt splicing': len(coord_df) > 1,
                      '5p score': np.nan,
                      '3p score': np.nan,
                      'seq5': '', 'seq3': ''}

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five_site = int(best['5p splice site'])
            if strand == '+':
                coord_dict['seq5'] = fa_dict[chrom][(five_site - 1):(five_site + 7)]
            elif strand == '-':
                coord_dict['seq5'] = SP.reverse_complement(fa_dict[chrom][(five_site - 6):(five_site + 2)])

        if str(best['3p splice site']) != 'nan':
            three_site = best['3p splice site']
        else:
            # No mapped 3' site: look for an AG in the 100 bases downstream
            # of the branch, trying the motifs in the listed order.
            branch_site = int(best['branch site'])
            if strand == '+':
                after_branch = fa_dict[chrom][branch_site:branch_site + 100]
            elif strand == '-':
                # max() keeps a negative start from wrapping around.
                after_branch = fa_dict[chrom][max(branch_site - 100, 0):branch_site]
                after_branch = SP.reverse_complement(after_branch)
            else:
                continue

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                if subs in after_branch:
                    offset = after_branch.find(subs) + 3
                    break
            if offset is None:
                # No candidate 3' site; previously this reused a stale offset
                # from another site or raised NameError.
                continue

            if strand == '-':
                three_site = best['branch site'] - offset
            else:
                three_site = best['branch site'] + offset
            coord_dict['intron size'] = abs(coord_dict['position'] - three_site)

        if strand == '+':
            coord_dict['seq3'] = fa_dict[chrom][int(three_site - 5):int(three_site) + 3]
        elif strand == '-':
            coord_dict['seq3'] = SP.reverse_complement(fa_dict[chrom][int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict['3p score'] = SP.simple_score_junction(coord_dict['seq5'], coord_dict['seq3'], pssm)
        coord_s = pd.Series(coord_dict, name=coord)
        new_quant_df = new_quant_df.append(coord_s)

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm, organism=organism)

    # Expand the junction sequences into one column per base; the width is
    # taken from the first row.
    if len(new_quant_df) > 0:
        for n in range(len(new_quant_df['seq5'].iloc[0])):
            new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
        for n in range(len(new_quant_df['seq3'].iloc[0])):
            new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') | (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(['genome coord', 'annotated intron size'], ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[['transcript', 'chromosome', 'position', 'strand', 'type']]

    return new_quant_df, lariat_df
Example #26
0
def _tally_bases(matrix, seq, base_dict):
    """Add one count per base of ``seq`` to the matching row/column of ``matrix``.

    Bases are upper-cased first; characters missing from ``base_dict`` (e.g.
    'N') are skipped instead of raising ``KeyError``.
    """
    for col, base in enumerate(seq):
        row = base_dict.get(base.upper())
        if row is not None:
            matrix[row, col] += 1


def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build 4x8 base matrices for annotated 5' and 3' splice sites.

    Rows are ordered A, C, T, G; columns are the eight positions of the window
    taken around each splice site.

    Args:
        gff3: Annotation path. If it contains 'pombe' (case-insensitive) the
            annotation is parsed with ``organism='pombe'`` and transcripts get
            a '.1' suffix; otherwise the suffix is 'T0'.
        fasta_dict: Mapping of chromosome name to sequence string.
        PSSM: ``False`` returns per-position frequencies (counts divided by
            the number of introns). ``True`` returns log2 of frequency over
            the background from ``gc_content``, with zero counts replaced by
            one first. Any other value leaves the raw counts untouched.

    Returns:
        Tuple ``(pos_matrix_5prime, pos_matrix_3prime)`` of numpy arrays.

    Side effects:
        Sets numpy's global float print format to one decimal place.
    """
    # Populate gene dictionary and build genome
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, _flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        organism = None
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, _flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}

    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])

    suffix = '.1' if organism == 'pombe' else 'T0'
    intron_count = 0
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            if strand == '+':
                five = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
                three = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                five = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
                three = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])
            else:
                # Unknown strand: nothing sensible to count.
                continue

            intron_count += 1
            _tally_bases(pos_matrix_5prime, five, base_dict)
            _tally_bases(pos_matrix_3prime, three, base_dict)

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind': float_formatter})

    total = float(intron_count)
    if PSSM is False:
        pos_matrix_5prime = pos_matrix_5prime / total
        pos_matrix_3prime = pos_matrix_3prime / total
    if PSSM is True:
        # One background probability per row, broadcast across the columns.
        background = np.array([nuc_prob[a] for a in range(4)],
                              dtype=float).reshape(4, 1)
        # Pseudocount of one so that log2 is defined for unseen bases.
        pos_matrix_5prime[pos_matrix_5prime == 0] = 1
        pos_matrix_3prime[pos_matrix_3prime == 0] = 1
        pos_matrix_5prime = np.log2((pos_matrix_5prime / total) / background)
        pos_matrix_3prime = np.log2((pos_matrix_3prime / total) / background)

    return (pos_matrix_5prime, pos_matrix_3prime)
Example #27
0
def get_junction_sequence(df, gff3_file, fasta_file):
    """Attach gene, sequence and intron-rank annotations to junction rows.

    Args:
        df: Junction DataFrame with string columns 'chr', 'strand', 'coord_1'
            and 'coord_2' (surrounding whitespace is stripped) and a
            'contained in' column used for the final filter. Index labels are
            expected to be unique.
        gff3_file: Annotation path for ``SP.build_transcript_dict``.
        fasta_file: Either a ``str`` path (loaded with ``make_fasta_dict``) or
            a ready-made chromosome -> sequence mapping.

    Returns:
        A sorted copy of ``df`` with the added columns 'Gene', 'intron'
        ('First', 'Last', 'Only' or 'Middle'), 'sequence1', 'sequence2' and
        'intron sequence'. Rows whose 'contained in' is '' are dropped and the
        index is reset.
    """
    df = df.sort_values('chr', axis=0)

    # transcript_dict[transcript] = [start, end, strand, chromosome, CDS start, CDS end]
    transcript_dict = SP.build_transcript_dict(gff3_file)

    # NOTE(review): the result of this call is never used; it is kept only in
    # case callers rely on it failing early for a bad annotation file.
    splice_dict, flag = SP.list_splice_sites(gff3_file)

    # fasta_dict[chr] = sequence
    if isinstance(fasta_file, str):
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    transcript_by_chr = {}
    for transcript, coords in transcript_dict.iteritems():
        transcript_by_chr.setdefault(coords[3], []).append(transcript)

    df['Gene'] = "Unknown"
    df['intron'] = "Middle"
    df['sequence1'] = ''
    df['sequence2'] = ''
    df['intron sequence'] = 'No sequence here'

    # Iterate over the actual index labels so the lookups below stay valid
    # regardless of how the incoming frame was indexed.
    for n in df.index:
        coord1 = int(df.loc[n, 'coord_1'].strip())
        coord2 = int(df.loc[n, 'coord_2'].strip())
        chrom = df.loc[n, 'chr'].strip()
        strand = df.loc[n, 'strand'].strip()

        # The last same-strand transcript that fully contains the junction
        # wins.
        for transcript in transcript_by_chr.get(chrom, []):
            start = transcript_dict[transcript][0]
            stop = transcript_dict[transcript][1]
            tx_strand = transcript_dict[transcript][2]
            if strand == tx_strand and coord1 >= start and coord2 <= stop:
                df.loc[n, 'Gene'] = transcript

        if strand == '+':
            sequence1 = fasta_dict[chrom][(coord1 - 3):(coord1 + 5)]
            sequence2 = fasta_dict[chrom][(coord2 - 6):(coord2 + 2)]
            all_seq = fasta_dict[chrom][(coord1 - 1):coord2]
        elif strand == '-':
            sequence1 = SP.reverse_complement(
                fasta_dict[chrom][(coord2 - 6):(coord2 + 2)])
            sequence2 = SP.reverse_complement(
                fasta_dict[chrom][(coord1 - 3):(coord1 + 5)])
            all_seq = SP.reverse_complement(
                fasta_dict[chrom][(coord1 - 1):coord2])
        else:
            # Unknown strand: keep the placeholder values set above rather
            # than reusing the previous row's sequences.
            continue

        df.loc[n, 'sequence1'] = sequence1
        df.loc[n, 'sequence2'] = sequence2
        df.loc[n, 'intron sequence'] = all_seq

    # Rank junctions within every assigned transcript (previously only the
    # transcripts on the last row's chromosome were visited).
    for transcript in set(df['Gene']):
        if transcript == "Unknown":
            continue
        tx_strand = transcript_dict[transcript][2]

        # Compare coordinates numerically, not as strings.
        coords = df.loc[df['Gene'] == transcript, 'coord_1'].apply(
            lambda c: int(c.strip()))
        min_idx = coords.idxmin()
        max_idx = coords.idxmax()
        first = int(coords.min())
        last = int(coords.max())

        if tx_strand == '+':
            low_label, high_label = 'First', 'Last'
        elif tx_strand == '-':
            low_label, high_label = 'Last', 'First'
        else:
            low_label = high_label = None

        if first == last:
            df.loc[min_idx, 'intron'] = 'Only'
        elif low_label is not None:
            df.loc[min_idx, 'intron'] = low_label
            df.loc[max_idx, 'intron'] = high_label

        if low_label is None:
            continue

        # Junctions within 10 bases of the outermost ones share their rank.
        for index, coord_1 in zip(coords.index, coords):
            if df.loc[index, 'intron'] != 'Middle':
                continue
            if first - 10 <= coord_1 < first + 10:
                df.loc[index, 'intron'] = low_label
            elif last - 10 <= coord_1 < last + 10:
                df.loc[index, 'intron'] = high_label

    df = df[df['contained in'] != '']
    df = df.reset_index()
    return df
Example #28
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a quantification table of splice sites from called peaks.

    Args:
        peak_df: Peak DataFrame with the columns 'type', 'looks like',
            'chromosome', 'position', 'strand', 'transcript', 'height' and
            'index' (the last one is dropped).
        gff3: Annotation path for the consensus matrix and splice-site list.
        fa_dict: Mapping of chromosome name to sequence string.
        organism: Forwarded to ``SP.list_splice_sites`` and
            ``SP.backfill_splice_sites``.
        branch_file: Path to the branch motif list handed to
            ``SP.find_score_branches_ppy``. The default is the location that
            used to be hard-coded here.

    Returns:
        DataFrame indexed by genome coord, as returned by
        ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Copy so the column assignments below do not target a view of peak_df.
    quant_df = peak_df[(peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [], '5p score': [], '3p score': [], 'seq5': [], 'seq3': []}
    new_index = []

    for coord in quant_df.index:
        # Represent each coordinate by its tallest peak.
        coord_df = quant_df[quant_df.index == coord]
        peak = coord_df.sort_values('height', ascending=False).iloc[0]

        five_site = None
        three_site = None
        alt3 = False

        # 'AG' peaks on the same transcript (candidate 3' sites).
        tx_ag = quant_df[(quant_df['transcript'] == peak['transcript']) & (quant_df['type'] == 'AG')]
        introns = ss_dict[peak['transcript']]

        if 'prime' in peak['type']:
            # Match the peak to an annotated intron starting within
            # [position - 5, position + 5).
            lower = peak['position'] - 5
            upper = peak['position'] + 5
            for intron in introns:
                if lower <= intron[0] < upper:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(tx_ag) > 0
        elif len(tx_ag) > 0:
            # Unannotated site: pair it with the tallest 'AG' peak. (The old
            # test used ``'AG' in <Series>``, which inspects the index and
            # therefore never matched.)
            five_site = peak['position']
            three_site = tx_ag.sort_values('height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        chrom_seq = fa_dict[peak['chromosome']]
        # NOTE(review): the '+' windows are shifted by one base relative to
        # generate_consensus_matrix (which uses -1:+7 and -5:+3) while the
        # '-' windows are identical - confirm the coordinate convention.
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])
        else:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][['genome coord', 'chromosome',
                                                             'strand', 'transcript', 'position', 'type']].copy()
    # The lists were filled in index order, one entry per kept row.
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm, organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file, fa_dict)

    return new_quant_df
Example #29
0
def _junction_windows(chrom_seq, five_site, three_site, strand):
    """Return the 8-base windows around a junction's 5' and 3' ends.

    ``strand`` must be '+' or '-'; '-' windows are reverse-complemented.
    """
    if strand == '+':
        return (chrom_seq[(five_site - 1):(five_site + 7)],
                chrom_seq[(three_site - 5):(three_site + 3)])
    return (SP.reverse_complement(chrom_seq[(five_site - 6):(five_site + 2)]),
            SP.reverse_complement(chrom_seq[(three_site - 2):(three_site + 6)]))


def build_junction_df(junction_bed, gff3_file, fasta, organism=None):
    """Tabulate junctions with the sequences flanking each splice site.

    Args:
        junction_bed: Junction BED input for ``build_junction_dict``.
        gff3_file: Annotation path.
        fasta: ``str`` path (loaded with ``SP.make_fasta_dict``) or a
            chromosome -> sequence mapping.
        organism: Forwarded to the annotation and junction parsers.

    Returns:
        DataFrame with one row per junction. Besides the parsed junction
        fields it holds 'sequence1'/'sequence2' (observed windows),
        'annotated sequence1'/'annotated sequence2' (``None`` when the
        junction has no annotated intron), 'seq type1'/'seq type2' and
        'transcript'. Rows whose strand is neither '+' nor '-' get
        'NNNNNNNN' for all four sequences.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if isinstance(fasta, str):
        fasta = SP.make_fasta_dict(fasta)
    junction_dict = build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)

    columns = ['intron tuple', 'chromosome', 'start', 'end', 'strand', 'depth',
               'type', 'size', 'annotated intron size',
               'annotated intron start', 'annotated intron end']
    junction_count = sum(len(junctions) for junctions in junction_dict.values())
    junction_df = pd.DataFrame(index=range(junction_count), columns=columns)

    n = 0
    for tx, junctions in junction_dict.iteritems():
        for junction in junctions:
            # The index is 0..junction_count-1, so label-based .loc addresses
            # the same row the deprecated .ix did.
            junction_df.loc[n] = [tx] + junction
            n += 1

    sequence1 = []
    sequence2 = []
    ann_seq1 = []
    ann_seq2 = []
    seq_type1 = []
    seq_type2 = []
    df_tx = []
    for index, row in junction_df.iterrows():
        df_tx.append(row['intron tuple'][0])
        chrom = convert_chrom(row['chromosome'])
        strand = row['strand']

        if strand == '+' or strand == '-':
            curr1, curr2 = _junction_windows(fasta[chrom], row['start'],
                                             row['end'], strand)
            # isnull() also catches NaN, which ``is None`` let through to the
            # slicing code.
            if pd.isnull(row['annotated intron start']):
                ann1 = ann2 = None
            else:
                ann1, ann2 = _junction_windows(
                    fasta[chrom], row['annotated intron start'],
                    row['annotated intron end'], strand)
        else:
            # Define curr1/curr2 here too: they are read below and used to be
            # left over from the previous row (or undefined on the first).
            curr1 = curr2 = ann1 = ann2 = 'NNNNNNNN'

        sequence1.append(curr1)
        sequence2.append(curr2)
        ann_seq1.append(ann1)
        ann_seq2.append(ann2)

        # Unannotated ends are labelled with a dinucleotide taken from the
        # observed window.
        if row['type'] == 'Annotated':
            seq_type1.append('5p annotated')
            seq_type2.append('3p annotated')
        elif row['type'] == '5p tethered':
            seq_type1.append('5p annotated')
            seq_type2.append(curr2[4:6])
        else:
            seq_type1.append(curr1[2:4])
            seq_type2.append(curr2[4:6])

    junc_seq_df = junction_df
    junc_seq_df['sequence1'] = sequence1
    junc_seq_df['sequence2'] = sequence2
    junc_seq_df['seq type1'] = seq_type1
    junc_seq_df['seq type2'] = seq_type2
    junc_seq_df['annotated sequence1'] = ann_seq1
    junc_seq_df['annotated sequence2'] = ann_seq2
    junc_seq_df['transcript'] = df_tx

    return junc_seq_df