Example #1
0
def write_intron_fasta(transcript_dict, fasta_dict, prefix='introns', sense=True):
    """Cut the sequence between consecutive CDS segments and write it as FASTA.

    For each transcript one sequence is sliced out of ``fasta_dict[chrom]``
    per adjacent pair of CDS segments and stored as ``<transcript>_<n>``.
    Minus-strand slices are reverse complemented; ``sense=False`` reverse
    complements every sequence once more.

    Parameters
    ----------
    transcript_dict : dict
        Transcript name -> sequence whose items 2..5 are read as strand,
        chromosome, CDS start list and CDS end list.
    fasta_dict : dict
        Chromosome name -> sliceable sequence.
    prefix : str
        Output is written to ``<prefix>.fa``.
    sense : bool
        Only the exact value ``False`` triggers the extra reverse complement.

    Returns
    -------
    dict
        Header -> sequence, the same records that were written to disk.
    """
    seq_dict = {}
    for tx_name, info in transcript_dict.iteritems():
        strand, chrom = info[2], info[3]
        cds_starts, cds_ends = info[4], info[5]
        n_cds = len(cds_starts)

        for n in range(n_cds - 1):
            if strand == '+':
                seq = fasta_dict[chrom][cds_ends[n]:cds_starts[n + 1] - 1]
            elif strand == '-':
                # Minus strand walks the CDS lists from the far end.
                idx = n_cds - n - 1
                seq = SP.reverse_complement(
                    fasta_dict[chrom][cds_ends[idx]:cds_starts[idx - 1] - 1])

            if sense is False:
                seq = SP.reverse_complement(seq)

            seq_dict['_'.join([tx_name, str(n)])] = seq

    out_name = '{}.fa'.format(prefix)
    with open(out_name, 'w') as fout:
        for header, seq in seq_dict.iteritems():
            fout.write('>' + header + '\n')
            fout.write(seq + '\n')
    return seq_dict
Example #2
0
def add_seq(branch_df, fa_dict):
    """Add 5' splice-site and branch-site sequences to ``branch_df``.

    A 16-nt window centred on each site is read from ``fa_dict`` (reverse
    complemented for '-' strand rows).  The window is then re-registered on
    a motif found inside positions 4..10 of the window: 'GT' for the 5'
    site, and the first of 'AG', 'AA', 'GA' for the branch site.

    The motif position is searched only inside that sub-window.  Searching
    the whole window (as ``str.index`` without bounds does) can return an
    earlier copy of the motif, which yields a negative slice start and an
    empty or misplaced sequence.

    Parameters
    ----------
    branch_df : pandas.DataFrame
        Needs the columns 'chromosome', 'strand', '5p splice site' and
        'branch site'.  The columns '5p seq' and 'Branch seq' are added to
        this frame in place.
    fa_dict : dict
        Chromosome name -> sequence string.

    Returns
    -------
    pandas.DataFrame
        Rows of ``branch_df`` whose 'Branch seq' has 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    receptors = ['AG', 'AA', 'GA']
    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        chrom_seq = fa_dict[r['chromosome']]
        five = chrom_seq[r['5p splice site'] - 8:r['5p splice site'] + 8]
        branch = chrom_seq[r['branch site'] - 8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # Bounded search: only a GT that starts within [4, 11) counts.
        gt_pos = five.find('GT', 4, 11)
        if gt_pos != -1:
            five = five[gt_pos - 2:gt_pos + 6]
        else:
            five = five[4:12]

        # Motifs are tried in priority order; the first one present wins.
        for motif in receptors:
            hit = branch.find(motif, 4, 11)
            if hit != -1:
                branch = branch[hit - 4:hit + 4]
                break
        else:
            # No motif: this slice never passes the filter below.
            branch = branch[4:13]

        five_seqs.append(five)
        branch_seqs.append(branch)

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs

    return branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
Example #3
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect an 8-nt sequence around every 5' and 3' splice site.

    Splice sites come from ``SP.list_splice_sites`` collapsed with
    ``SP.collapse_ss_dict``; strand and chromosome are looked up on the
    isoform named ``<transcript>.1`` for 'pombe' and ``<transcript>T0``
    otherwise.  Minus-strand windows are reverse complemented.

    Returns
    -------
    tuple of list
        ``(all_seq5, all_seq3)`` in the iteration order of the splice sites.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    suffix = '.1' if organism == 'pombe' else 'T0'

    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.iteritems():
        info = transcript_dict[transcript + suffix]
        strand = info[2]
        chrom = info[3]

        for five_site, three_site in ((i[0], i[1]) for i in introns):
            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])

            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Example #4
0
def add_intron_size(peaks_df, gff3, organism=None):
    """Annotate intronic peaks with the size of the intron containing them.

    Splice-site pairs are obtained from ``SP.list_splice_sites`` collapsed
    with ``SP.collapse_ss_dict``.  For each row whose 'type' is 'intronic',
    the first pair of its transcript that spans 'position' (inclusive, in
    either coordinate order) supplies the size; every other row gets NaN.

    The bookkeeping of introns without peaks is done on a separate copy.
    Removing matched pairs from the lookup table itself would make a second
    peak in the same intron fail to match and receive NaN.

    Parameters
    ----------
    peaks_df : pandas.DataFrame
        Needs the columns 'type', 'transcript' and 'position'.  The column
        'intron size' is added in place.
    gff3, organism
        Passed through to ``SP.list_splice_sites``.

    Returns
    -------
    tuple
        ``(peaks_df, no_peaks)`` where ``no_peaks`` has the layout of the
        collapsed splice-site dict minus every pair that contained a peak.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    # Independent copy: ss_dict must stay intact for later lookups.
    no_peaks = copy.deepcopy(ss_dict)

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        size = np.nan
        if row['type'] == 'intronic':
            transcript = row['transcript']
            position = row['position']
            for pair in ss_dict[transcript]:
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    # Already gone if an earlier peak hit the same intron.
                    if pair in no_peaks[transcript]:
                        no_peaks[transcript].remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
Example #5
0
def add_intron_size(peaks_df, gff3, organism=None):
    """Add an 'intron size' column for peaks that fall inside an intron.

    Intron boundaries are the splice-site pairs returned by
    ``SP.list_splice_sites`` after ``SP.collapse_ss_dict``.  Rows whose
    'type' is not 'intronic', or whose 'position' lies in none of the
    transcript's pairs, receive NaN.

    Introns that contained a peak are removed from a *copy* of the
    splice-site table.  Removing them from the table used for matching
    would leave any further peak in the same intron unmatched.

    Parameters
    ----------
    peaks_df : pandas.DataFrame
        Read columns: 'type', 'transcript', 'position'.  Modified in place.
    gff3, organism
        Forwarded to ``SP.list_splice_sites``.

    Returns
    -------
    tuple
        ``(peaks_df, no_peaks)``; ``no_peaks`` holds the pairs in which no
        peak was found.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    no_peaks = copy.deepcopy(ss_dict)

    def _matching_pair(sites, position):
        # First pair whose inclusive span covers position, else None.
        for pair in sites:
            if min(pair[0], pair[1]) <= position <= max(pair[0], pair[1]):
                return pair
        return None

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        if row['type'] != 'intronic':
            intron_sizes.append(np.nan)
            continue

        transcript = row['transcript']
        pair = _matching_pair(ss_dict[transcript], row['position'])
        if pair is None:
            intron_sizes.append(np.nan)
            continue

        intron_sizes.append(abs(pair[0] - pair[1]))
        # A previous peak may already have claimed this intron.
        if pair in no_peaks[transcript]:
            no_peaks[transcript].remove(pair)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
Example #6
0
def add_seq(branch_df, fa_dict):
    """Extract 8-nt 5' splice-site and branch-site sequences for each row.

    Each site is read as a 16-nt window from ``fa_dict`` and reverse
    complemented on the '-' strand.  If 'GT' starts within window
    positions 4..9 the 5' sequence is re-centred on it; likewise the branch
    sequence is re-centred on the first of 'AG', 'AA', 'GA' found there.

    The motif offset is looked up with the same bounds as the presence
    check.  An unbounded ``str.index`` may land on a copy of the motif
    before position 4 and produce an empty or shifted slice.

    Parameters
    ----------
    branch_df : pandas.DataFrame
        Read columns: 'chromosome', 'strand', '5p splice site',
        'branch site'.  Gains '5p seq' and 'Branch seq' in place.
    fa_dict : dict
        Chromosome name -> sequence string.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'Branch seq' carries 'AG', 'AA' or 'GA' at
        positions 4-5.
    """
    receptors = ['AG', 'AA', 'GA']

    def _recentre(window, motifs, lead, fallback_end):
        # Re-register window on the first motif starting in [4, 11).
        for motif in motifs:
            pos = window.find(motif, 4, 11)
            if pos != -1:
                return window[pos - lead:pos + 8 - lead]
        return window[4:fallback_end]

    five_seqs = []
    branch_seqs = []
    for _, r in branch_df.iterrows():
        five = fa_dict[r['chromosome']][r['5p splice site'] -
                                        8:r['5p splice site'] + 8]
        branch = fa_dict[r['chromosome']][r['branch site'] -
                                          8:r['branch site'] + 8]
        if r['strand'] == '-':
            five = SP.reverse_complement(five)
            branch = SP.reverse_complement(branch)

        # GT is placed at offset 2, branch motifs at offset 4.
        five_seqs.append(_recentre(five, ['GT'], 2, 12))
        branch_seqs.append(_recentre(branch, receptors, 4, 13))

    branch_df['5p seq'] = five_seqs
    branch_df['Branch seq'] = branch_seqs

    branch_df = branch_df[branch_df['Branch seq'].str[4:6].isin(receptors)]
    return branch_df
Example #7
0
def peaks_only(config_file, untagged, organism):
    """Run peak calling and quantitation from a config file.

    The config file is scanned line by line:

    * a line containing ``untagged`` names the untagged peak file;
    * otherwise a line containing 'changepoint' or 'peak' (any case) is
      collected as a tagged peak file;
    * otherwise a line containing 'quant' is split on commas, mapping its
      last field (stripped) to its first field.

    Outputs are written next to the config file, using the part of its
    file name before '_config' as prefix: ``*_all_peaks.pickle``,
    ``*_quantitation.pickle`` and ``*_quantitation.csv``.
    """
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_file.split('/')[-1].split('_config')[0]
    # An empty directory part means the config sits in the working directory.
    base_dir = config_file.split(name)[0] or './'
    out_prefix = base_dir + name
    print("Output file location and prefix: " + out_prefix)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3,
                                      fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(out_prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)

    quant_df.to_pickle(out_prefix + '_quantitation.pickle')
    quant_df.to_csv(out_prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
Example #8
0
def find_3p_site(branch_df, gff3, organism=None):
    """Pair each branch with the 3' splice site of its annotated intron.

    The introns of a row are looked up under its 'transcript' value minus
    the last two characters.  The first intron whose 5' coordinate is
    within one position of the row's '5p splice site' supplies the 3'
    coordinate; rows without a match get NaN.

    Adds the columns '3p splice site', 'intron size' and
    'Branch to 3p distance' (both absolute differences) to ``branch_df``
    and returns it.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for _, r in branch_df.iterrows():
        for intron in ss_dict[r['transcript'][:-2]]:
            if r['5p splice site'] in range(intron[0] - 1, intron[0] + 2):
                three_coord.append(intron[1])
                break
        else:
            # Loop ran to completion: no annotated intron matched.
            three_coord.append(np.NaN)

    branch_df['3p splice site'] = three_coord

    three_prime = branch_df['3p splice site']
    branch_df['intron size'] = (
        branch_df['5p splice site'] - three_prime).apply(abs)
    branch_df['Branch to 3p distance'] = (
        branch_df['branch site'] - three_prime).apply(abs)

    return branch_df
Example #9
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Return the 8-nt sequences at all 5' and 3' splice sites.

    Splice sites are read with ``SP.list_splice_sites`` and collapsed with
    ``SP.collapse_ss_dict``.  Strand and chromosome are taken from the
    transcript-dict entry ``<transcript>.1`` when ``organism == 'pombe'``
    and ``<transcript>T0`` otherwise.  Windows on the '-' strand are
    reverse complemented.

    Returns
    -------
    tuple of list
        ``(all_seq5, all_seq3)``, one entry per intron.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    all_seq5 = []
    all_seq3 = []
    for transcript, introns in ss_dict.iteritems():
        isoform = transcript + ('.1' if organism == 'pombe' else 'T0')
        tx_info = transcript_dict[isoform]
        strand, chrom = tx_info[2], tx_info[3]

        for intron in introns:
            donor = intron[0]
            acceptor = intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(donor - 1):(donor + 7)]
                seq3 = fasta_dict[chrom][(acceptor - 5):(acceptor + 3)]
            elif strand == '-':
                seq5 = fasta_dict[chrom][(donor - 6):(donor + 2)]
                seq5 = SP.reverse_complement(seq5)
                seq3 = fasta_dict[chrom][(acceptor - 2):(acceptor + 6)]
                seq3 = SP.reverse_complement(seq3)

            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Example #10
0
def peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name):
    """Match peaks against exon-exon junctions from two junction BED files.

    Junction frames are built from ``junc_beds[0]`` and ``junc_beds[1]``,
    combined, compared with ``peak_df`` and scored (all via ``SP``).  The
    result is indexed by 'genome coord', sorted so rows whose
    'junction type' is 'Annotated' come first, given an 'intron tuple'
    column and written to ``<base_dir><name>_peaks_w_junc.csv`` and
    ``.pickle``.

    Returns
    -------
    pandas.DataFrame
        The peaks-with-junctions frame that was written to disk.
    """
    # Load in junctions
    first_junctions = SP.build_junction_df(junc_beds[0], gff3, fa_dict,
                                           organism=organism)
    second_junctions = SP.build_junction_df(junc_beds[1], gff3, fa_dict,
                                            organism=organism)
    junc_df = SP.combine_junctions(first_junctions, second_junctions)

    # Compare peaks and junctions
    peaks_w_junc = SP.compare_peak_junc_df(peak_df, junc_df, organism=organism)
    peaks_w_junc = SP.score_peaks(peaks_w_junc, gff3, fa_dict)

    # Reformat dataframe - add index, sort so that the annotated intron is
    # first in each cluster
    peaks_w_junc.index = peaks_w_junc['genome coord']
    is_annotated = peaks_w_junc['junction type'] == 'Annotated'
    peaks_w_junc['type index'] = np.where(is_annotated, 0, 1)
    peaks_w_junc = peaks_w_junc.sort_values('type index')
    # NOTE(review): the result of this groupby is discarded, so no rows are
    # collapsed here - confirm whether de-duplication was intended.
    peaks_w_junc.groupby(peaks_w_junc.index).first()
    peaks_w_junc = peaks_w_junc.drop(['index', 'type index'], axis=1)
    peaks_w_junc['intron tuple'] = zip(
        peaks_w_junc['transcript'].tolist(),
        peaks_w_junc['annotated intron size'].tolist())

    not_prime = peaks_w_junc[~peaks_w_junc['type'].str.contains('prime')]
    n_unpredicted = len(set(not_prime['genome coord']))

    print("\nPeaks with corresponding exon-exon junctions:")
    print(len(peaks_w_junc))
    print(str(n_unpredicted) + " unpredicted")

    out_prefix = base_dir + name
    peaks_w_junc.to_csv(out_prefix + '_peaks_w_junc.csv')
    peaks_w_junc.to_pickle(out_prefix + '_peaks_w_junc.pickle')

    return peaks_w_junc
Example #11
0
def peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name):
    """Find peaks that coincide with branch points from one or two BAM files.

    Branch points are listed and tabulated via ``SP`` for
    ``branch_bams[0]`` and, when exactly two BAMs are given, for
    ``branch_bams[1]`` as well.  In the two-BAM case the per-sample BED
    files (BAM path with '_sorted.bam' replaced by '.bed') are concatenated
    into ``<name>_all_branches.bed`` in the working directory and then
    deleted.

    The concatenation is done with plain file I/O instead of a shell
    ``cat`` command built by string interpolation, so paths containing
    spaces or shell metacharacters are handled correctly.

    Branch rows whose 'genome coord' occurs in ``peak_df`` are kept,
    joined with the peak 'type', indexed by 'branch coord' and written to
    ``<base_dir><name>_peaks_w_branch.csv`` and ``.pickle``.

    Returns
    -------
    pandas.DataFrame
        The peaks-with-branches frame that was written to disk.
    """
    branches = SP.list_branch_points(branch_bams[0], gff3, fa_dict, organism=organism)
    branch_df = SP.create_branch_df(branches, gff3, fa_dict, organism=organism)
    if len(branch_bams) == 2:
        branches2 = SP.list_branch_points(branch_bams[1], gff3, fa_dict, organism=organism)
        branch_df2 = SP.create_branch_df(branches2, gff3, fa_dict, organism=organism)
        branch_df = branch_df.append(branch_df2)

        bed1 = branch_bams[0].split('_sorted.bam')[0] + '.bed'
        bed2 = branch_bams[1].split('_sorted.bam')[0] + '.bed'
        # Byte-for-byte concatenation, equivalent to `cat bed1 bed2 > out`.
        with open('{0}_all_branches.bed'.format(name), 'wb') as merged:
            for bed in (bed1, bed2):
                with open(bed, 'rb') as part:
                    for line in part:
                        merged.write(line)

        os.remove(bed1)
        os.remove(bed2)

    # Compare peaks and branches
    peaks_w_branch = branch_df[branch_df['genome coord'].isin(peak_df['genome coord'])]
    peaks_w_branch = peaks_w_branch.merge(peak_df[['type', 'genome coord']],
                                          right_on='genome coord',
                                          left_on='genome coord', how='left')
    peaks_w_branch.index = peaks_w_branch['branch coord']

    print("\nPeaks with corresponding branches:")
    print(len(peaks_w_branch))
    print(str(len(set(peaks_w_branch['genome coord']))) + " unpredicted")

    peaks_w_branch.to_csv(base_dir + name + '_peaks_w_branch.csv')
    peaks_w_branch.to_pickle(base_dir + name + '_peaks_w_branch.pickle')

    return peaks_w_branch
Example #12
0
def peak_seq_enrichment(df, organism):
    """Plot dinucleotide enrichment around peaks typed 'other' or 'intronic'.

    For each of the 16 dinucleotides, the fraction of selected peaks whose
    'sequence' has it at positions 4-5 ("Before peak") and 6-7 ("After
    peak") is compared with the product of the single-nucleotide
    frequencies from ``SP.gc_content``.  The difference is turned into a
    z-score, a one-sided normal p-value and finally a log odds value,
    which is drawn as a grouped bar chart.

    Returns
    -------
    matplotlib.figure.Figure
    """
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    p_dict = {'A': nuc_prob[0], 'C': nuc_prob[1],
              'T': nuc_prob[2], 'G': nuc_prob[3]}

    unpeaks = df[df['type'] == 'other'].append(df[df['type'] == 'intronic'])
    print("Number of unpredicted peaks:")
    print(len(unpeaks))

    nucs = ['G', 'A', 'C', 'T']
    dinucs = set()
    for first in nucs:
        for second in nucs:
            dinucs.add(first + second)

    # Count peaks carrying each dinucleotide just after / just before.
    after_seq = unpeaks['sequence'].str[6:8]
    before_seq = unpeaks['sequence'].str[4:6]
    five = {}
    three = {}
    for dinuc in dinucs:
        five[dinuc] = len(unpeaks[after_seq.str.contains(dinuc)])
        three[dinuc] = len(unpeaks[before_seq.str.contains(dinuc)])

    n_peaks = len(unpeaks)
    five_LO = {}
    three_LO = {}
    for dinuc in five.keys():
        expected = p_dict[dinuc[0]] * p_dict[dinuc[1]]
        for counts, log_odds in ((five, five_LO), (three, three_LO)):
            observed = counts[dinuc] / float(n_peaks)
            std_err = np.sqrt(observed * (1 - observed) / n_peaks)
            z_score = (observed - expected) / std_err
            pvalue = stats.norm.sf(z_score)
            log_odds[dinuc] = np.log((1 - pvalue) / pvalue)

    fig, ax = plt.subplots(figsize=(12, 6))
    width = 0.35
    ind = np.arange(len(five_LO.keys()))
    ax.bar(ind, three_LO.values(), width, color='crimson',
           edgecolor='crimson', label='Before peak')
    ax.bar(ind + width, five_LO.values(), width, color='indigo',
           edgecolor='indigo', label='After peak')

    x_span = [-1, 17]
    ax.plot(x_span, [0, 0], '-', color='black')
    ax.plot(x_span, [2.94, 2.94], '--', color='0.7', label='95% CI')
    ax.plot(x_span, [-2.94, -2.94], '--', color='0.7')

    ax.set_xlim([-1, 17])
    ax.set_xticklabels(five_LO.keys(), fontsize=12)
    ax.set_xticks(ind + width / 2)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)

    return fig
Example #13
0
def write_intergenic_fasta(transcript_dict, fasta_dict, bps_us=0, bps_ds=0, all_intergenic=True, prefix='intergenic_transcripts'):
    """Write intergenic or flanking sequences to ``<prefix>.fa``.

    Two modes, selected by ``all_intergenic``:

    * ``True``: for every chromosome in ``fasta_dict`` the transcripts on
      it are ordered by start and the gap between each consecutive pair is
      emitted as ``<tx>_<next_tx>_plus`` and ``..._minus``.  Pairs that
      overlap are reported on stdout and skipped.
    * ``False``: for every transcript, ``bps_us`` bases upstream and/or
      ``bps_ds`` bases downstream (relative to the transcript's strand) are
      emitted in sense and antisense orientation.

    Changes from the earlier behaviour:

    * chromosomes without any transcript are skipped instead of raising
      ``KeyError`` while sorting an empty table;
    * flank slices are clamped at position 0, so a flank reaching past the
      chromosome start no longer wraps around to a negative index;
    * transcripts whose strand is neither '+' nor '-' are skipped in
      flank mode instead of reusing the previous transcript's sequence.

    Parameters
    ----------
    transcript_dict : dict
        Transcript name -> sequence whose items 0..3 are start, end,
        strand and chromosome.
    fasta_dict : dict
        Chromosome name -> sequence string.
    bps_us, bps_ds : int
        Flank sizes; only used when ``all_intergenic`` is ``False``.
    all_intergenic : bool
        Mode switch; compared by identity with ``True`` / ``False``.
    prefix : str
        Output path without the '.fa' extension.

    Returns
    -------
    dict
        Header -> sequence, as written to the file.
    """
    seq_dict = {}
    if all_intergenic is False:
        for transcript, values in transcript_dict.items():
            start, end, strand, chrom = values[0], values[1], values[2], values[3]
            if strand not in ('+', '-'):
                continue

            if bps_us > 0:
                if strand == '+':
                    seq_us_sense = fasta_dict[chrom][max(0, start - bps_us):start]
                else:
                    seq_us_sense = SP.reverse_complement(
                        fasta_dict[chrom][end:end + bps_us])
                seq_dict[transcript + '_us_sense'] = seq_us_sense
                seq_dict[transcript + '_us_antisense'] = SP.reverse_complement(seq_us_sense)

            if bps_ds > 0:
                if strand == '+':
                    seq_ds_sense = fasta_dict[chrom][end:bps_ds + end]
                else:
                    seq_ds_sense = SP.reverse_complement(
                        fasta_dict[chrom][max(0, start - bps_ds):start])
                seq_dict[transcript + '_ds_sense'] = seq_ds_sense
                seq_dict[transcript + '_ds_antisense'] = SP.reverse_complement(seq_ds_sense)

    elif all_intergenic is True:
        for chrom in fasta_dict:
            on_chrom = [tx for tx in transcript_dict
                        if transcript_dict[tx][3] == chrom]
            # Order by start; an empty list simply yields no pairs.
            sorted_transcripts = sorted(
                on_chrom, key=lambda tx: transcript_dict[tx][0])

            for transcript, next_transcript in zip(sorted_transcripts,
                                                   sorted_transcripts[1:]):
                transcript_end = transcript_dict[transcript][1]
                next_start = transcript_dict[next_transcript][0]
                if next_start > transcript_end:
                    seq_plus = fasta_dict[chrom][transcript_end:next_start]
                    pair_name = transcript + '_' + next_transcript
                    seq_dict[pair_name + '_plus'] = seq_plus
                    seq_dict[pair_name + '_minus'] = SP.reverse_complement(seq_plus)
                else:
                    print('Overlapping transcripts:')
                    print(transcript)
                    print(next_transcript)

    with open('{}.fa'.format(prefix), 'w') as fout:
        for header, seq in seq_dict.items():
            fout.write('>' + header + '\n')
            fout.write(seq + '\n')

    return seq_dict
Example #14
0
def sort_bedgraphs(directory, transcript_dict):
    """Run ``SP.build_bedgraph_dict`` on every bedgraph file in a directory.

    Files are selected by a case-insensitive '.bedgraph' extension and
    their names are printed as they are found.  Paths are built with
    ``os.path.join``, so ``directory`` works with or without a trailing
    separator.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).
    transcript_dict : dict
        Passed unchanged to ``SP.build_bedgraph_dict``.
    """
    bedgraph_list = []
    for fname in os.listdir(directory):
        if fname.lower().endswith(".bedgraph"):
            print(fname)
            bedgraph_list.append(os.path.join(directory, fname))

    for bedgraph in bedgraph_list:
        SP.build_bedgraph_dict(transcript_dict, bedgraph)
Example #15
0
def sort_bedgraphs(directory, transcript_dict):
    """Process each ``*.bedgraph`` file in ``directory`` with ``SP``.

    Every entry whose lower-cased name ends in '.bedgraph' is printed and
    then handed to ``SP.build_bedgraph_dict`` together with
    ``transcript_dict``.  The file path is joined with ``os.path.join``
    rather than by string concatenation, which silently produced a wrong
    path whenever ``directory`` lacked a trailing slash.
    """
    bedgraph_list = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.lower().endswith(".bedgraph")
    ]

    for bedgraph in bedgraph_list:
        print(os.path.basename(bedgraph))

    for bedgraph in bedgraph_list:
        SP.build_bedgraph_dict(transcript_dict, bedgraph)
Example #16
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Fetch a 20-nt sequence around each coordinate and label splice sites.

    ``coord_dict`` maps a transcript to a pair of coordinate collections.
    Coordinates from the first collection are labelled "5'" when positions
    10-11 of the extracted sequence read 'GT' or 'GC'; coordinates from
    the second are labelled "3'" when positions 8-9 read 'AG'.  Anything
    else is labelled 'other'.  Minus-strand sequences are reverse
    complemented before the check.

    Parameters
    ----------
    coord_dict : dict
        Transcript name -> indexable with two iterables of coordinates.
    gff3_file : str
        Annotation path; 'pombe' in the path selects organism 'pombe'.
    fasta_file : str or dict
        A ``str`` is loaded with ``make_fasta_dict``; anything else is used
        as the chromosome -> sequence mapping directly.

    Returns
    -------
    dict
        Transcript name -> list of ``(sequence, seq_type)`` tuples.
    """
    organism = 'pombe' if 'pombe' in gff3_file else None

    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    if type(fasta_file) is str:
        fasta_dict = make_fasta_dict(fasta_file)
    else:
        fasta_dict = fasta_file

    # (index into coord_sets, slice holding the diagnostic dinucleotide,
    #  accepted dinucleotides, label given on a match)
    site_rules = (
        (0, slice(10, 12), ('GT', 'GC'), "5'"),
        (1, slice(8, 10), ('AG',), "3'"),
    )

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]
        for set_ix, window, motifs, label in site_rules:
            for coord in coord_sets[set_ix]:
                if strand == "+":
                    sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
                elif strand == "-":
                    sequence = SP.reverse_complement(
                        fasta_dict[chrom][(coord - 10):(coord + 10)])

                seq_type = label if sequence[window] in motifs else 'other'
                seq_dict[transcript].append((sequence, seq_type))

    return seq_dict
Example #17
0
def read_sorted_bedgraphs(directory, transcript_dict, organism='pombe'):
    """Load plus/minus ``*_CNAGsort.bedgraph`` files from a directory.

    Files are grouped by the part of their name before '_plus' or
    '_minus'.  Each group is a two-item list: index 0 holds the result of
    ``SP.read_CNAGsort_bedgraph2`` for the plus file, index 1 for the minus
    file, and ``None`` where a strand is missing.

    The reader is given the file path joined with ``directory``.  Passing
    the bare name returned by ``os.listdir`` only works when the current
    working directory happens to be ``directory``.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).
    transcript_dict : dict
        Passed unchanged to ``SP.read_CNAGsort_bedgraph2``.
    organism : str
        Forwarded to the reader; defaults to the previously hard-coded
        'pombe'.

    Returns
    -------
    dict
        Sample name -> ``[plus_data, minus_data]``.
    """
    stranded_bedgraphs = {}
    for fname in os.listdir(directory):
        if not fname.endswith("_CNAGsort.bedgraph"):
            continue

        if "plus" in fname:
            sample, slot = fname.split('_plus')[0], 0
        elif 'minus' in fname:
            sample, slot = fname.split('_minus')[0], 1
        else:
            continue

        pair = stranded_bedgraphs.setdefault(sample, [None, None])
        pair[slot] = SP.read_CNAGsort_bedgraph2(
            os.path.join(directory, fname), transcript_dict, organism=organism)
    return stranded_bedgraphs
Example #18
0
def get_sequence(coord_dict, gff3_file, fasta_file):
    """Extract and classify the sequence around candidate splice sites.

    For each transcript, ``coord_sets[0]`` is treated as candidate 5'
    sites and ``coord_sets[1]`` as candidate 3' sites.  A 20-nt sequence is
    cut around every coordinate (reverse complemented on the '-' strand)
    and tagged:

    * "5'" when positions 10-11 are 'GT' or 'GC' (first collection);
    * "3'" when positions 8-9 are 'AG' (second collection);
    * 'other' in every remaining case.

    ``fasta_file`` is loaded with ``make_fasta_dict`` when it is a ``str``
    and used as a ready-made dict otherwise.  The organism is 'pombe' when
    that word occurs in ``gff3_file`` and ``None`` if not.

    Returns
    -------
    dict
        Transcript name -> list of ``(sequence, seq_type)`` tuples, 5'
        candidates first.
    """
    if 'pombe' in gff3_file:
        organism = 'pombe'
    else:
        organism = None

    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)
    fasta_dict = (make_fasta_dict(fasta_file)
                  if type(fasta_file) is str else fasta_file)

    seq_dict = {}
    for transcript, coord_sets in coord_dict.iteritems():
        tagged = seq_dict[transcript] = []
        chrom = transcript_dict[transcript][3]
        strand = transcript_dict[transcript][2]

        # Candidate 5' splice sites.
        for coord in coord_sets[0]:
            if strand == "+":
                sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
            elif strand == "-":
                sequence = SP.reverse_complement(
                    fasta_dict[chrom][(coord - 10):(coord + 10)])
            is_donor = sequence[10:12] in ('GT', 'GC')
            tagged.append((sequence, "5'" if is_donor else 'other'))

        # Candidate 3' splice sites.
        for coord in coord_sets[1]:
            if strand == "+":
                sequence = fasta_dict[chrom][(coord - 9):(coord + 11)]
            elif strand == "-":
                sequence = SP.reverse_complement(
                    fasta_dict[chrom][(coord - 10):(coord + 10)])
            is_acceptor = sequence[8:10] == 'AG'
            tagged.append((sequence, "3'" if is_acceptor else 'other'))

    return seq_dict
Example #19
0
def gene_patches(tx, tx_dict, ax, arrow=False):
    """Draw a gene model for every isoform whose name contains ``tx``.

    Each isoform gets a transcript line (a thin rectangle, or an arrow in
    the direction of the strand when ``arrow`` is not ``False``) plus
    thicker rectangles for its exons, or for the CDS span when
    ``SP.tx_info`` reports no exons.  Rectangles of successive isoforms
    are shifted down by 0.15 each.  The y axis ticks are removed.

    Returns
    -------
    str or None
        The strand of the last isoform drawn, or ``None`` when no key of
        ``tx_dict`` contains ``tx``.
    """
    iso_list = [iso for iso in tx_dict if tx in iso]
    if not iso_list:
        return None

    box_style = dict(edgecolor='0.1', facecolor='0.1')
    for n, iso in enumerate(iso_list):
        start, end, strand, CDS_start, CDS_end, exons, chrom = SP.tx_info(iso, tx_dict)
        y_shift = n * 0.15

        if arrow is False:
            ax.add_patch(patches.Rectangle((start, 0.8 - y_shift), end - start,
                                           0.04, **box_style))
        elif strand == '+':
            span = end - start
            ax.arrow(start, 0.9, span - 0.02 * span, 0, linewidth=2,
                     head_width=0.1, head_length=0.02 * (end - start),
                     fc='k', ec='k')
        elif strand == '-':
            span = start - end
            ax.arrow(end, 0.9, span - 0.02 * span, 0, linewidth=2,
                     head_width=0.1, head_length=0.02 * (end - start),
                     fc='k', ec='k')

        if exons is None:
            # No exon structure available: draw the CDS as a single box.
            ax.add_patch(patches.Rectangle((CDS_start, 0.75 - y_shift),
                                           CDS_end - CDS_start, 0.10,
                                           **box_style))
        else:
            exon_boxes = [
                patches.Rectangle((exon_start, 0.775 - y_shift),
                                  exon_stop - exon_start, 0.10, **box_style)
                for exon_start, exon_stop in exons
            ]
            for box in exon_boxes:
                ax.add_patch(box)
        ax.get_yaxis().set_ticks([])
    return strand
Example #20
0
def build_transcript_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", expand=False, convert_chroms=False):
    """Build the pombe transcript dict, optionally renamed and padded.

    Wraps ``SP.build_transcript_dict(gff3, organism='pombe')``.  Each value
    is handled as a six-item record: start, end, strand, chromosome, CDS
    start list, CDS end list.

    Parameters
    ----------
    gff3 : str
        Annotation file passed to ``SP.build_transcript_dict``.
    expand : bool
        When exactly ``True``, start and end are widened by 300 bases,
        clamped to 0 and, for chromosomes with a known length, to the
        chromosome end.  Empty CDS lists are replaced by the original
        (unpadded) start / end.
    convert_chroms : bool
        When exactly ``True``, chromosome names 'chr1', 'chr2', 'chr3' are
        replaced by 'I', 'II', 'III'.  Names without a conversion entry are
        kept as they are instead of raising ``KeyError``.

    Returns
    -------
    dict
        Transcript name -> ``[start, end, strand, chrom, cds_starts,
        cds_ends]``.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    if convert_chroms is True:
        lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}
        converted = {}
        for tx, info in transcript_dict.items():
            start, end, strand, chrom, cds_start, cds_end = info
            # Unknown names pass through unchanged.
            converted[tx] = [start, end, strand, lat_rom.get(chrom, chrom),
                             cds_start, cds_end]
        transcript_dict = converted

    if expand is True:
        chrom_lengths = {'I': 5818680, 'II': 4744158, 'III': 2598968,
                         'chr1': 5818680, 'chr2': 4744158, 'chr3': 2598968}
        expanded_dict = {}
        for tx, info in transcript_dict.items():
            chrom = info[3]
            new_start = max(info[0] - 300, 0)
            new_end = info[1] + 300
            if chrom in chrom_lengths:
                new_end = min(new_end, chrom_lengths[chrom])
            # Transcripts without CDS records fall back to their own span.
            cds_starts = info[4] if len(info[4]) > 0 else [info[0]]
            cds_ends = info[5] if len(info[5]) > 0 else [info[1]]
            expanded_dict[tx] = [new_start, new_end, info[2], chrom,
                                 cds_starts, cds_ends]
        transcript_dict = expanded_dict

    return transcript_dict
Example #21
0
def main():
    """Assign peaks to transcripts, write sequence files and run MEME.

    ``sys.argv[1]`` is the peak file; its base name up to the first '.'
    becomes the output prefix.  When a second argument is given it is a
    gene list used to split the peaks into an "in list" and an "other"
    set, each with its own CSV, sequence file and site count.  The minimum
    site count handed to ``call_meme`` is 75% of the number of peaks,
    capped at 600.
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    split = len(sys.argv) == 3
    if split:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix+'_by_gene_in_list.csv')
        other.to_csv(prefix+'_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix+'_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix+'_other')
        minsites = [min(int(0.75*len(in_list)), 600),
                    min(int(0.75*len(other)), 600)]
    else:
        peak_df.to_csv(prefix+'_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        minsites = min(int(0.75*len(peak_df)), 600)
    call_meme(prefix, minsites, split=split)
Example #22
0
def add_cdf_to_plot(ax, value_lists, label_list, color_list, ks_list, log2=False):
    """Draw one cumulative-distribution curve per value list on ``ax``.

    Values are optionally log2-transformed (only when ``log2`` is exactly
    ``True``); entries whose string form is 'inf', '-inf' or 'nan' are
    dropped before ``SP.cdf_values`` is called.  The x range is set to the
    1st-99th percentile of all plotted values together.  When ``ks_list``
    is given, its first two (or, if it has four entries, all four) strings
    are annotated as p-values at the right edge of the plot.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    dropped = ('inf', '-inf', 'nan')
    pooled = []

    for n, values in enumerate(value_lists):
        if log2 is True:
            values = [np.log2(x) for x in values]
        kept = [x for x in values if str(x) not in dropped]
        pooled.extend(kept)
        cumulative, base = SP.cdf_values(kept)
        ax.plot(base[1:], cumulative, c=color_list[n], linewidth=3.0,
                label=label_list[n])

    xmin = np.percentile(pooled, 1)
    xmax = np.percentile(pooled, 99)
    ax.set_xlim([xmin, xmax])
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12)

    if ks_list is not None:
        sep = '    \n'
        text = sep.join(["p-values:", ks_list[0], ks_list[1]]) + '    '
        if len(ks_list) == 4:
            text = text + sep.join(['', ks_list[2], ks_list[3]]) + '    '
        ax.annotate(text, xy=(xmax, 0.0), horizontalalignment='right',
                    fontsize=12)

    return ax
Example #23
0
def find_polyA_sites(transcript_dict, window=220):
    """Return a window around the strongest polyA signal of each transcript.

    Per-transcript signal is read with ``SP.read_CNAGsort_bedgraph2`` from
    a fixed bedgraph path.  For every transcript with at least one positive
    value, the position with the largest value is taken as the polyA site.
    Transcripts whose strand is neither '+' nor '-' are left out.

    Returns
    -------
    dict
        Transcript name -> ``[site - window, site + window, strand,
        chromosome]``.
    """
    polyA_bg = SP.read_CNAGsort_bedgraph2(
        '/home/jordan/GENOMES/POMBE/polyA_sites_CNAGsort.bedgraph',
        transcript_dict,
        organism='pombe')

    pA_dict = {}
    for tx, s in polyA_bg.iteritems():
        s = s[s > 0]
        if len(s) == 0:
            continue

        strand = transcript_dict[tx][2]
        if strand == '+' or strand == '-':
            # Highest signal first; both strands use the same rule.
            ranked = s.sort_values(ascending=False)
            pA_site = ranked.index[0]
            pA_dict[tx] = [pA_site - window, pA_site + window, strand,
                           transcript_dict[tx][3]]
    return pA_dict
Example #24
0
def by_pos_plots(df, metrics=['Intermediate Level', 'Precursor']):
    """Plot position-wise scores for rows that change 'Up' or 'Down'.

    Columns whose second level contains 'Base 5' / 'Base 3' are
    concatenated, in column order, into one string per row.  For each
    direction and metric with more than five rows where
    ``('All', '<metric> change')`` equals the direction, the two string
    series are passed to ``SP.position_wise_scores2``.
    """
    col5 = [col for col in df.columns if 'Base 5' in col[1]]
    col3 = [col for col in df.columns if 'Base 3' in col[1]]

    for direction in ['Up', 'Down']:
        for metric in metrics:
            subset = df[df[('All', metric + ' change')] == direction]
            if len(subset) <= 5:
                continue

            for n, col in enumerate(col5):
                s5 = subset[col] if n == 0 else s5.str.cat(subset[col])
            print(len(s5))

            for n, col in enumerate(col3):
                s3 = subset[col] if n == 0 else s3.str.cat(subset[col])

            print(metric + ' ' + direction)
            SP.position_wise_scores2(s5, s3, 'crypto')
Example #25
0
def main():
    """Command-line entry point: assign peaks to transcripts and run MEME.

    sys.argv[1] is the peak file; its basename (up to the first '.') is
    used as the output prefix. An optional sys.argv[2] names a gene list
    file; when given, the peaks are split into "in list" and "other"
    sets that are written and searched separately.

    ``minsites`` passed to call_meme is 75% of the number of rows,
    capped at 600 (a two-element list in the split case).
    """
    gff3 = '/home/jordan/GENOMES/CNA3_all_transcripts.gff3'
    fasta = '/home/jordan/GENOMES/H99_fa.json'
    chrom_lengths = '/home/jordan/GENOMES/H99_chrom_lengths.json'

    peak_file = sys.argv[1]
    prefix = peak_file.split('/')[-1].split('.')[0]
    print(prefix)

    tx_dict = SP.build_transcript_dict(gff3)
    tx_by_chrom = sort_tx_by_chrom(tx_dict)
    int_dict = make_promoter_dict(tx_dict, chrom_lengths)
    peak_df = assign_peak_to_tx(tx_by_chrom, int_dict, peak_file, cutoff=2)
    peak_df = find_best_peaks(peak_df, int_dict, max_genes=300)

    if len(sys.argv) == 3:
        in_list, other = split_by_gene(peak_df, sys.argv[2])
        in_list.to_csv(prefix + '_by_gene_in_list.csv')
        other.to_csv(prefix + '_by_gene_other.csv')
        generate_sequence_file(in_list, int_dict, fasta, prefix + '_in_list')
        generate_sequence_file(other, int_dict, fasta, prefix + '_other')
        split = True
        minsites = [
            min(int(0.75 * len(in_list)), 600),
            min(int(0.75 * len(other)), 600),
        ]
    else:
        peak_df.to_csv(prefix + '_by_gene.csv')
        generate_sequence_file(peak_df, int_dict, fasta, prefix)
        split = False
        minsites = min(int(0.75 * len(peak_df)), 600)

    call_meme(prefix, minsites, split=split)
Example #26
0
def peak_to_seq_pipeline(untagged_peak_file, tagged1_peak_file, tagged2_peak_file, gff3, fasta, junction_df=None, branch_df=None, cutoff=5, name='CP_peaks'):
    """Call peaks in three samples, annotate them and write a bedgraph.

    Steps, in order: build the transcript dictionary, collect peaks per
    gene for the untagged and the two tagged files, compare replicates,
    compare against annotated splice sites, collapse unpredicted peaks,
    add a 'genome coord' column ("chromosome:position") and attach
    sequences. The result is also written to ``name + '.bedgraph'``.

    ``fasta`` may be a path (str), which is loaded with
    SP.make_fasta_dict, or an already loaded object.
    ``junction_df`` and ``branch_df`` are accepted but not used here.

    Returns the DataFrame produced by add_sequence_to_df.
    """
    # The organism is inferred solely from the annotation file name.
    organism = 'pombe' if 'pombe' in gff3 else None

    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    print("Finding peaks in transcripts...")

    peak_sets = []
    for peak_file in (untagged_peak_file, tagged1_peak_file, tagged2_peak_file):
        print(peak_file)
        peak_sets.append(CP_peaks_by_gene(peak_file, transcript_dict, cutoff=cutoff))
    untagged, tagged1, tagged2 = peak_sets

    print("Comparing peaks between replicates...")
    peaks = CP_compare_reps(untagged, tagged1, tagged2)

    print("Checking peaks against annotation...")
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    peak_df = CP_compare_to_annotation(peaks, ss_dict, transcript_dict)
    peak_df = collapse_unpredicted_peaks(peak_df)
    position_text = peak_df['position'].apply(int).apply(str)
    peak_df['genome coord'] = peak_df['chromosome'].str.cat(position_text, sep=':')

    if type(fasta) is str:
        fasta = SP.make_fasta_dict(fasta)
    print("Adding sequences...")
    peak_seq_df = add_sequence_to_df(peak_df, fasta, flag=flag)

    print("Writing bedgraph...")
    with open(name+'.bedgraph', 'w') as fout:
        for _, row in peak_seq_df.iterrows():
            if row['strand'] == '+':
                position2 = row['position']+1
                height = row['height']
            elif row['strand'] == '-':
                # Minus-strand peaks get a negative height and a second
                # coordinate one base below the position.
                position2 = row['position']-1
                height = row['height']*-1
            fields = [row['chromosome'], row['position'], position2, height, '\n']
            fout.write('\t'.join([str(field) for field in fields]))

    print("Completed")
    return peak_seq_df
Example #27
0
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population

    seq5_list / seq3_list are the sample 5' and 3' splice-site sequences;
    the population sequences come from generate_all_ss_seqs for the
    organism's annotation. Both are turned into per-position count
    matrices with seq_list_to_totals, and each column of the sample is
    tested against the same column of the population with
    scipy.stats.chi2_contingency.

    Two bar charts of -log10(p) are drawn (5' on top, 3' below), shown
    with plt.show() and the figure is returned.
    '''

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    def neg_log_pvalues(sample, population, fixed_positions):
        # One score per column; columns listed in fixed_positions are not
        # tested and get a constant 1 (presumably the invariant splice-site
        # dinucleotide -- TODO confirm).
        scores = []
        for n in range(sample.shape[1]):
            if n in fixed_positions:
                scores.append(1)
            else:
                conting = np.array([sample[:, n], population[:, n]])
                chi2, p, dof, expected = stats.chi2_contingency(conting)
                scores.append(np.log10(p)*-1)
        return scores

    p5 = neg_log_pvalues(samp_5p, pop_5p, (2, 3))
    p3 = neg_log_pvalues(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4,4))
    width = 0.7

    # Shared y-limit with 10% headroom so both panels are comparable.
    max_y = max(p5+p3) + 0.1*max(p5+p3)

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    ax[0].plot([0,8], [2,2], '--', color='0.7')
    ax[0].set_xlim([0,len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0,max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0,8], [2,2], '--', color='0.7')
    ax[1].set_xlim([0,len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0,max_y])

    # Each panel's ticks follow its own bar positions (the 5' panel
    # previously reused the 3' indices).
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    ax[0].set_xticklabels(np.arange(-2,6))
    ax[1].set_xticklabels(np.arange(-5,3))

    fig.tight_layout()
    plt.show()
    return fig
Example #28
0
def read_sorted_bedgraphs(directory, transcript_dict):
    """Load every '*_CNAGsort.bedgraph' file in ``directory``, paired by strand.

    File names containing 'plus' fill slot 0 and names containing
    'minus' fill slot 1 of a two-element list keyed by the part of the
    file name before '_plus' / '_minus'. Files matching neither are
    ignored. A slot stays None when its strand file is absent.

    Each file is read with SP.read_CNAGsort_bedgraph2(..., organism='pombe').
    The path handed to the reader is joined with ``directory``, because
    os.listdir returns bare file names and those only resolve when the
    current working directory happens to be ``directory``.

    Returns a dict: sample name -> [plus_data, minus_data].
    """
    stranded_bedgraphs = {}
    for filename in os.listdir(directory):
        if not filename.endswith("_CNAGsort.bedgraph"):
            continue

        if "plus" in filename:
            marker, slot = '_plus', 0
        elif 'minus' in filename:
            marker, slot = '_minus', 1
        else:
            continue

        sample = filename.split(marker)[0]
        pair = stranded_bedgraphs.setdefault(sample, [None, None])
        pair[slot] = SP.read_CNAGsort_bedgraph2(
            os.path.join(directory, filename), transcript_dict, organism='pombe')
    return stranded_bedgraphs
Example #29
0
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Collect a 15-base sequence next to the first coordinate of each intron.

    ``fasta_file`` may be an already loaded dict, a path ending in
    'json', or any other path (loaded with make_fasta_dict).

    The intron coordinates come from the first available source:
    ``ss_dict`` as given, then ``junction_bed`` (SP.build_junction_dict),
    then ``peak_df`` (rows whose 'type' does not contain 'prime', each
    giving a (position, position +/- 50) pair), and finally the
    annotation itself (SP.list_splice_sites + SP.collapse_ss_dict).

    Unless ``junction_bed`` was given, transcript keys get '.1' (pombe)
    or 'T0' appended before they are looked up in the transcript dict.

    Returns a dict keyed by "<transcript>-<chrom>:<coordinate>"; values
    for '-' strand transcripts are reverse-complemented.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    if type(fasta_file) == dict:
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as handle:
            fasta_dict = json.load(handle)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, row in peak_df.iterrows():
                windows = ss_dict.setdefault(row['transcript'], [])
                if row['strand'] == '+':
                    windows.append((row['position'], row['position']+50))
                elif row['strand'] == '-':
                    windows.append((row['position'], row['position']-50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    for transcript, introns in ss_dict.iteritems():
        if junction_bed is None:
            suffix = '.1' if organism == 'pombe' else 'T0'
            transcript = transcript+suffix
        introns = list(introns)
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in introns:
            site = intron[0]
            if strand == '+':
                seq_dict[transcript+'-'+chrom+':'+str(site+1)] = fasta_dict[chrom][site+2:site+17]
            elif strand == '-':
                seq = fasta_dict[chrom][site-16:site-1]
                seq_dict[transcript+'-'+chrom+':'+str(site)] = SP.reverse_complement(seq)
    return seq_dict
Example #30
0
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Return the sequence of ``chrom`` from ``start`` through ``end`` inclusive.

    ``fasta_dict`` is either a mapping of chromosome name to sequence or
    a path (any str, including str subclasses) to a JSON file holding
    such a mapping. The slice taken is ``[start:end+1]``; for strand
    '-' the result is passed through SP.reverse_complement.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also recognised as paths.
    if isinstance(fasta_dict, str):
        with open(fasta_dict, 'r') as f:
            fasta_dict = json.load(f)
    seq = fasta_dict[chrom][start:end+1]
    if strand == '-':
        seq = SP.reverse_complement(seq)
    return seq
Example #31
0
def check_intron_position(transcript, position, gff3, organism):
    """Report whether the intron starting near ``position`` is first or last.

    The introns of ``transcript`` are taken from SP.list_splice_sites
    collapsed with SP.collapse_ss_dict, in the order that structure
    provides. The first intron whose first coordinate lies in
    [position - 3, position + 3) decides the answer.

    Returns (first, last). Both are False when no intron matches or the
    match is an interior intron. When the matching intron is at index 0
    only ``first`` is set, even if the transcript has a single intron.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    first = False
    last = False

    introns = list(ss_dict[transcript])
    last_index = len(introns) - 1

    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            if n == 0:
                first = True
            # Compare against the number of introns; the length of the
            # individual intron tuple says nothing about its rank.
            elif n == last_index:
                last = True
            break
    return first, last
Example #32
0
def check_intron_position(transcript, position, gff3, organism):
    """Tell whether the intron found at ``position`` is the first or the last one.

    Introns for ``transcript`` come from SP.list_splice_sites followed
    by SP.collapse_ss_dict and are scanned in the order provided. The
    scan stops at the first intron whose first coordinate falls within
    range(position-3, position+3).

    Returns a (first, last) tuple of booleans. ``first`` takes priority:
    a match at index 0 never sets ``last``. No match leaves both False.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    first=False
    last=False

    introns = list(ss_dict[transcript])

    for n, intron in enumerate(introns):
        if intron[0] in range(position-3,position+3):
            if n == 0:
                first = True
            # The last intron has index len(introns)-1; comparing against
            # len(intron) (one intron's tuple length) was a slip.
            elif n == len(introns)-1:
                last = True
            break
    return first, last
Example #33
0
def seq_simple(chrom, start, end, strand, fasta_dict):
    """Fetch ``fasta_dict[chrom][start:end + 1]``, strand-aware.

    ``fasta_dict`` can be a chromosome -> sequence mapping, or a str
    path (str subclasses included) to a JSON file containing one, which
    is loaded on the fly. When ``strand`` is '-', the slice is returned
    reverse-complemented via SP.reverse_complement.
    """
    # isinstance also accepts str subclasses, which an exact type
    # comparison would wrongly treat as an already loaded mapping.
    if isinstance(fasta_dict, str):
        with open(fasta_dict, 'r') as f:
            fasta_dict = json.load(f)
    seq = fasta_dict[chrom][start:end + 1]
    if strand == '-':
        seq = SP.reverse_complement(seq)
    return seq
Example #34
0
def gene_patches(tx, tx_dict, ax, arrow=False):
    """Draw a gene model for every isoform whose name contains ``tx``.

    Each isoform found in ``tx_dict`` is drawn on ``ax``, stacked
    downwards by 0.15 per isoform: either a thin rectangle spanning the
    transcript or, with ``arrow`` set to anything other than False, an
    arrow pointing in the strand direction. Exons are drawn as thicker
    rectangles when SP.tx_info supplies them, otherwise a single CDS
    rectangle is drawn. The y-axis ticks are removed.

    Returns the strand of the last isoform drawn, or None when no
    isoform name matches.
    """
    isoforms = [name for name in tx_dict if tx in name]
    if not isoforms:
        return None

    for row, isoform in enumerate(isoforms):
        start, end, strand, CDS_start, CDS_end, exons, chrom = SP.tx_info(
            isoform, tx_dict)

        if arrow is False:
            ax.add_patch(
                patches.Rectangle((start, 0.8 - row * 0.15),
                                  end - start,
                                  0.04,
                                  edgecolor='0.1',
                                  facecolor='0.1'))
        elif strand == '+':
            ax.arrow(start,
                     0.9,
                     end - start - 0.02 * (end - start),
                     0,
                     linewidth=2,
                     head_width=0.1,
                     head_length=0.02 * (end - start),
                     fc='k',
                     ec='k')
        elif strand == '-':
            ax.arrow(end,
                     0.9,
                     start - end - 0.02 * (start - end),
                     0,
                     linewidth=2,
                     head_width=0.1,
                     head_length=0.02 * (end - start),
                     fc='k',
                     ec='k')

        if exons is None:
            ax.add_patch(
                patches.Rectangle((CDS_start, 0.75 - row * 0.15),
                                  CDS_end - CDS_start,
                                  0.10,
                                  edgecolor='0.1',
                                  facecolor='0.1'))
        else:
            exon_boxes = [
                patches.Rectangle((exon_start, 0.775 - row * 0.15),
                                  exon_stop - exon_start,
                                  0.10,
                                  edgecolor='0.1',
                                  facecolor='0.1')
                for exon_start, exon_stop in exons
            ]
            for box in exon_boxes:
                ax.add_patch(box)

        ax.get_yaxis().set_ticks([])
    return strand
Example #35
0
def peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism,
                         base_dir, name):
    """Find the peaks that coincide with a detected branch point.

    Branch tables are built from ``branch_bams[0]`` and, when exactly
    two BAM files are given, from ``branch_bams[1]`` as well. In the
    two-file case the per-sample '.bed' files (BAM name with
    '_sorted.bam' replaced by '.bed') are concatenated with ``cat`` into
    '<name>_all_branches.bed' and then deleted.

    Branch rows whose 'genome coord' occurs in ``peak_df`` are kept,
    merged with the peak 'type', indexed by 'branch coord' and written
    to '<base_dir><name>_peaks_w_branch.csv' / '.pickle'.

    Returns the merged DataFrame.
    """
    def load_branch_df(bam):
        # One BAM file -> branch point table.
        found = SP.list_branch_points(bam, gff3, fa_dict, organism=organism)
        return SP.create_branch_df(found, gff3, fa_dict, organism=organism)

    branch_df = load_branch_df(branch_bams[0])
    if len(branch_bams) == 2:
        branch_df = branch_df.append(load_branch_df(branch_bams[1]))

        bed1 = branch_bams[0].split('_sorted.bam')[0] + '.bed'
        bed2 = branch_bams[1].split('_sorted.bam')[0] + '.bed'
        call("cat {0} {1} > {2}_all_branches.bed".format(bed1, bed2, name),
             shell=True)

        for bed in (bed1, bed2):
            os.remove(bed)

    # Compare peaks and branches
    has_peak = branch_df['genome coord'].isin(peak_df['genome coord'])
    peaks_w_branch = branch_df[has_peak].merge(
        peak_df[['type', 'genome coord']],
        right_on='genome coord',
        left_on='genome coord',
        how='left')
    peaks_w_branch.index = peaks_w_branch['branch coord']

    print("\nPeaks with corresponding branches:")
    print(len(peaks_w_branch))
    n_coords = len(set(peaks_w_branch['genome coord']))
    print(str(n_coords) + " unpredicted")

    out_prefix = base_dir + name
    peaks_w_branch.to_csv(out_prefix + '_peaks_w_branch.csv')
    peaks_w_branch.to_pickle(out_prefix + '_peaks_w_branch.pickle')

    return peaks_w_branch
Example #36
0
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Add pyrimidine-content scores upstream of the intron 3' end to ``df``.

    For every row, two 15-base windows are taken next to the second
    element of 'annotated intron coords' and of 'junction coords'
    (read on the transcript strand; '-' strand windows are
    reverse-complemented) and scored with percent_py. Rows whose
    annotated coordinate is None get NaN for the annotated scores.

    Four columns are added to ``df`` in place and ``df`` is returned.
    ``alt_column_name`` and ``from_branches`` are accepted but unused.
    """
    annotated_near = []
    annotated_far = []
    alternative_near = []
    alternative_far = []

    for _, row in df.iterrows():
        strand = row['strand']
        chrom = row['chromosome']
        coord = row['annotated intron coords'][1]
        alt_coord = row['junction coords'][1]
        annotated = coord is not None

        if strand == '+':
            if annotated:
                seq1 = fa_dict[chrom][coord-15:coord]
                seq2 = fa_dict[chrom][coord-30:coord-15]
            alt1 = fa_dict[chrom][alt_coord-15:alt_coord]
            alt2 = fa_dict[chrom][alt_coord-30:alt_coord-15]
        elif strand == '-':
            if annotated:
                seq1 = SP.reverse_complement(fa_dict[chrom][coord:coord+15])
                seq2 = SP.reverse_complement(fa_dict[chrom][coord+15:coord+30])
            alt1 = SP.reverse_complement(fa_dict[chrom][alt_coord:alt_coord+15])
            alt2 = SP.reverse_complement(fa_dict[chrom][alt_coord+15:alt_coord+30])

        alternative_near.append(percent_py(alt1))
        alternative_far.append(percent_py(alt2))

        if annotated:
            annotated_near.append(percent_py(seq1))
            annotated_far.append(percent_py(seq2))
        else:
            annotated_near.append(np.nan)
            annotated_far.append(np.nan)

    df['Py score annotated -15:0'] = annotated_near
    df['Py score annotated -30:-15'] = annotated_far
    df['Py score alternative -15:0'] = alternative_near
    df['Py score alternative -30:-15'] = alternative_far
    return df
Example #37
0
def score_PyTract(df, fa_dict, alt_column_name=None, from_branches=False):
    """Score the pyrimidine content of two 15-base windows per intron end.

    Each row of ``df`` provides 'strand', 'chromosome', 'annotated
    intron coords' and 'junction coords'; the second element of each
    coordinate pair anchors the windows. Windows lie before the
    coordinate on '+' and after it (then reverse-complemented) on '-'.
    percent_py scores each window; a None annotated coordinate yields
    NaN for both annotated columns.

    The four 'Py score ...' columns are written onto ``df`` itself,
    which is also returned. ``alt_column_name`` and ``from_branches``
    are not used.
    """
    py_annotated_1, py_annotated_2 = [], []
    py_alt_1, py_alt_2 = [], []

    for _, record in df.iterrows():
        strand = record['strand']
        chrom = record['chromosome']
        coord = record['annotated intron coords'][1]
        alt_coord = record['junction coords'][1]

        if strand == '+':
            if coord is not None:
                seq1 = fa_dict[chrom][coord - 15:coord]
                seq2 = fa_dict[chrom][coord - 30:coord - 15]
            alt1 = fa_dict[chrom][alt_coord - 15:alt_coord]
            alt2 = fa_dict[chrom][alt_coord - 30:alt_coord - 15]
        elif strand == '-':
            if coord is not None:
                raw1 = fa_dict[chrom][coord:coord + 15]
                raw2 = fa_dict[chrom][coord + 15:coord + 30]
                seq1 = SP.reverse_complement(raw1)
                seq2 = SP.reverse_complement(raw2)
            raw_alt1 = fa_dict[chrom][alt_coord:alt_coord + 15]
            raw_alt2 = fa_dict[chrom][alt_coord + 15:alt_coord + 30]
            alt1 = SP.reverse_complement(raw_alt1)
            alt2 = SP.reverse_complement(raw_alt2)

        py_alt_1.append(percent_py(alt1))
        py_alt_2.append(percent_py(alt2))

        if coord is None:
            py_annotated_1.append(np.nan)
            py_annotated_2.append(np.nan)
        else:
            py_annotated_1.append(percent_py(seq1))
            py_annotated_2.append(percent_py(seq2))

    df['Py score annotated -15:0'] = py_annotated_1
    df['Py score annotated -30:-15'] = py_annotated_2
    df['Py score alternative -15:0'] = py_alt_1
    df['Py score alternative -30:-15'] = py_alt_2
    return df
Example #38
0
def peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism,
                           base_dir, name):
    """Match peaks to exon-exon junctions from two junction BED files.

    Junction tables are built from ``junc_beds[0]`` and ``junc_beds[1]``,
    combined, compared against ``peak_df`` and scored. The result is
    indexed by 'genome coord', sorted so rows with junction type
    'Annotated' come first, given an 'intron tuple' column of
    (transcript, annotated intron size) and written to
    '<base_dir><name>_peaks_w_junc.csv' / '.pickle'.

    Returns the resulting DataFrame.
    """
    # Load in junctions
    replicate_dfs = [
        SP.build_junction_df(junc_beds[rep], gff3, fa_dict, organism=organism)
        for rep in (0, 1)
    ]
    junc_df = SP.combine_junctions(replicate_dfs[0], replicate_dfs[1])

    # Compare peaks and junctions
    peaks_w_junc = SP.compare_peak_junc_df(peak_df, junc_df, organism=organism)
    peaks_w_junc = SP.score_peaks(peaks_w_junc, gff3, fa_dict)

    # Reformat dataframe - add index, sort so that the annotated intron is first in each cluster
    peaks_w_junc.index = peaks_w_junc['genome coord']
    is_annotated = peaks_w_junc['junction type'] == 'Annotated'
    peaks_w_junc['type index'] = np.where(is_annotated, 0, 1)
    peaks_w_junc = peaks_w_junc.sort_values('type index')
    # NOTE(review): the grouped result is not assigned, so this call does
    # not change peaks_w_junc -- confirm whether de-duplication was intended.
    peaks_w_junc.groupby(peaks_w_junc.index).first()
    peaks_w_junc = peaks_w_junc.drop(['index', 'type index'], axis=1)
    peaks_w_junc['intron tuple'] = zip(
        peaks_w_junc['transcript'].tolist(),
        peaks_w_junc['annotated intron size'].tolist())

    print("\nPeaks with corresponding exon-exon junctions:")
    print(len(peaks_w_junc))
    not_prime = peaks_w_junc[~peaks_w_junc['type'].str.contains('prime')]
    print(str(len(set(not_prime['genome coord']))) + " unpredicted")

    out_prefix = base_dir + name
    peaks_w_junc.to_csv(out_prefix + '_peaks_w_junc.csv')
    peaks_w_junc.to_pickle(out_prefix + '_peaks_w_junc.pickle')

    return peaks_w_junc
Example #39
0
def peaks_only(config_file, untagged, organism):
    """Run peak calling and quantitation from a configuration file.

    Each line of ``config_file`` is classified in this order: a line
    containing ``untagged`` is the untagged peak file; a line containing
    'changepoint' or 'peak' (case-insensitive) is a tagged peak file; a
    line containing 'quant' is a comma-separated quantitation entry
    whose last field is the key and first field the BAM file.

    Outputs are written next to the config file using the part of its
    name before '_config' as prefix: '<prefix>_all_peaks.pickle',
    '<prefix>_quantitation.pickle' and '<prefix>_quantitation.csv'.
    Scatter plots are produced by SP.SP_pipeline_scatters.

    Nothing is returned.
    """
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            lowered = line.lower()
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            #bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_file.split('/')[-1].split('_config')[0]
    base_dir = config_file.split(name)[0]
    if base_dir == '':
        base_dir = './'
    out_prefix = base_dir + name
    print("Output file location and prefix: " + out_prefix)

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                      CP_out[0],
                                      CP_out[1],
                                      gff3,
                                      fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(out_prefix + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams,
                                         quant_df,
                                         gff3,
                                         organism=organism)

    quant_df.to_pickle(out_prefix + '_quantitation.pickle')
    quant_df.to_csv(out_prefix + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
Example #40
0
def build_tss_dict(gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3", window=220):
    """Build a window of +/- ``window`` bases around each transcript start.

    The start is element 0 of the transcript entry for '+' strand
    transcripts and element 1 for '-' strand ones. Transcripts with any
    other strand value are left out.

    Returns a dict: transcript -> [window start, window end, strand, chromosome].
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    tss_dict = {}
    for tx, info in transcript_dict.iteritems():
        strand = info[2]
        if strand == '+':
            tss = info[0]
        elif strand == '-':
            tss = info[1]
        else:
            continue
        tss_dict[tx] = [tss-window, tss+window, strand, info[3]]
    return tss_dict
Example #41
0
def find_3p_site(branch_df, gff3, organism=None):
    """Attach the 3' splice site and derived distances to each branch row.

    For every row, the introns of the transcript (its name minus the
    last two characters) are searched for one whose first coordinate is
    within one base of the row's '5p splice site'; that intron's second
    coordinate becomes the '3p splice site'. Rows without a match get
    NaN.

    Adds '3p splice site', 'intron size' and 'Branch to 3p distance'
    (both absolute values) to ``branch_df`` in place and returns it.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for _, row in branch_df.iterrows():
        three_site = np.nan
        for intron in ss_dict[row['transcript'][:-2]]:
            if row['5p splice site'] in range(intron[0]-1,intron[0]+2):
                three_site = intron[1]
                break
        three_coord.append(three_site)

    branch_df['3p splice site'] = three_coord

    intron_span = branch_df['5p splice site']-branch_df['3p splice site']
    branch_df['intron size'] = intron_span
    branch_df['intron size'] = branch_df['intron size'].apply(abs)

    branch_gap = branch_df['branch site']-branch_df['3p splice site']
    branch_df['Branch to 3p distance'] = branch_gap
    branch_df['Branch to 3p distance'] = branch_df['Branch to 3p distance'].apply(abs)

    return branch_df
Example #42
0
def write_transcript_fasta(transcript_dict, fasta_dict, prefix='transcripts', sense=True, spliced=False):
    """Write one FASTA record per transcript to '<prefix>.fa'.

    With ``spliced`` False the sequence is the slice [start-1:end] of
    the chromosome, reverse-complemented for '-' strand transcripts.
    With ``spliced`` True the segments given by the start/end lists at
    positions 4 and 5 of the transcript entry are concatenated in list
    order, each segment reverse-complemented on the '-' strand.
    With ``sense`` False the final sequence is reverse-complemented
    once more.

    Returns a dict: transcript -> sequence.
    """
    seq_dict = {}
    for transcript, values in transcript_dict.iteritems():
        start, end = values[0], values[1]
        strand, chrom = values[2], values[3]
        CDS_start_list, CDS_end_list = values[4], values[5]

        if spliced is False:
            seq = fasta_dict[chrom][start-1:end]
            if strand == '-':
                seq = SP.reverse_complement(seq)

        elif spliced is True:
            seq = ''
            for n in range(len(CDS_start_list)):
                if strand == '+':
                    seq = seq+fasta_dict[chrom][CDS_start_list[n]-1:CDS_end_list[n]]
                elif strand == '-':
                    segment = fasta_dict[chrom][CDS_start_list[n]-1:CDS_end_list[n]]
                    seq = seq+SP.reverse_complement(segment)

        if sense is False:
            seq = SP.reverse_complement(seq)

        seq_dict[transcript] = seq

    with open('{}.fa'.format(prefix), 'w') as fout:
        for record_name, record_seq in seq_dict.iteritems():
            fout.write('>'+record_name+'\n')
            fout.write(record_seq+'\n')

    return seq_dict
Example #43
0
def build_tss_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        window=220):
    """Return a window around the start of every transcript in ``gff3``.

    For '+' strand transcripts the window is centred on element 0 of
    the transcript entry, for '-' strand transcripts on element 1; it
    extends ``window`` bases to either side. Entries with another
    strand value are skipped.

    Result: dict of transcript -> [start, end, strand, chromosome].
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    # Index of the transcript-start coordinate for each strand.
    anchor_index = {'+': 0, '-': 1}

    tss_dict = {}
    for tx, info in transcript_dict.iteritems():
        if info[2] == '+' or info[2] == '-':
            anchor = info[anchor_index[info[2]]]
            tss_dict[tx] = [anchor - window, anchor + window, info[2], info[3]]
    return tss_dict
Example #44
0
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Flatten a branch-point dictionary into a DataFrame.

    ``branch_dict`` maps a transcript to entries of the form
    ("chrom:position", branch positions, depths). A branch is kept when
    it lies more than 5 and at most 1000 bases from the 5' splice site
    and has a depth of at least 5. 'distance' is signed along the
    transcript strand, and only rows with a positive distance survive.

    'genome coord' and 'branch coord' ("chrom:position") columns are
    added, then add_seq and find_3p_site extend the table.

    Returns the resulting DataFrame.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    columns = ['transcript', 'chromosome', '5p splice site', 'branch site',
               'depth', 'distance', 'strand']
    records = {column: [] for column in columns}

    for tx, five_sites in branch_dict.iteritems():
        for five_site in five_sites:
            coord_fields = five_site[0].split(':')
            chrom = coord_fields[0]
            pos = int(coord_fields[1])
            for n, branch_pos in enumerate(five_site[1]):
                offset = abs(branch_pos - pos)
                if not (5 < offset <= 1000 and five_site[2][n] >= 5):
                    continue
                strand = tx_dict[tx][2]
                records['chromosome'].append(chrom)
                records['5p splice site'].append(pos)
                records['transcript'].append(tx)
                records['branch site'].append(branch_pos)
                records['depth'].append(five_site[2][n])
                records['strand'].append(strand)
                if strand == '+':
                    records['distance'].append(branch_pos - pos)
                elif strand == '-':
                    records['distance'].append(pos - branch_pos)

    branch_df = pd.DataFrame(index=range(len(records['5p splice site'])))
    for column in columns:
        branch_df[column] = records[column]

    branch_df = branch_df[branch_df['distance'] > 0]
    five_text = branch_df['5p splice site'].apply(int).apply(str)
    branch_df['genome coord'] = branch_df['chromosome'].str.cat(five_text,
                                                                 sep=':')
    branch_text = branch_df['branch site'].apply(int).apply(str)
    branch_df['branch coord'] = branch_df['chromosome'].str.cat(branch_text,
                                                                 sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    branch_df = find_3p_site(branch_df, gff3, organism=organism)
    return branch_df
Example #45
0
def find_polyA_sites(transcript_dict, window=220):
    """Locate the strongest poly(A) signal of each transcript.

    Signal per transcript is read with SP.read_CNAGsort_bedgraph2 from a
    fixed bedgraph path. Only positive values are considered; the index
    of the largest value is taken as the poly(A) site, for '+' and '-'
    strand transcripts alike. Transcripts without positive signal, or
    with another strand value, are left out.

    Returns a dict: transcript -> [site - window, site + window, strand, chromosome].
    """
    polyA_bg = SP.read_CNAGsort_bedgraph2('/home/jordan/GENOMES/POMBE/polyA_sites_CNAGsort.bedgraph', transcript_dict, organism='pombe')
    pA_dict = {}
    for tx, signal in polyA_bg.iteritems():
        signal = signal[signal > 0]
        if len(signal) == 0:
            continue
        strand = transcript_dict[tx][2]
        if strand == '+' or strand == '-':
            ranked = signal.sort_values(ascending=False)
            pA_site = ranked.index[0]
            pA_dict[tx] = [pA_site-window, pA_site+window, strand, transcript_dict[tx][3]]
    return pA_dict
Example #46
0
def build_transcript_dict(
        gff3="/home/jordan/GENOMES/POMBE/schizosaccharomyces_pombe.chr.gff3",
        expand=False,
        convert_chroms=False):
    """Build the pombe transcript dictionary, optionally converted/expanded.

    convert_chroms: when True, chromosome names are translated through
        a fixed table ('chr1' -> 'I', 'chr2' -> 'II', 'chr3' -> 'III',
        'MT' -> 'MT'); any other name raises KeyError.
    expand: when True, every transcript is widened by 300 bases on both
        sides, clipped at 0 and, for chromosomes with a known length,
        at the chromosome end. Empty start/end lists (elements 4 and 5)
        are replaced by the transcript's own start/end.

    Returns a dict: transcript -> [start, end, strand, chromosome,
    start list, end list].
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')

    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III', 'MT': 'MT'}

    if convert_chroms is True:
        converted = {}
        for k, entry in transcript_dict.items():
            start, end, strand, chrom, cds_start, cds_end = entry
            converted[k] = [start, end, strand, lat_rom[chrom], cds_start, cds_end]
        transcript_dict = converted

    chrom_lengths = {
        'I': 5818680,
        'II': 4744158,
        'III': 2598968,
        'chr1': 5818680,
        'chr2': 4744158,
        'chr3': 2598968
    }

    if expand is True:
        expanded_dict = {}
        for tx, info in transcript_dict.iteritems():
            new_start = max(info[0] - 300, 0)
            new_end = info[1] + 300
            if info[3] in chrom_lengths:
                new_end = min(new_end, chrom_lengths[info[3]])
            # Entries without start/end lists fall back to the
            # transcript boundaries.
            if len(info[4]) == 0:
                info[4] = [info[0]]
            if len(info[5]) == 0:
                info[5] = [info[1]]
            expanded_dict[tx] = [
                new_start, new_end, info[2], info[3], info[4], info[5]
            ]
        transcript_dict = expanded_dict

    return transcript_dict
Example #47
0
def add_cdf_to_plot(ax,
                    value_lists,
                    label_list,
                    color_list,
                    ks_list,
                    log2=False):
    """Draw one cumulative distribution curve per value list on ``ax``.

    Each list is optionally log2-transformed, stripped of values whose
    string form is 'inf', '-inf' or 'nan', converted to a CDF with
    SP.cdf_values and plotted with the matching colour and label. The
    x-axis is limited to the 1st-99th percentile of all retained values.

    ks_list: None, or a sequence of strings that is printed as a
        "p-values:" annotation at the lower right; the first two entries
        are always used, the third and fourth only when there are
        exactly four.

    Returns ``ax``.
    """
    pooled = []

    for n, values in enumerate(value_lists):
        if log2 is True:
            values = [np.log2(x) for x in values]
        finite = [x for x in values if str(x) not in ('inf', '-inf', 'nan')]
        pooled.extend(finite)
        cumulative, base = SP.cdf_values(finite)
        ax.plot(base[1:],
                cumulative,
                c=color_list[n],
                linewidth=3.0,
                label=label_list[n])

    xmin = np.percentile(pooled, 1)
    xmax = np.percentile(pooled, 99)
    ax.set_xlim([xmin, xmax])
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12)

    if ks_list is not None:
        text = "p-values:    \n" + ks_list[0] + '    \n' + ks_list[1] + '    '
        if len(ks_list) == 4:
            text = text + '    \n' + ks_list[2] + '    \n' + ks_list[3] + '    '
        ax.annotate(text,
                    xy=(xmax, 0.0),
                    horizontalalignment='right',
                    fontsize=12)

    return ax
Example #48
0
def create_branch_df(branch_dict, gff3, fa_dict, organism=None):
    """Turn per-transcript branch-point calls into a table.

    Every entry of ``branch_dict[tx]`` holds a "chrom:position" 5'
    splice site, a list of branch positions and a parallel list of
    depths. Branches are retained when 5 < |branch - site| <= 1000 and
    depth >= 5. The 'distance' column is positive when the branch lies
    downstream of the 5' site on the transcript strand; other rows are
    dropped.

    The table gains 'genome coord' / 'branch coord' strings and is then
    passed through add_seq and find_3p_site before being returned.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    tx_names, chrom_names, five_positions = [], [], []
    branch_positions, branch_depths = [], []
    strand_values, signed_distances = [], []

    for tx, five_sites in branch_dict.iteritems():
        for five_site in five_sites:
            location = five_site[0].split(':')
            chrom = location[0]
            pos = int(location[1])
            for n in range(len(five_site[1])):
                branch_pos = five_site[1][n]
                separation = abs(branch_pos-pos)
                if separation > 5 and separation <= 1000 and five_site[2][n] >= 5:
                    strand = tx_dict[tx][2]
                    chrom_names.append(chrom)
                    five_positions.append(pos)
                    tx_names.append(tx)
                    branch_positions.append(branch_pos)
                    branch_depths.append(five_site[2][n])
                    strand_values.append(strand)
                    if strand == '+':
                        signed_distances.append(branch_pos-pos)
                    elif strand == '-':
                        signed_distances.append(pos-branch_pos)

    branch_df = pd.DataFrame(index = range(len(five_positions)))
    branch_df['transcript'] = tx_names
    branch_df['chromosome'] = chrom_names
    branch_df['5p splice site'] = five_positions
    branch_df['branch site'] = branch_positions
    branch_df['depth'] = branch_depths
    branch_df['distance'] = signed_distances
    branch_df['strand'] = strand_values

    downstream = branch_df['distance'] > 0
    branch_df = branch_df[downstream]
    branch_df['genome coord'] = branch_df['chromosome'].str.cat(branch_df['5p splice site'].apply(int).apply(str), sep=':')
    branch_df['branch coord'] = branch_df['chromosome'].str.cat(branch_df['branch site'].apply(int).apply(str), sep=':')

    branch_df = add_seq(branch_df, fa_dict)
    branch_df = find_3p_site(branch_df, gff3, organism=organism)
    return branch_df
Example #49
0
def by_pos_plots(df, metrics=['Intermediate Level', 'Precursor']):
    '''Score the splice site bases of introns that go up or down per metric.

    For each direction ('Up', 'Down') and metric, the rows whose
    ('All', metric+' change') value equals the direction are selected. When
    more than 5 rows qualify, their 'Base 5' columns and their 'Base 3' columns
    are concatenated into one string per row and handed to
    SP.position_wise_scores2.

    Parameters
    ----------
    df : pandas.DataFrame
            Dataframe with two level column labels
    metrics : list of str
            Metric names used to find the ('All', metric+' change') columns'''
    five_cols = [c for c in df.columns if 'Base 5' in c[1]]
    three_cols = [c for c in df.columns if 'Base 3' in c[1]]

    for direction in ('Up', 'Down'):
        for metric in metrics:
            subset = df[df[('All', metric + ' change')] == direction]
            if len(subset) <= 5:
                continue

            # Join the per position base columns into one sequence per row
            for n, col in enumerate(five_cols):
                s5 = subset[col] if n == 0 else s5.str.cat(subset[col])
            print(len(s5))

            for n, col in enumerate(three_cols):
                s3 = subset[col] if n == 0 else s3.str.cat(subset[col])

            print(metric + ' ' + direction)
            SP.position_wise_scores2(s5, s3, 'crypto')
Example #50
0
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in 5' and 3' splice site sequences.

    For each end a 2 x n summary is printed: row 0 holds the median score of
    each position, row 1 half of the score range (max - min) of each position.

    Parameters
    ----------
    seq_5p : iterable of str
            5' splice site sequences; None entries are ignored
    seq_3p : iterable of str
            3' splice site sequences; None entries are ignored
    organism : str
            Passed to SP.find_organism_files to locate annotation and genome

    Returns
    ------
    all_5p : numpy.ndarray
            Scores with one row per 5' sequence and one column per position
    all_3p : numpy.ndarray
            Scores with one row per 3' sequence and one column per position

    Raises
    ------
    IndexError
            If no sequences remain after dropping None entries
    KeyError
            If a sequence contains a base other than A, C, T or G'''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_dict = {"A":0, "C":1, "T":2, "G":3}
    
    seq_5p = [x for x in seq_5p if x is not None]
    seq_3p = [x for x in seq_3p if x is not None]
    
    # The width is taken from the first sequence. Positions that a shorter
    # sequence does not cover are left uninitialised by np.empty.
    all_5p = np.empty([len(seq_5p), len(seq_5p[0])])
    all_3p = np.empty([len(seq_3p), len(seq_3p[0])])
    
    for n, seq in enumerate(seq_5p):
        for a, base in enumerate(seq):
            all_5p[n,a] = PSSM_5p[base_dict[base], a]
    
    # Column wise median and half range in one vectorised step each
    score_5prime = np.vstack([np.median(all_5p, axis=0),
                              np.ptp(all_5p, axis=0)/2.])
    print(score_5prime)
    
    for m, seq in enumerate(seq_3p):
        for b, base in enumerate(seq):
            all_3p[m,b] = PSSM_3p[base_dict[base], b]
    
    score_3prime = np.vstack([np.median(all_3p, axis=0),
                              np.ptp(all_3p, axis=0)/2.])
    print(score_3prime)
    
    return all_5p, all_3p
Example #51
0
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in 5' and 3' splice site sequences.

    For each end a 2 x n summary is printed: row 0 holds the median score of
    each position, row 1 half of the score range (max - min) of each position.

    Parameters
    ----------
    seq_5p : iterable of str
            5' splice site sequences; None entries are ignored
    seq_3p : iterable of str
            3' splice site sequences; None entries are ignored
    organism : str
            Passed to SP.find_organism_files to locate annotation and genome

    Returns
    ------
    all_5p : numpy.ndarray
            Scores with one row per 5' sequence and one column per position
    all_3p : numpy.ndarray
            Scores with one row per 3' sequence and one column per position

    Raises
    ------
    IndexError
            If no sequences remain after dropping None entries
    KeyError
            If a sequence contains a base other than A, C, T or G'''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}

    seq_5p = [x for x in seq_5p if x is not None]
    seq_3p = [x for x in seq_3p if x is not None]

    # The width is taken from the first sequence. Positions that a shorter
    # sequence does not cover are left uninitialised by np.empty.
    all_5p = np.empty([len(seq_5p), len(seq_5p[0])])
    all_3p = np.empty([len(seq_3p), len(seq_3p[0])])

    for n, seq in enumerate(seq_5p):
        for a, base in enumerate(seq):
            all_5p[n, a] = PSSM_5p[base_dict[base], a]

    # Column wise median and half range in one vectorised step each
    score_5prime = np.vstack(
        [np.median(all_5p, axis=0),
         np.ptp(all_5p, axis=0) / 2.])
    print(score_5prime)

    for m, seq in enumerate(seq_3p):
        for b, base in enumerate(seq):
            all_3p[m, b] = PSSM_3p[base_dict[base], b]

    score_3prime = np.vstack(
        [np.median(all_3p, axis=0),
         np.ptp(all_3p, axis=0) / 2.])
    print(score_3prime)

    return all_5p, all_3p
Example #52
0
def make_transcript_df(gff3):
    '''Creates a dataframe with all annotated transcripts from the gff3 file
    
    The organism is set to 'pombe' when that word occurs in the file name
    (case insensitive) and to None otherwise.
    
    Parameters
    ----------
    gff3 : str
            Your favorite annotation file
            
    Returns
    ------
    df : pandas.DataFrame
            Pandas dataframe instance indexed by transcript name (sorted) with
            columns 'start', 'end', 'strand', 'chromosome', 'CDS start' (lowest
            CDS start) and 'CDS end' (highest CDS end). The CDS columns are NaN
            for transcripts without CDS entries.'''
    
    if 'pombe' in gff3.lower():
        organism='pombe'
    else:
        organism=None
    
    # Get transcript dictionary
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    
    # Organize by transcript
    tx_dict = OrderedDict(sorted(tx_dict.items(), key=lambda t: t[0]))
    
    # Materialise once: dict views cannot be indexed on Python 3 and there is
    # no need to transpose the whole dictionary again for every column
    records = list(tx_dict.values())
    
    # Convert to dataframe
    fields = ['start','end','strand','chromosome']
    tx_df = pd.DataFrame(index=list(tx_dict.keys()), columns=fields)
    for n, col in enumerate(fields):
        tx_df.loc[:,col] = [record[n] for record in records]
    
    # Add CDS starts and ends (np.nan: the np.NaN alias is gone in NumPy 2.0)
    CDS_starts = [min(record[4]) if len(record[4]) > 0 else np.nan for record in records]
    CDS_ends = [max(record[5]) if len(record[5]) > 0 else np.nan for record in records]
    tx_df.loc[:,'CDS start'] = CDS_starts
    tx_df.loc[:,'CDS end'] = CDS_ends
    
    return tx_df
Example #53
0
def log2_Zscore_df(df, wt, mut, metrics=['Intermediate Level', 'Precursor'], Z=2, by_pos_scores=False):
    '''Label introns whose metrics change between two genotypes and plot them.

    Rows whose mutant '-A' columns sum to less than 10 are dropped. For every
    metric each replicate column gets a ' log2' companion column, a log2 ratio
    (mut/wt) column per replicate is added under 'All', and the replicates are
    pooled and handed to s_log2_ratio_Zscore. A row is labelled 'Up', 'Down'
    or 'Other' in ('All', metric+' change') only when all replicates agree;
    otherwise the label stays None. One 2x2 figure is shown per metric.

    Parameters
    ----------
    df : pandas.DataFrame or str
            Dataframe with two level column labels and string index labels, or
            the path of a pickled dataframe. The input frame is not modified.
    wt : str
            First level column label of the wild type samples
    mut : str
            First level column label of the mutant samples
    metrics : list of str
            Substrings identifying the measurement columns to compare
    Z : number, default 2
            Rows with Z >= this value are 'Up', rows with Z <= -Z are 'Down'
    by_pos_scores : bool, default False
            If True, also call SP.by_pos_plots on the result

    Returns
    ------
    new_df : pandas.DataFrame or None
            Copy of the input with the added columns, including
            ('Peaks','predicted'), which is True where ('Peaks','type')
            contains 'prime'. None if the number of wild type and mutant
            replicate columns differs for a metric.

    Raises
    ------
    ValueError
            If a metric matches no replicate columns for wt and mut

    Notes
    -----
    The replicate number is recovered from the last character of the pooled
    index labels, so at most 10 replicates can be told apart, and the top row
    of the figure only has room for two replicates.'''
    if isinstance(df, str):
        # NOTE: unpickling can execute arbitrary code, only load trusted files
        new_df = pd.read_pickle(df)
    else:
        new_df = copy.deepcopy(df)
        
    print(len(new_df))
    mutA = [x for x in new_df.columns if (x[0] == mut) and (x[1][-2:] == '-A')]
    # Copy so the columns added below go to an independent frame
    new_df = new_df[new_df[mutA].sum(axis=1) >= 10].copy()
    print(len(new_df))
    
    for metric in metrics:
        columns = [x for x in new_df.columns if (metric in x[1]) and ('avg' not in x[1])]
        wt_cols = [x for x in columns if x[0] == wt]
        mut_cols = [x for x in columns if x[0] == mut]
        
        for column in columns:
            new_df[(column[0], column[1]+' log2')] = s_log2(new_df[column])
            
        if len(wt_cols) != len(mut_cols):
            print("Number of WT reps must match number of mutant reps!")
            print(wt_cols)
            print(mut_cols)
            return None
        
        if not wt_cols:
            # Without this check the pooled series of the previous metric
            # would be reused silently (or a NameError raised on the first one)
            raise ValueError("No replicate columns found for metric '%s'" % metric)
        
        wt_parts = []
        mut_parts = []
        for n, (wt_col, mut_col) in enumerate(zip(wt_cols, mut_cols)):
            new_df[('All',metric+' log2 ratio'+str(n+1))] = s_log2(new_df[mut_col]/new_df[wt_col])
            # Tag every label with the replicate number so replicates can be pooled
            new_index = [x+'-'+str(n) for x in new_df.index]
            
            # Re-index copies, never the series that belong to new_df
            wt_s = new_df[wt_col].copy()
            wt_s.index = new_index
            mut_s = new_df[mut_col].copy()
            mut_s.index = new_index
            
            wt_parts.append(wt_s)
            mut_parts.append(mut_s)
        
        # pd.concat replaces the chained Series.append calls (removed in pandas 2.0)
        wt_s_for_Z = pd.concat(wt_parts)
        mut_s_for_Z = pd.concat(mut_parts)
        Zlist = s_log2_ratio_Zscore(wt_s_for_Z.dropna(), mut_s_for_Z.dropna())
        
        for n, wt_col in enumerate(wt_cols):
            in_rep = Zlist.index.str[-1] == str(n)
            
            # x[:-2] strips the '-<replicate>' tag again
            n_up = [x[:-2] for x in Zlist[in_rep & (Zlist >= Z)].index]
            n_down = [x[:-2] for x in Zlist[in_rep & (Zlist <= -1*Z)].index]
            n_other = [x[:-2] for x in Zlist[in_rep & (Zlist < Z) & (Zlist > -1*Z)].index]
            
            if n == 0:
                up = set(n_up)
                down = set(n_down)
                other = set(n_other)
            
            else:
                # Keep only rows that every replicate puts in the same class
                up = up.intersection(n_up)
                up = up.difference(n_down).difference(n_other)
                down = down.intersection(n_down)
                down = down.difference(n_up).difference(n_other)
                other = other.intersection(n_other)
                other = other.difference(n_up).difference(n_down)
        
            print(len(up))
            print(len(down))
        
        # Lists, because recent pandas versions reject sets as indexers
        change_col = ('All',metric+' change')
        new_df[change_col] = None
        new_df.loc[list(up), change_col] = 'Up'
        new_df.loc[list(down), change_col] = 'Down'
        new_df.loc[list(other), change_col] = 'Other'
        
        plot_df = copy.deepcopy(new_df)
        
        fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(8,8))
        groups = {'Other':'0.8','Up':'tomato','Down':'cornflowerblue'}
        for group in ['Other','Up','Down']:
            gr_df = plot_df[plot_df[change_col] == group]
            # Groups with fewer than 15 rows are not drawn
            if len(gr_df) >= 15:
                for n, wt_col in enumerate(wt_cols):
                    ax[0][n].scatter(s_log2(gr_df[wt_col]), s_log2(gr_df[mut_cols[n]]), 
                                color=groups[group], alpha=0.9, label=group, s=20)

                    ax[0][n].set_xlabel(wt_col[0]+' log2 '+metric, fontsize=12)
                    ax[0][n].set_ylabel(mut_cols[n][0]+' log2 '+metric, fontsize=12)
                    ax[0][n].set_title('Replicate '+str(n+1), fontsize=14)

                    ax[0][n], limits = SP.draw_diagonal(ax[0][n])
                    ax[0][n].legend(fontsize=12)

                sns.kdeplot(gr_df[('Peaks','intron size')], ax=ax[1][0], bw=2, cumulative=True, linewidth=3, 
                            color=groups[group], label=group)
                ax[1][0].set_xlim([30, 400])

                sns.kdeplot(gr_df[('Peaks','5p score')], ax=ax[1][1], bw=2, cumulative=True, linewidth=3, 
                            color=groups[group], label=group)

                ax[1][0].set_xlabel('Intron size (nt)')
                ax[1][0].set_ylabel('Fraction of introns')
                ax[1][1].set_xlabel('5prime splice site score')
                ax[1][1].set_ylabel('Fraction of introns')
        
        ax[1][1].set_xlim([np.percentile(plot_df[('Peaks','5p score')], 0.5),
                              np.percentile(plot_df[('Peaks','5p score')], 99.9)+5])
            
        fig.tight_layout()
        plt.show()
        plt.clf()

    if by_pos_scores is True:
        SP.by_pos_plots(new_df, metrics=metrics)
    
    new_df[('Peaks','predicted')] = True
    new_df.loc[~new_df[('Peaks','type')].str.contains('prime'), ('Peaks','predicted')] = False    
    return new_df
Example #54
0
def quant_from_peak_df(
        peak_df,
        gff3,
        fa_dict,
        organism=None,
        branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    '''Pair each peak with a 3' splice site and score the resulting junction.

    Peaks of type '3prime' and peaks that look like 'AG' are not quantified.
    A peak whose type contains 'prime' is matched to the first intron of its
    transcript whose 5' site lies within [position - 5, position + 5). Any
    other peak is paired with the tallest peak of type 'AG' in the same
    transcript, if there is one. Peaks without a 3' site are dropped.

    Parameters
    ----------
    peak_df : pandas.DataFrame
            Needs the columns 'type', 'looks like', 'chromosome', 'position',
            'transcript', 'strand', 'height' and 'index'
    gff3 : str
            Annotation file
    fa_dict : dict
            Chromosome name -> sequence
    organism : str, default None
            Passed to SP.list_splice_sites and SP.backfill_splice_sites
    branch_file : str
            File passed to SP.find_score_branches_ppy; defaults to the path
            that used to be hard coded here

    Returns
    ------
    new_quant_df : pandas.DataFrame
            Indexed by 'chromosome:position', with the added columns
            'intron size', 'alt splicing', '5p score', '3p score', 'seq5' and
            'seq3', after SP.backfill_splice_sites and
            SP.find_score_branches_ppy have been applied'''
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Copy so that columns are added to an independent frame, not to a slice
    quant_df = peak_df[(peak_df['type'] != '3prime')
                       & (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Several peaks can share a coordinate; the tallest one represents it.
        # The index holds strings, so the old .ix[0] was positional: .iloc[0].
        peak = quant_df[quant_df.index == coord].sort_values(
            'height', ascending=False).iloc[0]
        three_site = None
        alt3 = False
        introns = ss_dict[peak['transcript']]

        # Peaks of type 'AG' that belong to the same transcript
        tx_AG = quant_df[(quant_df['transcript'] == peak['transcript'])
                         & (quant_df['type'] == 'AG')]

        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(tx_AG) > 0
        elif len(tx_AG) > 0:
            # The original test was "'AG' in <Series>", which looks at the
            # index labels (genome coordinates) and therefore never matched.
            # NOTE(review): this branch was unreachable before the fix;
            # validate the rows it now contributes.
            five_site = peak['position']
            three_site = tx_AG.sort_values('height',
                                           ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        # Minus strand windows are mirrored and reverse complemented
        chrom_seq = fa_dict[peak['chromosome']]
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site +
                                                 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    # Rows come out in the order the loop visited them, so the collected
    # lists line up with the rows.
    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df,
                                            gff3,
                                            fa_dict,
                                            pssm,
                                            organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)

    return new_quant_df
Example #55
0
def main():
    '''Quantitate the junctions of a peak table in every sample of a config.

    Command line arguments:
    sys.argv[1] : config file, also used to derive the output prefix (the
                  part before '_config'). Each line will be :
                  bam_file,genotype,sample
                  e.g. CM763-A_sorted.bam,WT,A1
    sys.argv[2] : CSV file with the peak table, index in the first column
    sys.argv[3] : organism, passed to SP.find_organism_files

    Writes <prefix>_quant_df.csv and <prefix>_quant_df.pickle and calls
    SP.SP_quant_scatters. Returns None early if expected columns are missing
    from the peak table.'''

    bam_dict = {}
    with open(sys.argv[1], 'r') as config:
        for line in config:
            # Blank lines (e.g. a trailing newline) carry no sample
            if not line.strip():
                continue
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()

            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = sys.argv[1].split('_config')[0]

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = [
        '5p score', 'exon size (us)', 'exon size (ds)',
        'introns in transcript', 'type', 'transcript size', 'intron size',
        'chromosome', 'position', 'alt splicing', '3p score', 'transcript',
        'intron position', 'strand', 'peak', 'Base 5-0', 'Base 5-1',
        'Base 5-2', 'Base 5-3', 'Base 5-4', 'Base 5-5', 'Base 5-6', 'Base 5-7',
        'Base 3-0', 'Base 3-1', 'Base 3-2', 'Base 3-3', 'Base 3-4', 'Base 3-5',
        'Base 3-6', 'Base 3-7', 'branch score', 'branch to 3p distance',
        'percent pPy', 'branch-0', 'branch-1', 'branch-2', 'branch-3',
        'branch-4'
    ]

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    final_df = copy.deepcopy(quant_df)
    final_df.columns = pd.MultiIndex.from_product([['Peaks'],
                                                   final_df.columns])

    # Defined up front so the final call also works for an empty config
    W = False
    for genotype, samples in bam_dict.items():
        # Determine if whole cell extract samples are present (more than one
        # sample name containing "W")
        W = sum(1 for sample in samples if "W" in sample) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        new_columns = [x for x in new_df.columns if x not in columns]
        new_df = new_df[new_columns]
        new_df.columns = pd.MultiIndex.from_product([[genotype],
                                                     new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix + '_quant_df.csv')
    final_df.to_pickle(prefix + '_quant_df.pickle')

    # NOTE(review): W is the value of whichever genotype was processed last
    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
Example #56
0
def _count_reads_for_bam(reader, df, tx_dict, organism):
    '''Count transcript and intron reads of one open BAM file for every row of df.'''
    lat_rom = {'chr1': 'I', 'chr2': 'II', 'chr3': 'III'}
    tx_suffix = '.1' if organism == 'pombe' else 'T0'

    counts = pd.DataFrame(index=df.index, columns=['total', 'intron'])

    for tx in set(df['transcript']):
        tx_df = df[df['transcript'] == tx]

        start, end, strand, CDS_start, CDS_end, exons, chrom = SP.tx_info(
            tx + tx_suffix, tx_dict)
        if organism == 'pombe':
            chrom = lat_rom[chrom]

        tx_iter = reader.fetch(chrom, start, end)

        # Row label -> [intron start, intron end (exclusive), read count].
        # Bounds are compared directly instead of building a list of every
        # position and searching it for each read.
        introns = {}
        for ix, r in tx_df.iterrows():
            if strand == '+':
                intron_start = int(r['position'])
                intron_end = int(r['position'] + r['intron size']) + 1
            elif strand == '-':
                intron_start = int(r['position'] - r['intron size'])
                intron_end = int(r['position']) + 1
            introns[ix] = [intron_start, intron_end, 0]

        reads = 0
        for read in tx_iter:
            # Only reverse reads count for '+' transcripts and forward reads
            # for '-' transcripts
            if read.is_reverse and strand == '+':
                pos = read.reference_end
            elif not read.is_reverse and strand == '-':
                pos = read.reference_start
            else:
                continue

            reads += 1
            if pos is None:
                continue
            for bounds in introns.values():
                if bounds[0] <= pos < bounds[1]:
                    bounds[2] += 1

        for ix in introns:
            try:
                counts.loc[ix, 'total'] = reads / float(end - start) * 1000
                counts.loc[ix, 'intron'] = (
                    (introns[ix][2] / float(tx_df.loc[ix, 'intron size'])) /
                    (reads / float(end - start)))
            except ZeroDivisionError:
                # No reads, zero length transcript or zero length intron
                counts.loc[ix, 'total'] = np.nan
                counts.loc[ix, 'intron'] = np.nan
                print(ix)

    return counts


def count_reads_in_transcript(bam_files, df, gff3, organism=None):
    '''Count reads per transcript and per intron in a set of BAM files.

    Parameters
    ----------
    bam_files : iterable of str
            BAM files to count in
    df : pandas.DataFrame
            Needs the columns 'transcript', 'position' and 'intron size'.
            Transcript names are extended with '.1' (pombe) or 'T0' before
            they are looked up in the annotation.
    gff3 : str
            Annotation file
    organism : str, default None
            'pombe' additionally converts chr1/chr2/chr3 to I/II/III

    Returns
    ------
    all_reads : dict
            BAM file name -> pandas.DataFrame indexed like df with the columns
            'total' (reads per 1000 positions of the transcript) and 'intron'
            (intron read density divided by transcript read density). Both
            are NaN for rows where a division by zero occurred.'''
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    all_reads = {}
    for bam_file in bam_files:
        reader = pysam.Samfile(bam_file)
        try:
            all_reads[bam_file] = _count_reads_for_bam(reader, df, tx_dict,
                                                       organism)
        finally:
            # The handles used to stay open until garbage collection
            reader.close()

    return all_reads
Example #57
0
def log2_Zscore_df(df,
                   wt,
                   mut,
                   metrics=['Intermediate Level', 'Precursor'],
                   Z=2,
                   by_pos_scores=False):
    '''Label introns whose metrics change between two genotypes and plot them.

    Rows whose mutant '-A' columns sum to less than 10 are dropped. For every
    metric each replicate column gets a ' log2' companion column, a log2 ratio
    (mut/wt) column per replicate is added under 'All', and the replicates are
    pooled and handed to s_log2_ratio_Zscore. A row is labelled 'Up', 'Down'
    or 'Other' in ('All', metric + ' change') only when all replicates agree;
    otherwise the label stays None. One 2x2 figure is shown per metric.

    Parameters
    ----------
    df : pandas.DataFrame or str
            Dataframe with two level column labels and string index labels, or
            the path of a pickled dataframe. The input frame is not modified.
    wt : str
            First level column label of the wild type samples
    mut : str
            First level column label of the mutant samples
    metrics : list of str
            Substrings identifying the measurement columns to compare
    Z : number, default 2
            Rows with Z >= this value are 'Up', rows with Z <= -Z are 'Down'
    by_pos_scores : bool, default False
            If True, also call SP.by_pos_plots on the result

    Returns
    ------
    new_df : pandas.DataFrame or None
            Copy of the input with the added columns, including
            ('Peaks', 'predicted'), which is True where ('Peaks', 'type')
            contains 'prime'. None if the number of wild type and mutant
            replicate columns differs for a metric.

    Raises
    ------
    ValueError
            If a metric matches no replicate columns for wt and mut

    Notes
    -----
    The replicate number is recovered from the last character of the pooled
    index labels, so at most 10 replicates can be told apart, and the top row
    of the figure only has room for two replicates.'''
    if isinstance(df, str):
        # NOTE: unpickling can execute arbitrary code, only load trusted files
        new_df = pd.read_pickle(df)
    else:
        new_df = copy.deepcopy(df)

    print(len(new_df))
    mutA = [x for x in new_df.columns if (x[0] == mut) and (x[1][-2:] == '-A')]
    # Copy so the columns added below go to an independent frame
    new_df = new_df[new_df[mutA].sum(axis=1) >= 10].copy()
    print(len(new_df))

    for metric in metrics:
        columns = [
            x for x in new_df.columns
            if (metric in x[1]) and ('avg' not in x[1])
        ]
        wt_cols = [x for x in columns if x[0] == wt]
        mut_cols = [x for x in columns if x[0] == mut]

        for column in columns:
            new_df[(column[0], column[1] + ' log2')] = s_log2(new_df[column])

        if len(wt_cols) != len(mut_cols):
            print("Number of WT reps must match number of mutant reps!")
            print(wt_cols)
            print(mut_cols)
            return None

        if not wt_cols:
            # Without this check the pooled series of the previous metric
            # would be reused silently (or a NameError raised on the first one)
            raise ValueError("No replicate columns found for metric '%s'" %
                             metric)

        wt_parts = []
        mut_parts = []
        for n, (wt_col, mut_col) in enumerate(zip(wt_cols, mut_cols)):
            new_df[('All', metric + ' log2 ratio' + str(n + 1))] = s_log2(
                new_df[mut_col] / new_df[wt_col])
            # Tag every label with the replicate number so replicates can be
            # pooled
            new_index = [x + '-' + str(n) for x in new_df.index]

            # Re-index copies, never the series that belong to new_df
            wt_s = new_df[wt_col].copy()
            wt_s.index = new_index
            mut_s = new_df[mut_col].copy()
            mut_s.index = new_index

            wt_parts.append(wt_s)
            mut_parts.append(mut_s)

        # pd.concat replaces the chained Series.append calls (removed in
        # pandas 2.0)
        wt_s_for_Z = pd.concat(wt_parts)
        mut_s_for_Z = pd.concat(mut_parts)
        Zlist = s_log2_ratio_Zscore(wt_s_for_Z.dropna(), mut_s_for_Z.dropna())

        for n, wt_col in enumerate(wt_cols):
            in_rep = Zlist.index.str[-1] == str(n)

            # x[:-2] strips the '-<replicate>' tag again
            n_up = [x[:-2] for x in Zlist[in_rep & (Zlist >= Z)].index]
            n_down = [x[:-2] for x in Zlist[in_rep & (Zlist <= -1 * Z)].index]
            n_other = [
                x[:-2]
                for x in Zlist[in_rep & (Zlist < Z) & (Zlist > -1 * Z)].index
            ]

            if n == 0:
                up = set(n_up)
                down = set(n_down)
                other = set(n_other)

            else:
                # Keep only rows that every replicate puts in the same class
                up = up.intersection(n_up)
                up = up.difference(n_down).difference(n_other)
                down = down.intersection(n_down)
                down = down.difference(n_up).difference(n_other)
                other = other.intersection(n_other)
                other = other.difference(n_up).difference(n_down)

            print(len(up))
            print(len(down))

        # Lists, because recent pandas versions reject sets as indexers
        change_col = ('All', metric + ' change')
        new_df[change_col] = None
        new_df.loc[list(up), change_col] = 'Up'
        new_df.loc[list(down), change_col] = 'Down'
        new_df.loc[list(other), change_col] = 'Other'

        plot_df = copy.deepcopy(new_df)

        fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(8, 8))
        groups = {'Other': '0.8', 'Up': 'tomato', 'Down': 'cornflowerblue'}
        for group in ['Other', 'Up', 'Down']:
            gr_df = plot_df[plot_df[change_col] == group]
            # Groups with fewer than 15 rows are not drawn
            if len(gr_df) >= 15:
                for n, wt_col in enumerate(wt_cols):
                    ax[0][n].scatter(s_log2(gr_df[wt_col]),
                                     s_log2(gr_df[mut_cols[n]]),
                                     color=groups[group],
                                     alpha=0.9,
                                     label=group,
                                     s=20)

                    ax[0][n].set_xlabel(wt_col[0] + ' log2 ' + metric,
                                        fontsize=12)
                    ax[0][n].set_ylabel(mut_cols[n][0] + ' log2 ' + metric,
                                        fontsize=12)
                    ax[0][n].set_title('Replicate ' + str(n + 1), fontsize=14)

                    ax[0][n], limits = SP.draw_diagonal(ax[0][n])
                    ax[0][n].legend(fontsize=12)

                sns.kdeplot(gr_df[('Peaks', 'intron size')],
                            ax=ax[1][0],
                            bw=2,
                            cumulative=True,
                            linewidth=3,
                            color=groups[group],
                            label=group)
                ax[1][0].set_xlim([30, 400])

                sns.kdeplot(gr_df[('Peaks', '5p score')],
                            ax=ax[1][1],
                            bw=2,
                            cumulative=True,
                            linewidth=3,
                            color=groups[group],
                            label=group)

                ax[1][0].set_xlabel('Intron size (nt)')
                ax[1][0].set_ylabel('Fraction of introns')
                ax[1][1].set_xlabel('5prime splice site score')
                ax[1][1].set_ylabel('Fraction of introns')

        ax[1][1].set_xlim([
            np.percentile(plot_df[('Peaks', '5p score')], 0.5),
            np.percentile(plot_df[('Peaks', '5p score')], 99.9) + 5
        ])

        fig.tight_layout()
        plt.show()
        plt.clf()

    if by_pos_scores is True:
        SP.by_pos_plots(new_df, metrics=metrics)

    new_df[('Peaks', 'predicted')] = True
    new_df.loc[~new_df[('Peaks', 'type')].str.contains('prime'),
               ('Peaks', 'predicted')] = False
    return new_df