Ejemplo n.º 1
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    '''Build a quantitation dataframe of candidate introns from peak calls.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Peak calls; must contain 'type', 'looks like', 'chromosome',
        'position', 'height', 'transcript', 'strand' and 'index' columns.
    gff3 : str
        Path to the GFF3 annotation file.
    fa_dict : dict
        Chromosome name -> genome sequence string.
    organism : str, optional
        Passed through to SP.list_splice_sites.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'genome coord' (chromosome:position), one row per peak
        paired with a 3' splice site, with intron size, alt-splicing flag,
        5'/3' PSSM scores, splice-site sequences and branch scores.
    '''
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep peaks that are neither annotated 3' ends nor AG-like (3'SS-like).
    # .copy() so the column assignments below do not hit a view of peak_df.
    quant_df = peak_df[(peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [], '5p score': [],
                   '3p score': [], 'seq5': [], 'seq3': []}
    new_index = []

    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Collapse duplicate coordinates to the tallest peak.
            # iloc replaces deprecated .ix: selection is positional by design,
            # never a label lookup against the string genome-coord index.
            coord_df = coord_df.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[coord_df['transcript']]

        if 'prime' in coord_df['type']:
            # 5' peak: match it to an annotated intron start within +/-5 nt.
            peak_range = range(coord_df['position'] - 5, coord_df['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            # Alternative 3' splicing if the same transcript also has an AG peak.
            if len(quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                            (quant_df['type'] == 'AG')]) > 0:
                alt3 = True
        else:
            # Non-5' peak: pair with the tallest AG peak on the same transcript.
            # Compare against .values: `in` on a Series tests the index labels
            # (genome coords here), not the data, so the original test never hit.
            if 'AG' in quant_df[quant_df['transcript'] == coord_df['transcript']]['type'].values:
                five_site = coord_df['position']
                three_df = quant_df[(quant_df['transcript'] == coord_df['transcript']) &
                                    (quant_df['type'] == 'AG')]
                three_df = three_df.sort_values('height', ascending=False)
                three_site = three_df.iloc[0]['position']

        if three_site is not None:
            new_index.append(coord)
            # Intron size in kilobases.
            size = abs(three_site - five_site) / 1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)

            # 8 nt windows around each splice site; reverse-complemented on
            # the minus strand so sequences always read 5'->3'.
            if coord_df['strand'] == '+':
                s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
            elif coord_df['strand'] == '-':
                s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                s5 = SP.reverse_complement(s5)
                s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                s3 = SP.reverse_complement(s3)
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][['genome coord', 'chromosome',
                                                             'strand', 'transcript',
                                                             'position', 'type']].copy()
    for column, data in column_dict.iteritems():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(subset='genome coord',
                                                keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)

    # NOTE(review): hard-coded branch file path ties this to one local
    # S. cerevisiae installation -- consider making it a parameter.
    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)

    return new_quant_df
Ejemplo n.º 2
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae'''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            if 'junctions.bed' in line.lower():
                junc_beds.append(line.strip())
            elif 'branch' in line.lower():
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            #bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                quant_bams[line.split(',')[-1].strip()] = line.split(',')[0]

    name = sys.argv[1].split('/')[-1].split('_config')[0]
    base_dir = sys.argv[1].split(name)[0]
    if base_dir == '': base_dir = './'
    print "Output file location and prefix: " + base_dir + name

    print "\nJunction bed files"
    print junc_beds
    print "\nBranch bam files"

    if len(branch_bams) == 2:
        print branch_bams
        use_branches = True
    elif len(branch_bams) == 0:
        print "No data for branches, continuing with only junctions"
        use_branches = False

    print "\nUntagged peaks"
    print CP_untagged
    print "\nChangepoint peaks"
    print CP_out
    print ''

    if CP_untagged is None:
        print "\n Error: no untagged file indicated"
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    #### Generate peak df
    if name + '_peaks_w_branch.csv' not in os.listdir(
            base_dir) or name + '_peaks_w_junc.csv' not in os.listdir(
                base_dir):
        if name + '_all_peaks.pickle' not in os.listdir(base_dir):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                              CP_out[0],
                                              CP_out[1],
                                              gff3,
                                              fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    if name + '_peaks_w_junc.csv' not in os.listdir(base_dir):
        print "Generating peaks vs. junctions dataframe..."
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)

    else:
        peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle')
        print "Peaks vs. junction dataframe already exists"

    #### Branch to peak comparison
    if use_branches is True:
        if name + '_peaks_w_branch.csv' not in os.listdir(base_dir):
            print "Generating peaks vs. branches dataframe..."
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name +
                                            '_peaks_w_branch.pickle')
            print "Peaks vs. branches dataframe already exists"

    #### Clean up dataframe for quantitation
    if name + '_quantitation.csv' not in os.listdir(base_dir):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc,
                                               peaks_w_branch,
                                               gff3,
                                               fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)
        print "Counting reads in transcripts and at peaks..."
        quant_df = SP.quantitate_junction_df(quant_bams,
                                             quant_df,
                                             gff3,
                                             organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')

        scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name)

    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')
        scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print "\n****Finished****"
Ejemplo n.º 3
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    '''Build a quantitation dataframe of candidate introns from peak calls.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Peak calls; must contain 'type', 'looks like', 'chromosome',
        'position', 'height', 'transcript', 'strand' and 'index' columns.
    gff3 : str
        Path to the GFF3 annotation file.
    fa_dict : dict
        Chromosome name -> genome sequence string.
    organism : str, optional
        Passed through to SP.list_splice_sites.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'genome coord' (chromosome:position), one row per peak
        paired with a 3' splice site, with intron size, alt-splicing flag,
        5'/3' PSSM scores, splice-site sequences and branch scores.
    '''
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep peaks that are neither annotated 3' ends nor AG-like (3'SS-like).
    # .copy() so the column assignments below do not hit a view of peak_df.
    quant_df = peak_df[(peak_df['type'] != '3prime')
                       & (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Collapse duplicate coordinates to the tallest peak.
            # iloc replaces deprecated .ix: selection is positional by design,
            # never a label lookup against the string genome-coord index.
            coord_df = coord_df.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[coord_df['transcript']]

        if 'prime' in coord_df['type']:
            # 5' peak: match it to an annotated intron start within +/-5 nt.
            peak_range = range(coord_df['position'] - 5,
                               coord_df['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            # Alternative 3' splicing if the same transcript also has an AG peak.
            if len(quant_df[(quant_df['transcript'] == coord_df['transcript'])
                            & (quant_df['type'] == 'AG')]) > 0:
                alt3 = True
        else:
            # Non-5' peak: pair with the tallest AG peak on the same transcript.
            # Compare against .values: `in` on a Series tests the index labels
            # (genome coords here), not the data, so the original test never hit.
            if 'AG' in quant_df[quant_df['transcript'] ==
                                coord_df['transcript']]['type'].values:
                five_site = coord_df['position']
                three_df = quant_df[
                    (quant_df['transcript'] == coord_df['transcript'])
                    & (quant_df['type'] == 'AG')]
                three_df = three_df.sort_values('height', ascending=False)
                three_site = three_df.iloc[0]['position']

        if three_site is not None:
            new_index.append(coord)
            # Intron size in kilobases.
            size = abs(three_site - five_site) / 1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)

            # 8 nt windows around each splice site; reverse-complemented on
            # the minus strand so sequences always read 5'->3'.
            if coord_df['strand'] == '+':
                s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
            elif coord_df['strand'] == '-':
                s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                s5 = SP.reverse_complement(s5)
                s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                s3 = SP.reverse_complement(s3)
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.iteritems():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df,
                                            gff3,
                                            fa_dict,
                                            pssm,
                                            organism=organism)

    # NOTE(review): hard-coded branch file path ties this to one local
    # S. cerevisiae installation -- consider making it a parameter.
    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt',
        fa_dict)

    return new_quant_df
Ejemplo n.º 4
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae'''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}
    
    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            if 'junctions.bed' in line.lower():
                junc_beds.append(line.strip())
            elif 'branch' in line.lower():
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            #bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                quant_bams[line.split(',')[-1].strip()] = line.split(',')[0]

    name = sys.argv[1].split('/')[-1].split('_config')[0]
    base_dir = sys.argv[1].split(name)[0]
    if base_dir == '': base_dir = './'
    print "Output file location and prefix: "+base_dir+name
    
    print "\nJunction bed files"
    print junc_beds
    print "\nBranch bam files"
    
    if len(branch_bams) == 2:
        print branch_bams
        use_branches = True
    elif len(branch_bams) == 0:
        print "No data for branches, continuing with only junctions"
        use_branches = False
    
    print "\nUntagged peaks"
    print CP_untagged
    print "\nChangepoint peaks"
    print CP_out
    print ''
    
    if CP_untagged is None:
        print "\n Error: no untagged file indicated"
        return None
    
    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    
    #### Generate peak df
    if name+'_peaks_w_branch.csv' not in os.listdir(base_dir) or name+'_peaks_w_junc.csv' not in os.listdir(base_dir):
        if name+'_all_peaks.pickle' not in os.listdir(base_dir):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name=name+'_CP_peaks')
            peak_df.to_pickle(base_dir+name+'_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir+name+'_all_peaks.pickle')
    
    #### Junction to peak comparison
    if name+'_peaks_w_junc.csv' not in os.listdir(base_dir):
        print "Generating peaks vs. junctions dataframe..."
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name)
        
    else: 
        peaks_w_junc = pd.read_pickle(base_dir+name+'_peaks_w_junc.pickle')
        print "Peaks vs. junction dataframe already exists"
    
    
    #### Branch to peak comparison
    if use_branches is True:
        if name+'_peaks_w_branch.csv' not in os.listdir(base_dir):
            print "Generating peaks vs. branches dataframe..."
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name)
        else: 
            peaks_w_branch = pd.read_pickle(base_dir+name+'_peaks_w_branch.pickle')
            print "Peaks vs. branches dataframe already exists"
    
    #### Clean up dataframe for quantitation
    if name+'_quantitation.csv' not in os.listdir(base_dir):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict, organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict)
        print "Counting reads in transcripts and at peaks..."
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)
        
        quant_df.to_pickle(base_dir+name+'_quantitation.pickle')
        quant_df.to_csv(base_dir+name+'_quantitation.csv')
        lariat_df.to_pickle(base_dir+name+'_lariats.pickle')
        lariat_df.to_csv(base_dir+name+'_lariats.csv')
        
        scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
    else:
        quant_df = pd.read_pickle(base_dir+name+'_quantitation.pickle')
        scatter = SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
    print "\n****Finished****"