Ejemplo n.º 1
0
def peaks_only(config_file, untagged, organism):
    '''Run peak calling and quantitation using only the peak files in a config.

    Parameters
    ----------
    config_file : str
            Path to the configuration file. Each line is classified by
            substring, in this order: a line containing `untagged` is the
            untagged sample; a line containing "changepoint" or "peak"
            (case-insensitive) is a peak file; a line containing "quant" is a
            bam file for quantitation written as file,quant,sample
            (e.g. file,quant,A1).
    untagged : str
            Substring that identifies the untagged sample line.
    organism : str
            Passed to SP.find_organism_files.

    Returns
    -------
    None. Results are written next to the config file as
    <name>_all_peaks.pickle, <name>_quantitation.pickle and
    <name>_quantitation.csv, where <name> is the config file name up to
    "_config". If the config lacks an untagged line or has fewer than two
    peak files, an error is printed and nothing is written.
    '''
    CP_out = []
    CP_untagged = None
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_file.split('/')[-1].split('_config')[0]
    base_dir = config_file.split(name)[0]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    # Validate before any expensive work; previously a missing untagged line
    # surfaced as a NameError and a short peak list as an IndexError.
    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found "
              + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1],
                                      gff3, fa_dict, name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3,
                                         organism=organism)

    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
    
Ejemplo n.º 2
0
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population

    Parameters
    ----------
    seq5_list : list of 5' splice site sequences for the sample
    seq3_list : list of 3' splice site sequences for the sample
    organism : str, passed to SP.find_organism_files
    title : str, default 'Intron position strength'
            Title drawn above the 5' splice site panel

    Returns
    -------
    fig : matplotlib figure with two bar charts of -log10(p-value) per
          position (5' splice site on top, 3' splice site below). The figure
          is also displayed with plt.show().
    '''

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    def neg_log_pvalues(samp, pop, skipped):
        # One score per column; columns listed in `skipped` are not tested
        # and get a fixed score of 1.
        # NOTE(review): the skipped columns are presumably the invariant
        # splice site dinucleotides -- confirm.
        scores = []
        for n in range(samp.shape[1]):
            if n in skipped:
                scores.append(1)
            else:
                conting = np.array([samp[:, n], pop[:, n]])
                chi2, p, dof, expected = stats.chi2_contingency(conting)
                scores.append(np.log10(p) * -1)
        return scores

    p5 = neg_log_pvalues(samp_5p, pop_5p, (2, 3))
    p3 = neg_log_pvalues(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4,4))
    width = 0.7

    # Shared y limit with 10% headroom so both panels are comparable
    max_y = max(p5+p3) + 0.1*max(p5+p3)

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    # Dashed reference line at -log10(p) = 2, spanning the whole panel
    ax[0].plot([0,len(p5)], [2,2], '--', color='0.7')
    ax[0].set_xlim([0,len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0,max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0,len(p3)], [2,2], '--', color='0.7')
    ax[1].set_xlim([0,len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0,max_y])

    # Each panel gets ticks from its own positions (the 5' panel previously
    # reused the 3' positions).
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    # The tick labels below assume 8 positions per splice site.
    ax[0].set_xticklabels(np.arange(-2,6))
    ax[1].set_xticklabels(np.arange(-5,3))

    fig.tight_layout()
    plt.show()
    return fig
Ejemplo n.º 3
0
def peak_seq_enrichment(df, organism):
    '''Plot dinucleotide enrichment around peaks of type "other" or "intronic".

    For each of the 16 dinucleotides, the fraction of selected peaks whose
    'sequence' has that dinucleotide at [4:6] ("Before peak") or [6:8]
    ("After peak") is compared with the product of the single nucleotide
    frequencies from SP.gc_content, using a one-sample z-test. The resulting
    p-value is plotted as log odds, log((1-p)/p).

    Parameters
    ----------
    df : dataframe with 'type' and 'sequence' columns
    organism : str, passed to SP.find_organism_files

    Returns
    -------
    fig : matplotlib figure with the paired bar chart. Bars are drawn in a
          fixed dinucleotide order (GG, GA, GC, GT, AG, ...).
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    p_dict = {'A':nuc_prob[0], 'T':nuc_prob[2], 'C':nuc_prob[1], 'G':nuc_prob[3]}

    # Single selection instead of DataFrame.append, which newer pandas removed
    unpeaks = df[df['type'].isin(['other', 'intronic'])]
    print("Number of unpredicted peaks:")
    print(len(unpeaks))
    n_peaks = len(unpeaks)

    # A list (not a set) keeps the bar order reproducible between runs
    nucs = ['G','A','C','T']
    dinucs = [first + second for first in nucs for second in nucs]

    after_peak = unpeaks['sequence'].str[6:8]
    before_peak = unpeaks['sequence'].str[4:6]

    five_LO = {}
    three_LO = {}
    for dinuc in dinucs:
        # A two-character slice contains the dinucleotide only when equal
        n_five = int((after_peak == dinuc).sum())
        n_three = int((before_peak == dinuc).sum())

        p_dinuc = p_dict[dinuc[0]]*p_dict[dinuc[1]]
        phat_dinuc = n_five/float(n_peaks)
        phat_dinuc2 = n_three/float(n_peaks)

        SE = np.sqrt(phat_dinuc*(1-phat_dinuc)/n_peaks)
        SE2 = np.sqrt(phat_dinuc2*(1-phat_dinuc2)/n_peaks)
        Z = (phat_dinuc-p_dinuc)/SE
        Z2 = (phat_dinuc2-p_dinuc)/SE2

        pvalue = stats.norm.sf(Z)
        pvalue2 = stats.norm.sf(Z2)

        five_LO[dinuc] = np.log((1-pvalue)/pvalue)
        three_LO[dinuc] = np.log((1-pvalue2)/pvalue2)

    fig, ax = plt.subplots(figsize=(12,6))
    width = 0.35
    ind = np.arange(len(dinucs))
    ax.bar(ind, [three_LO[d] for d in dinucs], width, color='crimson',
           edgecolor='crimson', label='Before peak')
    ax.bar(ind + width, [five_LO[d] for d in dinucs], width, color='indigo',
           edgecolor='indigo', label='After peak')
    ax.plot([-1,17],[0,0],'-', color='black')
    ax.plot([-1,17],[2.94,2.94], '--', color='0.7', label='95% CI')
    ax.plot([-1,17],[-2.94,-2.94], '--', color='0.7')

    ax.set_xlim([-1,17])
    # Tick positions must be fixed before the labels are attached to them
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(dinucs, fontsize=12)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)

    return fig
Ejemplo n.º 4
0
def peaks_only(config_file, untagged, organism):
    '''Run peak calling and quantitation using only the peak files in a config.

    Parameters
    ----------
    config_file : str
            Path to the configuration file. Each line is classified by
            substring, in this order: a line containing `untagged` is the
            untagged sample; a line containing "changepoint" or "peak"
            (case-insensitive) is a peak file; a line containing "quant" is a
            bam file for quantitation written as file,quant,sample
            (e.g. file,quant,A1).
    untagged : str
            Substring that identifies the untagged sample line.
    organism : str
            Passed to SP.find_organism_files.

    Returns
    -------
    None. Results are written next to the config file as
    <name>_all_peaks.pickle, <name>_quantitation.pickle and
    <name>_quantitation.csv, where <name> is the config file name up to
    "_config". If the config lacks an untagged line or has fewer than two
    peak files, an error is printed and nothing is written.
    '''
    CP_out = []
    CP_untagged = None
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_file.split('/')[-1].split('_config')[0]
    base_dir = config_file.split(name)[0]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    # Validate before any expensive work; previously a missing untagged line
    # surfaced as a NameError and a short peak list as an IndexError.
    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found "
              + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                      CP_out[0],
                                      CP_out[1],
                                      gff3,
                                      fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams,
                                         quant_df,
                                         gff3,
                                         organism=organism)

    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
Ejemplo n.º 5
0
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in each splice site sequence.

    None entries are dropped from both sequence lists. For each remaining
    sequence, the score of each base is read from the 5' or 3' matrix returned
    by generate_consensus_matrix(..., PSSM=True). A per-position summary
    (row 0: median, row 1: half of the max-min range) is printed for the 5'
    sequences and then for the 3' sequences.

    Parameters
    ----------
    seq_5p : list of 5' splice site sequences (may contain None)
    seq_3p : list of 3' splice site sequences (may contain None)
    organism : str, passed to SP.find_organism_files

    Returns
    -------
    all_5p, all_3p : arrays with one row per sequence and one column per
                     position, holding the looked-up scores
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_row = {"A":0, "C":1, "T":2, "G":3}

    kept_5p = [seq for seq in seq_5p if seq is not None]
    kept_3p = [seq for seq in seq_3p if seq is not None]

    # Widths come from the first sequence of each list; both are read up
    # front so an empty list fails before any output is printed.
    width_5p = len(kept_5p[0])
    width_3p = len(kept_3p[0])

    def lookup_scores(seqs, pssm, n_cols):
        # One row per sequence, one column per position
        scores = np.empty([len(seqs), n_cols])
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                scores[row, col] = pssm[base_row[base], col]
        return scores

    def summarize(scores, n_cols):
        # Row 0: median per position; row 1: half the range per position
        summary = np.empty([2, n_cols])
        for col in range(n_cols):
            column = scores[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column)-min(column))/2.
        return summary

    all_5p = lookup_scores(kept_5p, PSSM_5p, width_5p)
    print(summarize(all_5p, width_5p))

    all_3p = lookup_scores(kept_3p, PSSM_3p, width_3p)
    print(summarize(all_3p, width_3p))

    return all_5p, all_3p
Ejemplo n.º 6
0
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in each splice site sequence.

    None entries are dropped from both sequence lists. For each remaining
    sequence, the score of each base is read from the 5' or 3' matrix returned
    by generate_consensus_matrix(..., PSSM=True). A per-position summary
    (row 0: median, row 1: half of the max-min range) is printed for the 5'
    sequences and then for the 3' sequences.

    Parameters
    ----------
    seq_5p : list of 5' splice site sequences (may contain None)
    seq_3p : list of 3' splice site sequences (may contain None)
    organism : str, passed to SP.find_organism_files

    Returns
    -------
    all_5p, all_3p : arrays with one row per sequence and one column per
                     position, holding the looked-up scores
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    base_row = {"A": 0, "C": 1, "T": 2, "G": 3}

    kept_5p = [seq for seq in seq_5p if seq is not None]
    kept_3p = [seq for seq in seq_3p if seq is not None]

    # Widths come from the first sequence of each list; both are read up
    # front so an empty list fails before any output is printed.
    width_5p = len(kept_5p[0])
    width_3p = len(kept_3p[0])

    def lookup_scores(seqs, pssm, n_cols):
        # One row per sequence, one column per position
        scores = np.empty([len(seqs), n_cols])
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                scores[row, col] = pssm[base_row[base], col]
        return scores

    def summarize(scores, n_cols):
        # Row 0: median per position; row 1: half the range per position
        summary = np.empty([2, n_cols])
        for col in range(n_cols):
            column = scores[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column) - min(column)) / 2.
        return summary

    all_5p = lookup_scores(kept_5p, PSSM_5p, width_5p)
    print(summarize(all_5p, width_5p))

    all_3p = lookup_scores(kept_3p, PSSM_3p, width_3p)
    print(summarize(all_3p, width_3p))

    return all_5p, all_3p
Ejemplo n.º 7
0
def main():
    '''Each line will be : bam_file,genotype,sample
    e.g. CM763-A_sorted.bam,WT,A1

    Command line arguments
    ----------------------
    sys.argv[1] : config file in the format above; its name up to "_config"
                  is the prefix for the output files
    sys.argv[2] : csv file holding the peak dataframe (first column = index)
    sys.argv[3] : organism, passed to SP.find_organism_files

    Writes <prefix>_quant_df.csv and <prefix>_quant_df.pickle, then draws
    scatter plots with SP.SP_quant_scatters. Returns None early if the csv
    lacks any of the expected columns.
    '''

    bam_dict = {}
    with open(sys.argv[1], 'r') as config:
        for line in config:
            # Blank lines (e.g. a trailing newline) carry no sample
            if not line.strip():
                continue
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()
            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = sys.argv[1].split('_config')[0]

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = ['5p score','exon size (us)','exon size (ds)','introns in transcript','type','transcript size','intron size',
               'chromosome','position','alt splicing','3p score','transcript','intron position','strand','peak',
               'Base 5-0','Base 5-1','Base 5-2','Base 5-3','Base 5-4','Base 5-5','Base 5-6','Base 5-7','Base 3-0',
               'Base 3-1','Base 3-2','Base 3-3','Base 3-4','Base 3-5','Base 3-6','Base 3-7','branch score',
               'branch to 3p distance','percent pPy','branch-0','branch-1','branch-2','branch-3','branch-4']

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    # Peak annotation goes under a top-level 'Peaks' column group; each
    # genotype's quantitation is joined alongside under its own group.
    final_df = quant_df.copy()
    final_df.columns = pd.MultiIndex.from_product([['Peaks'], final_df.columns])

    # Defined up front so an empty config does not fail at the plotting call.
    # NOTE(review): the value passed to SP_quant_scatters is the one from the
    # last genotype iterated -- confirm that is intended.
    W = False
    for genotype, samples in bam_dict.items():
        # Determine if whole cell extract samples are present
        # (more than one sample name containing "W")
        Ws = [x for x in samples if "W" in x]
        W = len(Ws) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        new_columns = [x for x in new_df.columns if x not in columns]
        new_df = new_df[new_columns]
        new_df.columns = pd.MultiIndex.from_product([[genotype], new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix+'_quant_df.csv')
    final_df.to_pickle(prefix+'_quant_df.pickle')

    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
Ejemplo n.º 8
0
def main():
    '''Each line will be : bam_file,genotype,sample
    e.g. CM763-A_sorted.bam,WT,A1

    Command line arguments
    ----------------------
    sys.argv[1] : config file in the format above; its name up to "_config"
                  is the prefix for the output files
    sys.argv[2] : csv file holding the peak dataframe (first column = index)
    sys.argv[3] : organism, passed to SP.find_organism_files

    Writes <prefix>_quant_df.csv and <prefix>_quant_df.pickle, then draws
    scatter plots with SP.SP_quant_scatters. Returns None early if the csv
    lacks any of the expected columns.
    '''

    bam_dict = {}
    with open(sys.argv[1], 'r') as config:
        for line in config:
            # Blank lines (e.g. a trailing newline) carry no sample
            if not line.strip():
                continue
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()
            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = sys.argv[1].split('_config')[0]

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = [
        '5p score', 'exon size (us)', 'exon size (ds)',
        'introns in transcript', 'type', 'transcript size', 'intron size',
        'chromosome', 'position', 'alt splicing', '3p score', 'transcript',
        'intron position', 'strand', 'peak', 'Base 5-0', 'Base 5-1',
        'Base 5-2', 'Base 5-3', 'Base 5-4', 'Base 5-5', 'Base 5-6', 'Base 5-7',
        'Base 3-0', 'Base 3-1', 'Base 3-2', 'Base 3-3', 'Base 3-4', 'Base 3-5',
        'Base 3-6', 'Base 3-7', 'branch score', 'branch to 3p distance',
        'percent pPy', 'branch-0', 'branch-1', 'branch-2', 'branch-3',
        'branch-4'
    ]

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    # Peak annotation goes under a top-level 'Peaks' column group; each
    # genotype's quantitation is joined alongside under its own group.
    final_df = quant_df.copy()
    final_df.columns = pd.MultiIndex.from_product([['Peaks'],
                                                   final_df.columns])

    # Defined up front so an empty config does not fail at the plotting call.
    # NOTE(review): the value passed to SP_quant_scatters is the one from the
    # last genotype iterated -- confirm that is intended.
    W = False
    for genotype, samples in bam_dict.items():
        # Determine if whole cell extract samples are present
        # (more than one sample name containing "W")
        Ws = [x for x in samples if "W" in x]
        W = len(Ws) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        new_columns = [x for x in new_df.columns if x not in columns]
        new_df = new_df[new_columns]
        new_df.columns = pd.MultiIndex.from_product([[genotype],
                                                     new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix + '_quant_df.csv')
    final_df.to_pickle(prefix + '_quant_df.pickle')

    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
Ejemplo n.º 9
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Config lines are classified by substring, in this order: "junctions.bed",
    "branch", the untagged sample name, "changepoint"/"peak", then "quant"
    (written as file,quant,sample). Intermediate and final results are written
    next to the config file with the config name (up to "_config") as prefix;
    steps whose csv output already exists are skipped and reloaded from the
    matching pickle.
    '''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = sys.argv[1].split('/')[-1].split('_config')[0]
    base_dir = sys.argv[1].split(name)[0]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    def output_exists(suffix):
        # Outputs are always written as base_dir + name + suffix
        return os.path.exists(base_dir + name + suffix)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Any other count previously left use_branches undefined and failed
        # later with a NameError.
        print("\n Error: expected 0 or 2 branch bam files, found "
              + str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    #### Generate peak df
    # Only needed when at least one of the comparison tables is missing
    if not output_exists('_peaks_w_branch.csv') or not output_exists(
            '_peaks_w_junc.csv'):
        if not output_exists('_all_peaks.pickle'):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required, "
                      "found " + str(len(CP_out)))
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                              CP_out[0],
                                              CP_out[1],
                                              gff3,
                                              fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    # The csv marks the step as done; the dataframe is reloaded from the
    # pickle of the same name.
    if not output_exists('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3,
                                              fa_dict, organism, base_dir,
                                              name)

    else:
        peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): without branch data None is passed on to
    # SP.make_quant_df / SP.find_score_branches_ppy (previously an unbound
    # name) -- confirm those functions accept None.
    peaks_w_branch = None
    if use_branches is True:
        if not output_exists('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3,
                                                  fa_dict, organism, base_dir,
                                                  name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name +
                                            '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not output_exists('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc,
                                               peaks_w_branch,
                                               gff3,
                                               fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch,
                                              fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams,
                                             quant_df,
                                             gff3,
                                             organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
Ejemplo n.º 10
0
def igv_plots_general(bam_list,
                      gene_list,
                      organism,
                      colors=None,
                      names=None,
                      save_dir=None,
                      unstranded=False,
                      end_only=False,
                      same_yaxis=False,
                      specific_range=None,
                      transcript_direction=True,
                      log_scale=False,
                      rpm=True,
                      PE=False,
                      plot_junctions=False):
    '''Plot stacked read coverage tracks (one per bam file) above a gene diagram.

    One figure is drawn, shown and optionally saved for every entry of gene_list.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if dataframe passed instead of list, will plot introns (must have intron information in datafame)
            if dict passed, it is used in place of the transcript dictionary and its keys are plotted
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (accepted for compatibility; not used by this function)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files
    unstranded : bool, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            Whether to apply log2 to the (normalized) read counts
    rpm : bool, default `True`
            Whether to divide counts by millions of aligned reads (counted with samtools)
    PE : bool, default `False`
            If True, full-read coverage is built with SP.generate_read_series_PE
    plot_junctions : bool, default `False`
            If True, adds one extra axis per bam file showing the output of get_junctions
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Maps between the two chromosome naming schemes, and each strand to its
    # opposite. Used to retry a fetch and to read the other strand.
    fix_info = {
        'I': 'chr1',
        'II': 'chr2',
        'III': 'chr3',
        'chr1': 'I',
        'chr2': 'II',
        'chr3': 'III',
        'chr4': 'IV',
        'chr5': 'V',
        'chr6': 'VI',
        'chr7': 'VII',
        'chr8': 'VIII',
        'chr9': 'IX',
        'chr10': 'X',
        'chr11': 'XI',
        'chr12': 'XII',
        'chr13': 'XIII',
        'chr14': 'XIV',
        'chr15': 'XV',
        'chr16': 'XVI',
        '-': '+',
        '+': '-',
    }
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene + tx_suffix]
            if specific_range[0] == 'end':
                # Transcript end is info[1] on the plus strand, info[0] on minus
                if info[2] == '+':
                    center = info[1]
                else:
                    center = info[0]
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    center = info[1]
                else:
                    center = info[0]
            else:
                center = int(specific_range[0])
            new_tx_dict[gene + tx_suffix] = [center - window, center + window,
                                             info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    # Open bam files and count reads if rpm is True
    open_bams = {}
    total_list = []
    for bam in bam_list:
        open_bams[bam] = pysam.Samfile(bam)
        if rpm is True:
            total = check_output(['samtools', 'view', '-F 0x04', '-c',
                                  bam]).strip()
            total = float(total) / 1000000.
            total_list.append(total)
        else:
            total_list.append(1.)

    # Expand optional arguments to lists if necessary
    colors = list_from_arg(colors, len(bam_list))
    end_only = list_from_arg(end_only, len(bam_list))
    log_scale = list_from_arg(log_scale, len(bam_list))
    unstranded = list_from_arg(unstranded, len(bam_list))

    # Get gene_list from dataframe if gene_list is not a list
    df = None
    if isinstance(gene_list, dict):
        new_tx_dict = gene_list
        gene_list = list(gene_list.keys())

    elif not isinstance(gene_list, list):
        df = gene_list
        gene_list = df.index

    for tx in gene_list:
        # One axis per bam, one for the gene diagram, and optionally one more
        # per bam for junctions
        num_ax = len(bam_list) + 1
        if plot_junctions is True:
            num_ax += len(bam_list)

        fig, ax = plt.subplots(num_ax, figsize=(10, num_ax), sharex=True)
        fig.subplots_adjust(hspace=0)

        # Get transcript info from transcript_dictionary
        if df is None:
            try:
                info = new_tx_dict[tx + tx_suffix]
            except KeyError:
                info = new_tx_dict[tx]
            chrom = info[3]
            start = info[0]
            end = info[1]
            strand = info[2]

        # If dataframe was passed, get plotting information from dataframe instead
        else:
            # Flatten a MultiIndex dataframe to its 'Peaks' columns (only has
            # an effect on the first pass, df is flat afterwards)
            if isinstance(df.columns, pd.MultiIndex):
                new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                df = df[[x for x in df.columns if x[0] == 'Peaks']]
                df.columns = new_columns
            strand = df.loc[tx, 'strand']
            chrom = df.loc[tx, 'chromosome']
            # Window is the intron plus 100 on either side
            if strand == '+':
                start = df.loc[tx, 'position'] - 100
                end = df.loc[tx, 'position'] + df.loc[tx, 'intron size'] + 100
            elif strand == '-':
                start = df.loc[tx, 'position'] - df.loc[tx,
                                                        'intron size'] - 100
                end = df.loc[tx, 'position'] + 100
            start = int(start)
            end = int(end)

            tx = df.loc[tx, 'transcript']

        # Generate read series for each transcript
        max_y = 0
        junc_ymax = 0
        for n, bam in enumerate(bam_list):
            try:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            except ValueError:
                # Retry with the other chromosome naming scheme
                chrom = fix_info[chrom]
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            if end_only[n] is not False:
                s = SP.generate_read_series_A(bam_iter, chrom, start, end,
                                              strand)
                linewidth = 2
            else:
                if PE is False:
                    s = SP.generate_read_series_B(bam_iter, chrom, start, end,
                                                  strand)
                else:
                    s = SP.generate_read_series_PE(bam_iter, chrom, start, end,
                                                   strand)
                linewidth = 1

            # Get reads from otherstrand if the library type is unstranded
            if unstranded[n] is True:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s2 = SP.generate_read_series_A(bam_iter, chrom, start, end,
                                                   fix_info[strand])
                    linewidth = 2
                else:
                    if PE is False:
                        s2 = SP.generate_read_series_B(bam_iter, chrom, start,
                                                       end, fix_info[strand])
                    else:
                        s2 = SP.generate_read_series_PE(
                            bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 1
                s = s.add(s2)

            # Normalize to rpm (will just divide by 1 if rpm is False)
            s = s.divide(total_list[n])
            if log_scale[n] is True:
                s = s.apply(np.log2)

            # Plot!
            ax[n].bar(s.index,
                      s,
                      linewidth=linewidth,
                      color=colors[n],
                      edgecolor=colors[n],
                      zorder=2)
            ax[n].tick_params(axis='both', which='major', labelsize=14)

            max_y = max([max_y, max(s)])

            if plot_junctions is True:
                m = n + len(bam_list)
                intron_dict = get_junctions(open_bams[bam], chrom, start, end,
                                            strand)
                ax[m].plot((start, end), (0, 0), '-', c='k')
                for coords, heights in intron_dict.items():
                    ax[m].plot(coords,
                               heights,
                               '-',
                               linewidth=2,
                               color=colors[n])
                    ax[m].fill_between(coords,
                                       0,
                                       heights,
                                       facecolor=colors[n],
                                       interpolate=True,
                                       alpha=0.5)
                # Track the tallest junction (second entry of each heights
                # value); skipped when no junctions were found
                if same_yaxis is True and intron_dict:
                    junc_ymax = max(
                        [junc_ymax] +
                        [heights[1] for heights in intron_dict.values()])

        # Add diagram of gene below traces
        if tx in tx_dict:
            strand = gene_patches(tx, tx_dict, ax[-1])
            ax[-1].set_xlim(start, end)
        else:
            try:
                # Strip anything after a space and a trailing isoform suffix
                new_tx = tx.split(' ')[0]
                if new_tx[-2] == 'T' or new_tx[-2] == '.':
                    new_tx = new_tx[:-2]
                strand = gene_patches(new_tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            except KeyError:
                print("Transcript unknown")

        # Flip minus strand transcripts if indicated
        flip = transcript_direction is True and strand == '-'
        if flip:
            ax[-1].invert_xaxis()

        # Set x and y limits
        for n in range(len(bam_list)):
            ax[n].set_xlim(start, end)
            if same_yaxis is True:
                ax[n].set_ylim(0, max_y + 0.1 * max_y)

                if plot_junctions is True:
                    ax[n + len(bam_list)].set_ylim(0,
                                                   junc_ymax + 0.1 * junc_ymax)

            # set_xlim above restores the DNA orientation, so the flip has to
            # be re-applied -- but only when transcript_direction asks for it
            if flip:
                ax[n].invert_xaxis()

        ax[0].set_ylabel('RPM', fontsize=16)
        ax[0].set_title(tx, fontsize=16)
        plt.show()

        # Save if indicated
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            # join keeps the file inside save_dir with or without a trailing
            # slash
            fig.savefig(os.path.join(save_dir, tx + '.eps'), format='eps')

        # Close (not just clear) the figure so figures do not accumulate
        # across genes
        plt.close(fig)
Ejemplo n.º 11
0
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism
    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Reads its arguments from sys.argv. Intermediate and final tables are
    written next to the config file, prefixed with the part of the config
    file name that precedes "_config". Existing intermediate files are
    reused instead of being regenerated.

    Returns None. Prints an error and returns early when the arguments or
    the config file contents are unusable.'''
    if len(sys.argv) < 4:
        print(main.__doc__)
        return None
    config_path = sys.argv[1]
    untagged = sys.argv[2]

    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file; the first matching rule wins for each line
    with open(config_path, 'r') as config:
        for line in config:
            entry = line.strip()
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(entry)
            elif 'branch' in lowered:
                branch_bams.append(entry)
            elif untagged in line:
                CP_untagged = entry
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(entry)
            #bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    name = config_path.split('/')[-1].split('_config')[0]
    # Take the directory from the last '/' rather than splitting on `name`,
    # which picks the wrong directory when `name` also occurs earlier in the path
    head, sep, _ = config_path.rpartition('/')
    base_dir = head + sep
    if base_dir == '': base_dir = './'
    print("Output file location and prefix: "+base_dir+name)

    print("\nJunction bed files")
    print(junc_beds)
    print("\nBranch bam files")

    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Previously this case left use_branches unbound and crashed later
        print("\n Error: expected 0 or 2 branch bam files, found "+str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def in_base_dir(suffix):
        # Re-list the directory on every check because earlier steps write files there
        return name+suffix in os.listdir(base_dir)

    #### Generate peak df
    if not in_base_dir('_peaks_w_branch.csv') or not in_base_dir('_peaks_w_junc.csv'):
        if not in_base_dir('_all_peaks.pickle'):
            if len(CP_out) < 2:
                print("\n Error: two changepoint/peak files are required, found "+str(len(CP_out)))
                return None
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name=name+'_CP_peaks')
            peak_df.to_pickle(base_dir+name+'_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir+name+'_all_peaks.pickle')

    #### Junction to peak comparison
    if not in_base_dir('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism, base_dir, name)
    else:
        peaks_w_junc = pd.read_pickle(base_dir+name+'_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # Bound up front so the junction-only path no longer hits a NameError below.
    # NOTE(review): confirm SP.make_quant_df and SP.find_score_branches_ppy accept None here.
    peaks_w_branch = None
    if use_branches is True:
        if not in_base_dir('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism, base_dir, name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir+name+'_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not in_base_dir('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict, organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict)
        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)

        quant_df.to_pickle(base_dir+name+'_quantitation.pickle')
        quant_df.to_csv(base_dir+name+'_quantitation.csv')
        lariat_df.to_pickle(base_dir+name+'_lariats.pickle')
        lariat_df.to_csv(base_dir+name+'_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir+name+'_quantitation.pickle')

    # Scatter plots are produced whether the table was just built or reloaded
    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
Ejemplo n.º 12
0
def _neg_log10_pvalues(sample, population, skip):
    '''Score every column of `sample` against the same column of `population`.

    Each column pair is put through a chi-square contingency test and the
    score is -log10 of the p-value. Column indices listed in `skip` are not
    tested and receive a placeholder score of 1.'''
    scores = []
    for n in range(sample.shape[1]):
        if n in skip:
            scores.append(1)
        else:
            conting = np.array([sample[:, n], population[:, n]])
            chi2, p, dof, expected = stats.chi2_contingency(conting)
            scores.append(np.log10(p) * -1)
    return scores


def position_wise_scores2(seq5_list,
                          seq3_list,
                          organism,
                          title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population

    Parameters
    ----------
    seq5_list : 5' splice site sequences of the sample
    seq3_list : 3' splice site sequences of the sample
    organism : str, passed to SP.find_organism_files
    title : str, title of the upper (5' splice site) panel

    Returns
    -------
    fig : matplotlib figure with one bar plot per splice site; bar height is
          -log10(p-value) per position, with a dashed line drawn at 2
    '''

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    # Positions 2-3 (5' site) and 4-5 (3' site) are excluded from testing
    p5 = _neg_log10_pvalues(samp_5p, pop_5p, skip=(2, 3))
    p3 = _neg_log10_pvalues(samp_3p, pop_3p, skip=(4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4, 4))
    width = 0.7

    # Shared y range with 10% headroom above the tallest bar
    tallest = max(p5 + p3)
    max_y = tallest + 0.1 * tallest

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    # Threshold line spans the plotted range instead of a hard-coded 0-8
    ax[0].plot([0, len(p5)], [2, 2], '--', color='0.7')
    ax[0].set_xlim([0, len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0, max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0, len(p3)], [2, 2], '--', color='0.7')
    ax[1].set_xlim([0, len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0, max_y])

    # Each panel gets ticks from its own positions (the 5' panel used ind3 before)
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    # NOTE(review): labels assume 8 positions per splice site - confirm against seq_list_to_totals
    ax[0].set_xticklabels(np.arange(-2, 6))
    ax[1].set_xticklabels(np.arange(-5, 3))

    fig.tight_layout()
    plt.show()
    return fig
Ejemplo n.º 13
0
def _bam_to_fastq(bam_path):
    '''Convert bam_path to fastq with bamToFastq and return the fastq path.'''
    fastq_path = bam_path[:-len('.bam')]+'.fq'
    # Argument list, because a single command string with shell=False is taken as the program name
    call(['bamToFastq', '-i', bam_path, '-fq', fastq_path], shell=False)
    return fastq_path


def _align_sort_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with Bowtie, then convert, sort and index the alignments.'''
    print("Aligning split reads to the genome with Bowtie")
    bowtie_args = 'bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa --sam {2}.sam'.format(threads, bowtie_index, prefix)
    call(bowtie_args, shell=True)

    # sort and index
    print("Sorting and indexing bam file")
    call('samtools view -Sbo {0}.bam {0}.sam'.format(prefix), shell=True)
    call('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix), shell=True)
    call('samtools index {0}_sorted.bam'.format(prefix), shell=True)


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]
    
    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)
    
    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.

    Arguments are read from sys.argv. Returns None; prints the usage or an
    error message and returns early when the arguments or the config file
    are unusable.
    '''
    if len(sys.argv) < 5:
        print(main.__doc__)
        return None

    unmapped1 = sys.argv[1]
    unmapped2 = sys.argv[2]
    threads = int(sys.argv[3])
    
    # bam inputs are converted to a fastq written next to the bam (x.bam -> x.fq)
    if unmapped1.endswith('.bam'):
        unmapped1 = _bam_to_fastq(unmapped1)
    if unmapped2.endswith('.bam'):
        unmapped2 = _bam_to_fastq(unmapped2)
        
    cat_args = 'cat {0} {1} > unmapped_all.fq'.format(unmapped1, unmapped2)
    call(cat_args, shell=True)
    
    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
        
    # Peak-based calling only runs when both optional arguments are given
    peaks = False
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())
        if CP_untagged is None or len(CP_out) < 2:
            print("\n Error: config file must list the untagged sample and two changepoint/peak files")
            return None
        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict, name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)
    
    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches', threads=threads)
    _align_sort_index('Ann_branches', threads, bowtie_index)
    
    if peaks is True:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        # Second pass only sees reads the annotated pass left unsplit
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs, 'Peak_branches', threads=threads)
        _align_sort_index('Peak_branches', threads, bowtie_index)
Ejemplo n.º 14
0
def igv_plots_general(bam_list, gene_list, organism, colors=None, names=None, save_dir=None, 
                      unstranded=False, end_only=False, same_yaxis=False, specific_range=None, transcript_direction=True,
                     log_scale=False, rpm=True, PE=False, plot_junctions=False):
    '''Plot read coverage for each transcript, one stacked panel per bam file plus a gene diagram.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if a dict is passed, it is used as the transcript dictionary and its keys are plotted
            if dataframe passed instead of list, will plot introns (must have intron information in datafame)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (accepted but not used by this function)
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files.
            The file name is appended directly, so include a trailing separator.
    unstranded : bool, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool or list, default `False`
            If True, np.log2 is applied to the read series before plotting
    rpm : bool, default `True`
            If True, divide each series by the number of mapped reads (counted with samtools) in millions
    PE : bool, default `False`
            If True, build the series with SP.generate_read_series_PE instead of SP.generate_read_series_B
    plot_junctions : bool, default `False`
            If True, add one extra panel per bam file showing the output of get_junctions

    Returns
    -------
    None. Figures are shown with plt.show() and optionally saved as eps.
    '''
    
    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Lookup used to retry a fetch under the other chromosome name and to get the opposite strand
    fix_info = {'I':'chr1','II':'chr2','III':'chr3',
                'chr1':'I','chr2':'II','chr3':'III','chr4':'IV','chr5':'V','chr6':'VI',
                'chr7':'VII','chr8':'VIII','chr9':'IX','chr10':'X','chr11':'XI','chr12':'XII','chr13':'XIII',
                'chr14':'XIV','chr15':'XV','chr16':'XVI','-':'+','+':'-'}
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'
    
    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene+tx_suffix]
            if specific_range[0] == 'end':
                # Transcript end is info[1] on the plus strand and info[0] on the minus strand
                if info[2] == '+':
                    center = info[1]
                else:
                    center = info[0]
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    center = info[1]
                else:
                    center = info[0]
            else:
                center = int(specific_range[0])
            new_tx_dict[gene+tx_suffix] = [center-window, center+window, info[2], info[3]]
    else:
        new_tx_dict = tx_dict
                
    # Open bam files and count reads if rpm is True
    open_bams = {}
    total_list = []
    for bam in bam_list:
        open_bams[bam] = pysam.Samfile(bam)
        if rpm is True:
            # -F 0x04 excludes unmapped reads from the count
            total = check_output(['samtools','view','-F 0x04','-c',bam]).strip()
            total = float(total)/1000000.
            total_list.append(total)
        else:
            total_list.append(1.)
    
    # Expand optional arguments to lists if necessary
    colors = list_from_arg(colors, len(bam_list))
    end_only = list_from_arg(end_only, len(bam_list))
    log_scale = list_from_arg(log_scale, len(bam_list))
    unstranded = list_from_arg(unstranded, len(bam_list))
    
    # Get gene_list from dataframe if gene_list is not a list
    df = None
    if isinstance(gene_list, dict):
        new_tx_dict = gene_list
        gene_list = gene_list.keys()
        
    elif not isinstance(gene_list, list):
        df = gene_list
        gene_list = df.index
    
    for tx in gene_list:
        num_ax = len(bam_list)+1
        if plot_junctions is True:
            num_ax += len(bam_list)
        
        fig, ax = plt.subplots(num_ax, figsize=(10,num_ax), sharex=True)
        fig.subplots_adjust(hspace=0)
        
        # Get transcript info from transcript_dictionary
        if df is None:
            try:
                info = new_tx_dict[tx+tx_suffix]
            except KeyError:
                info = new_tx_dict[tx]
            chrom = info[3]
            start = info[0]
            end = info[1]
            strand = info[2]
        
        # If dataframe was passed, get plotting information from dataframe instead
        else:
            # Flatten a two-level column index down to the 'Peaks' columns (only needed once)
            if isinstance(df.columns, pd.MultiIndex):
                new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                df = df[[x for x in df.columns if x[0] == 'Peaks']]
                df.columns = new_columns
            strand = df.loc[tx,'strand']
            chrom = df.loc[tx,'chromosome']
            # Plot the intron with 100 bp of flanking sequence on each side
            if strand == '+':
                start = df.loc[tx,'position']-100
                end = df.loc[tx,'position'] + df.loc[tx,'intron size']+100
            elif strand == '-':
                start = df.loc[tx,'position']-df.loc[tx,'intron size']-100
                end = df.loc[tx,'position']+100
            start = int(start)
            end = int(end)
            
            tx = df.loc[tx,'transcript']
        
        # Generate read series for each transcript
        max_y = 0
        junc_ymax = 0
        for n, bam in enumerate(bam_list):
            try:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            except ValueError:
                # Retry with the alternative chromosome name
                chrom = fix_info[chrom]
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            if end_only[n] is not False:
                s = SP.generate_read_series_A(bam_iter, chrom, start, end, strand)
                linewidth = 2
            else:
                if PE is False:
                    s = SP.generate_read_series_B(bam_iter, chrom, start, end, strand)
                else:
                    s = SP.generate_read_series_PE(bam_iter, chrom, start, end, strand)
                linewidth = 1
            
            # Get reads from otherstrand if the library type is unstranded
            if unstranded[n] is True:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s2 = SP.generate_read_series_A(bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 2
                else:
                    if PE is False:
                        s2 = SP.generate_read_series_B(bam_iter, chrom, start, end, fix_info[strand])
                    else:
                        s2 = SP.generate_read_series_PE(bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 1
                s = s.add(s2)
            
            # Normalize to rpm (will just divide by 1 if rpm is False)
            s = s.divide(total_list[n])
            if log_scale[n] is True:
                s = s.apply(np.log2)
            
            # Plot!
            ax[n].bar(s.index, s, linewidth=linewidth, color=colors[n], edgecolor=colors[n], zorder=2)
            ax[n].tick_params(axis='both', which='major', labelsize=14)
            
            max_y = max([max_y,max(s)])
            
            if plot_junctions is True:
                m = n+len(bam_list)
                intron_dict = get_junctions(open_bams[bam], chrom, start, end, strand)
                ax[m].plot((start, end),(0,0),'-',c='k')
                for coords, heights in intron_dict.items():
                    ax[m].plot(coords, heights, '-', linewidth=2, color=colors[n])
                    ax[m].fill_between(coords, 0, heights, facecolor=colors[n], interpolate=True, alpha=0.5)
                # Skip when no junctions were found; the second entry of each heights value is compared
                if same_yaxis is True and intron_dict:
                    junc_ymax = max([junc_ymax] + [heights[1] for heights in intron_dict.values()])
            
        # Add diagram of gene below traces
        if tx in tx_dict:
            strand = gene_patches(tx, tx_dict, ax[-1])
            ax[-1].set_xlim(start, end)
        else:
            try:
                # Drop anything after a space and a trailing two-character isoform suffix
                new_tx = tx.split(' ')[0]
                if new_tx[-2] == 'T' or new_tx[-2] == '.':
                    new_tx = new_tx[:-2]
                strand = gene_patches(new_tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            except KeyError:
                print("Transcript unknown")
                
        
        # Flip minus strand transcripts if indicated
        if transcript_direction is True:
            if strand == '-':
                ax[-1].invert_xaxis()

        # Set x and y limits
        for n in range(len(bam_list)):
            ax[n].set_xlim(start, end)
            if same_yaxis is True:
                ax[n].set_ylim(0,max_y+0.1*max_y)
                
                if plot_junctions is True:
                    ax[n+len(bam_list)].set_ylim(0,junc_ymax+0.1*junc_ymax)
            
            # NOTE(review): this flip is not gated by transcript_direction - confirm that is intended
            if strand == '-':
                ax[n].invert_xaxis()

        ax[0].set_ylabel('RPM', fontsize=16)
        ax[0].set_title(tx, fontsize=16)
        #ax[0].get_xaxis().set_ticks([])
        plt.show()
        
        # Save if indicated
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            fig.savefig(save_dir+tx+'.eps', format='eps')
            
        # Release the figure; clearing it would leave one open figure per transcript
        plt.close(fig)

    # Close the bam handles opened above
    for handle in open_bams.values():
        handle.close()
Ejemplo n.º 15
0
def _bam_to_fastq(bam_path):
    '''Convert bam_path to fastq with bamToFastq and return the fastq path.'''
    fastq_path = bam_path[:-len('.bam')] + '.fq'
    # Argument list, because a single command string with shell=False is taken as the program name
    call(['bamToFastq', '-i', bam_path, '-fq', fastq_path], shell=False)
    return fastq_path


def _align_sort_index(prefix, threads, bowtie_index):
    '''Align <prefix>_split.fa with Bowtie, then convert, sort and index the alignments.'''
    print("Aligning split reads to the genome with Bowtie")
    bowtie_args = 'bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa --sam {2}.sam'.format(
        threads, bowtie_index, prefix)
    call(bowtie_args, shell=True)

    # sort and index
    print("Sorting and indexing bam file")
    for samtools_args in ('samtools view -Sbo {0}.bam {0}.sam',
                          'samtools sort {0}.bam -o {0}_sorted.bam',
                          'samtools index {0}_sorted.bam'):
        call(samtools_args.format(prefix), shell=True)


def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]
    
    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)
    
    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.

    Arguments are read from sys.argv. Returns None; prints the usage or an
    error message and returns early when the arguments or the config file
    are unusable.
    '''
    if len(sys.argv) < 5:
        print(main.__doc__)
        return None

    unmapped1 = sys.argv[1]
    unmapped2 = sys.argv[2]
    threads = int(sys.argv[3])

    # bam inputs are converted to a fastq written next to the bam (x.bam -> x.fq)
    if unmapped1.endswith('.bam'):
        unmapped1 = _bam_to_fastq(unmapped1)
    if unmapped2.endswith('.bam'):
        unmapped2 = _bam_to_fastq(unmapped2)

    cat_args = 'cat {0} {1} > unmapped_all.fq'.format(unmapped1, unmapped2)
    call(cat_args, shell=True)

    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    # Peak-based calling only runs when both optional arguments are given
    peaks = False
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())
        if CP_untagged is None or len(CP_out) < 2:
            print("\n Error: config file must list the untagged sample and two changepoint/peak files")
            return None
        peak_df = SP.peak_to_seq_pipeline(CP_untagged,
                                          CP_out[0],
                                          CP_out[1],
                                          gff3,
                                          fa_dict,
                                          name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq',
                     ann_seqs,
                     'Ann_branches',
                     threads=threads)
    _align_sort_index('Ann_branches', threads, bowtie_index)

    if peaks is True:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        # Second pass only sees reads the annotated pass left unsplit
        find_split_reads('Ann_branches_unsplit.fa',
                         peak_seqs,
                         'Peak_branches',
                         threads=threads)
        _align_sort_index('Peak_branches', threads, bowtie_index)
Ejemplo n.º 16
0
def peak_seq_enrichment(df, organism):
    '''Plot log-odds enrichment of each dinucleotide before and after unpredicted peaks.

    Parameters
    ----------
    df : dataframe with a 'type' column and a 'sequence' column; rows whose
         type is 'other' or 'intronic' are treated as unpredicted peaks
    organism : str, passed to SP.find_organism_files

    For every dinucleotide, the fraction of peaks carrying it at sequence
    positions 4-5 ("Before peak") and 6-7 ("After peak") is compared with the
    product of the two single-nucleotide probabilities from SP.gc_content,
    using a one-sided z-test; the p-value is converted to log odds.

    Returns
    -------
    fig : matplotlib figure with paired bars per dinucleotide
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    # NOTE(review): index-to-base mapping depends on the order SP.gc_content returns - confirm
    p_dict = {
        'A': nuc_prob[0],
        'T': nuc_prob[2],
        'C': nuc_prob[1],
        'G': nuc_prob[3]
    }

    # Same rows as concatenating the two types, without DataFrame.append
    unpeaks = df[df['type'].isin(['other', 'intronic'])]
    print("Number of unpredicted peaks:")
    print(len(unpeaks))
    n_peaks = len(unpeaks)

    # One ordered list drives counts, bars and labels, so they cannot drift apart
    nucs = ['G', 'A', 'C', 'T']
    dinucs = [nuc + nuc2 for nuc in nucs for nuc2 in nucs]

    # Two-base windows on either side of the peak position
    after_peak = unpeaks['sequence'].str[6:8]
    before_peak = unpeaks['sequence'].str[4:6]

    five_LO = []
    three_LO = []
    for dinuc in dinucs:
        p_dinuc = p_dict[dinuc[0]] * p_dict[dinuc[1]]
        phat_dinuc = int((after_peak == dinuc).sum()) / float(n_peaks)
        phat_dinuc2 = int((before_peak == dinuc).sum()) / float(n_peaks)

        SE = np.sqrt(phat_dinuc * (1 - phat_dinuc) / n_peaks)
        SE2 = np.sqrt(phat_dinuc2 * (1 - phat_dinuc2) / n_peaks)
        Z = (phat_dinuc - p_dinuc) / SE
        Z2 = (phat_dinuc2 - p_dinuc) / SE2

        # Upper-tail p-value, then log odds of (1 - p) against p
        pvalue = stats.norm.sf(Z)
        pvalue2 = stats.norm.sf(Z2)
        five_LO.append(np.log((1 - pvalue) / pvalue))
        three_LO.append(np.log((1 - pvalue2) / pvalue2))

    fig, ax = plt.subplots(figsize=(12, 6))
    width = 0.35
    ind = np.arange(len(dinucs))
    ax.bar(ind,
           three_LO,
           width,
           color='crimson',
           edgecolor='crimson',
           label='Before peak')
    ax.bar(ind + width,
           five_LO,
           width,
           color='indigo',
           edgecolor='indigo',
           label='After peak')
    ax.plot([-1, 17], [0, 0], '-', color='black')
    # 2.94 is ln(0.95 / 0.05)
    ax.plot([-1, 17], [2.94, 2.94], '--', color='0.7', label='95% CI')
    ax.plot([-1, 17], [-2.94, -2.94], '--', color='0.7')

    ax.set_xlim([-1, 17])
    # Tick positions must exist before their labels are assigned
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(dinucs, fontsize=12)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)

    return fig