def peaks_only(config_file, untagged, organism):
    '''Run only the peak and quantitation steps described by a config file.

    Parameters
    ----------
    config_file : str
        Path to the configuration file. Each line is classified in this order:
        a line containing `untagged` is the untagged peak file; a line
        containing "changepoint" or "peak" (case-insensitive) is a changepoint
        output file; a line containing "quant" is a bam file for quantitation
        written as file,quant,sample (e.g. file,quant,A1).
    untagged : str
        Text that identifies the untagged sample line in the config file.
    organism : str
        Organism name, passed to SP.find_organism_files.

    Returns
    -------
    None
        Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
        <prefix>_quantitation.csv next to the config file, where <prefix> is
        the config file name up to "_config". Returns early (after printing an
        error) when the config has no untagged line or fewer than two
        changepoint/peak lines.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Take the directory as everything before the file name. Splitting the
    # path on the prefix instead picks the wrong directory whenever the prefix
    # also occurs in a directory name (e.g. "run1/run1_config.txt").
    config_name = config_file.split('/')[-1]
    name = config_name.split('_config')[0]
    base_dir = config_file[:len(config_file) - len(config_name)]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    # Fail with a readable message instead of a NameError/IndexError below
    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)
    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population

    Parameters
    ----------
    seq5_list : list
        5' splice site sequences of the sample
    seq3_list : list
        3' splice site sequences of the sample
    organism : str
        Organism name, passed to SP.find_organism_files
    title : str, default 'Intron position strength'
        Title of the upper (5' splice site) panel

    Returns
    -------
    fig : matplotlib figure with two bar plots of -log10(p-value) per position
          (5' splice site on top, 3' splice site below)
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    def neg_log10_p(samp, pop, skip):
        # One score per column: -log10 of the chi2 contingency p-value of
        # sample vs. population counts. Columns in `skip` get a fixed score of 1.
        # NOTE(review): the skipped columns are presumably the invariant
        # splice-site dinucleotide - confirm.
        scores = []
        for n in range(samp.shape[1]):
            if n in skip:
                scores.append(1)
            else:
                conting = np.array([samp[:, n], pop[:, n]])
                chi2, p, dof, expected = stats.chi2_contingency(conting)
                scores.append(np.log10(p) * -1)
        return scores

    p5 = neg_log10_p(samp_5p, pop_5p, (2, 3))
    p3 = neg_log10_p(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4, 4))
    width = 0.7
    max_y = max(p5 + p3) + 0.1 * max(p5 + p3)

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    # Dashed line at -log10(p) = 2, i.e. p = 0.01
    ax[0].plot([0, len(p5)], [2, 2], '--', color='0.7')
    ax[0].set_xlim([0, len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0, max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0, len(p3)], [2, 2], '--', color='0.7')
    ax[1].set_xlim([0, len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0, max_y])

    # Each panel gets ticks from its own positions (the 5' panel previously
    # used the 3' indices) and one label per tick; for the usual 8 positions
    # the labels are -2..5 and -5..2 as before.
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    ax[0].set_xticklabels(np.arange(-2, len(p5) - 2))
    ax[1].set_xticklabels(np.arange(-5, len(p3) - 5))

    fig.tight_layout()
    plt.show()
    return fig
def peak_seq_enrichment(df, organism):
    '''Plot dinucleotide enrichment just before and just after unpredicted peaks.

    Unpredicted peaks are the rows of `df` whose 'type' is 'other' or
    'intronic'. For each of the 16 dinucleotides the observed frequency at
    sequence[4:6] ("before peak") and sequence[6:8] ("after peak") is compared
    with the expected frequency (product of the single-nucleotide
    probabilities from SP.gc_content) using a one-sided normal approximation,
    and the result is plotted as log odds, log((1-p)/p).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'type' and 'sequence' columns
    organism : str
        Organism name, passed to SP.find_organism_files

    Returns
    -------
    fig : matplotlib figure with the grouped bar plot

    Raises
    ------
    ZeroDivisionError if `df` contains no 'other' or 'intronic' peaks
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    p_dict = {'A': nuc_prob[0], 'T': nuc_prob[2], 'C': nuc_prob[1], 'G': nuc_prob[3]}

    # Only counts are taken from this selection, so row order is irrelevant
    unpeaks = df[df['type'].isin(['other', 'intronic'])]
    n_peaks = len(unpeaks)
    print("Number of unpredicted peaks:")
    print(n_peaks)

    # Fixed, sorted order so that bar heights and tick labels always line up
    nucs = ['G', 'A', 'C', 'T']
    dinucs = sorted(nuc + nuc2 for nuc in nucs for nuc2 in nucs)

    after_peak = unpeaks['sequence'].str[6:8]
    before_peak = unpeaks['sequence'].str[4:6]

    def log_odds(n_hits, p_expected):
        # Z-test of the observed proportion against the expected one,
        # expressed as log odds of the one-sided p-value
        phat = n_hits / float(n_peaks)
        SE = np.sqrt(phat * (1 - phat) / n_peaks)
        Z = (phat - p_expected) / SE
        pvalue = stats.norm.sf(Z)
        return np.log((1 - pvalue) / pvalue)

    five_LO = []
    three_LO = []
    for dinuc in dinucs:
        p_dinuc = p_dict[dinuc[0]] * p_dict[dinuc[1]]
        five_LO.append(log_odds(len(unpeaks[after_peak.str.contains(dinuc)]), p_dinuc))
        three_LO.append(log_odds(len(unpeaks[before_peak.str.contains(dinuc)]), p_dinuc))

    fig, ax = plt.subplots(figsize=(12, 6))
    width = 0.35
    ind = np.arange(len(dinucs))
    ax.bar(ind, three_LO, width, color='crimson', edgecolor='crimson', label='Before peak')
    ax.bar(ind + width, five_LO, width, color='indigo', edgecolor='indigo', label='After peak')
    ax.plot([-1, 17], [0, 0], '-', color='black')
    # +/-2.94 is log(0.95/0.05)
    ax.plot([-1, 17], [2.94, 2.94], '--', color='0.7', label='95% CI')
    ax.plot([-1, 17], [-2.94, -2.94], '--', color='0.7')
    ax.set_xlim([-1, 17])
    # Tick positions first, then their labels
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(dinucs, fontsize=12)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)
    return fig
def peaks_only(config_file, untagged, organism):
    '''Run only the peak and quantitation steps described by a config file.

    Parameters
    ----------
    config_file : str
        Path to the configuration file. Each line is classified in this order:
        a line containing `untagged` is the untagged peak file; a line
        containing "changepoint" or "peak" (case-insensitive) is a changepoint
        output file; a line containing "quant" is a bam file for quantitation
        written as file,quant,sample (e.g. file,quant,A1).
    untagged : str
        Text that identifies the untagged sample line in the config file.
    organism : str
        Organism name, passed to SP.find_organism_files.

    Returns
    -------
    None
        Writes <prefix>_all_peaks.pickle, <prefix>_quantitation.pickle and
        <prefix>_quantitation.csv next to the config file, where <prefix> is
        the config file name up to "_config". Returns early (after printing an
        error) when the config has no untagged line or fewer than two
        changepoint/peak lines.
    '''
    CP_untagged = None
    CP_out = []
    quant_bams = {}
    with open(config_file, 'r') as config:
        for line in config:
            if untagged in line:
                CP_untagged = line.strip()
            elif 'changepoint' in line.lower() or 'peak' in line.lower():
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Take the directory as everything before the file name. Splitting the
    # path on the prefix instead picks the wrong directory whenever the prefix
    # also occurs in a directory name (e.g. "run1/run1_config.txt").
    config_name = config_file.split('/')[-1]
    name = config_name.split('_config')[0]
    base_dir = config_file[:len(config_file) - len(config_name)]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    # Fail with a readable message instead of a NameError/IndexError below
    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
        return None

    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                      name=name + '_CP_peaks')
    peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')

    quant_df = SP.quant_from_peak_df(peak_df, gff3, fa_dict, organism=organism)
    quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)
    quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
    quant_df.to_csv(base_dir + name + '_quantitation.csv')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in the given splice site sequences.

    Parameters
    ----------
    seq_5p, seq_3p : lists of 5' and 3' splice site sequences; None entries
                     are dropped. Bases must be A, C, T or G.
    organism : str, passed to SP.find_organism_files

    For each site, the per-position median and half-range ((max - min) / 2) of
    the scores are printed as a 2 x n_positions array.

    Returns
    -------
    all_5p, all_3p : arrays (n_sequences x n_positions) of per-base PSSM scores
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    pssm_row = {"A": 0, "C": 1, "T": 2, "G": 3}

    kept_5p = [seq for seq in seq_5p if seq is not None]
    kept_3p = [seq for seq in seq_3p if seq is not None]
    # Both widths are taken up front, from the first sequence of each list
    width_5p = len(kept_5p[0])
    width_3p = len(kept_3p[0])

    def score_site(seqs, width, pssm):
        # Fill the per-sequence score table, then print its column summary
        table = np.empty([len(seqs), width])
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                table[row, col] = pssm[pssm_row[base], col]

        summary = np.empty([2, width])
        for col in range(width):
            column = table[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column) - min(column)) / 2.
        print(summary)
        return table

    all_5p = score_site(kept_5p, width_5p, PSSM_5p)
    all_3p = score_site(kept_3p, width_3p, PSSM_3p)
    return all_5p, all_3p
def position_wise_scores(seq_5p, seq_3p, organism):
    '''Look up the PSSM score of every base in the given splice site sequences.

    Parameters
    ----------
    seq_5p, seq_3p : lists of 5' and 3' splice site sequences; None entries
                     are dropped. Bases must be A, C, T or G.
    organism : str, passed to SP.find_organism_files

    For each site, the per-position median and half-range ((max - min) / 2) of
    the scores are printed as a 2 x n_positions array.

    Returns
    -------
    all_5p, all_3p : arrays (n_sequences x n_positions) of per-base PSSM scores
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    PSSM_5p, PSSM_3p = generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    pssm_row = {"A": 0, "C": 1, "T": 2, "G": 3}

    kept_5p = [seq for seq in seq_5p if seq is not None]
    kept_3p = [seq for seq in seq_3p if seq is not None]
    # Both widths are taken up front, from the first sequence of each list
    width_5p = len(kept_5p[0])
    width_3p = len(kept_3p[0])

    def score_site(seqs, width, pssm):
        # Fill the per-sequence score table, then print its column summary
        table = np.empty([len(seqs), width])
        for row, seq in enumerate(seqs):
            for col, base in enumerate(seq):
                table[row, col] = pssm[pssm_row[base], col]

        summary = np.empty([2, width])
        for col in range(width):
            column = table[0:, col]
            summary[0, col] = np.median(column)
            summary[1, col] = (max(column) - min(column)) / 2.
        print(summary)
        return table

    all_5p = score_site(kept_5p, width_5p, PSSM_5p)
    all_3p = score_site(kept_3p, width_3p, PSSM_3p)
    return all_5p, all_3p
def main():
    '''Quantitate the peaks in a dataframe for every genotype in a config file.

    Command line arguments (read from sys.argv):
        1 : config file. Each line will be : bam_file,genotype,sample
            e.g. CM763-A_sorted.bam,WT,A1
            Blank lines are ignored.
        2 : csv file holding the peak dataframe (first column is the index)
        3 : organism name, passed to SP.find_organism_files

    Writes <prefix>_quant_df.csv and <prefix>_quant_df.pickle, where <prefix>
    is the config file path up to "_config", then calls SP.SP_quant_scatters.
    Returns None early (after printing the column lists) if the csv lacks any
    of the expected columns.
    '''
    bam_dict = {}
    with open(sys.argv[1], 'r') as config:
        for line in config:
            # A blank line (e.g. a trailing newline) has no genotype/sample fields
            if not line.strip():
                continue
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()
            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = sys.argv[1].split('_config')[0]

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = ['5p score', 'exon size (us)', 'exon size (ds)', 'introns in transcript', 'type',
               'transcript size', 'intron size', 'chromosome', 'position', 'alt splicing',
               '3p score', 'transcript', 'intron position', 'strand', 'peak',
               'Base 5-0', 'Base 5-1', 'Base 5-2', 'Base 5-3',
               'Base 5-4', 'Base 5-5', 'Base 5-6', 'Base 5-7',
               'Base 3-0', 'Base 3-1', 'Base 3-2', 'Base 3-3',
               'Base 3-4', 'Base 3-5', 'Base 3-6', 'Base 3-7',
               'branch score', 'branch to 3p distance', 'percent pPy',
               'branch-0', 'branch-1', 'branch-2', 'branch-3', 'branch-4']

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    final_df = copy.deepcopy(quant_df)
    final_df.columns = pd.MultiIndex.from_product([['Peaks'], final_df.columns])

    # Defined before the loop so the plotting call below still works when the
    # config lists no samples; otherwise it keeps the value of the last genotype.
    W = False
    for genotype, samples in bam_dict.items():
        # Determine if whole cell extract samples are present
        # (more than one sample name containing "W")
        Ws = [x for x in samples.keys() if "W" in x]
        W = len(Ws) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        new_columns = [x for x in new_df.columns if x not in columns]
        new_df = new_df[new_columns]
        new_df.columns = pd.MultiIndex.from_product([[genotype], new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix + '_quant_df.csv')
    final_df.to_pickle(prefix + '_quant_df.pickle')

    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
def main():
    '''Quantitate the peaks in a dataframe for every genotype in a config file.

    Command line arguments (read from sys.argv):
        1 : config file. Each line will be : bam_file,genotype,sample
            e.g. CM763-A_sorted.bam,WT,A1
            Blank lines are ignored.
        2 : csv file holding the peak dataframe (first column is the index)
        3 : organism name, passed to SP.find_organism_files

    Writes <prefix>_quant_df.csv and <prefix>_quant_df.pickle, where <prefix>
    is the config file path up to "_config", then calls SP.SP_quant_scatters.
    Returns None early (after printing the column lists) if the csv lacks any
    of the expected columns.
    '''
    bam_dict = {}
    with open(sys.argv[1], 'r') as config:
        for line in config:
            # A blank line (e.g. a trailing newline) has no genotype/sample fields
            if not line.strip():
                continue
            info = line.split(',')
            genotype = info[1]
            sample = info[2].strip()
            bam_dict.setdefault(genotype, {})[sample] = info[0]

    prefix = sys.argv[1].split('_config')[0]

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    columns = [
        '5p score', 'exon size (us)', 'exon size (ds)', 'introns in transcript', 'type',
        'transcript size', 'intron size', 'chromosome', 'position', 'alt splicing',
        '3p score', 'transcript', 'intron position', 'strand', 'peak',
        'Base 5-0', 'Base 5-1', 'Base 5-2', 'Base 5-3',
        'Base 5-4', 'Base 5-5', 'Base 5-6', 'Base 5-7',
        'Base 3-0', 'Base 3-1', 'Base 3-2', 'Base 3-3',
        'Base 3-4', 'Base 3-5', 'Base 3-6', 'Base 3-7',
        'branch score', 'branch to 3p distance', 'percent pPy',
        'branch-0', 'branch-1', 'branch-2', 'branch-3', 'branch-4'
    ]

    quant_df = pd.read_csv(sys.argv[2], index_col=0)
    try:
        quant_df = quant_df[columns]
    except KeyError:
        print("Columns missing from dataframe...")
        print(columns)
        print(quant_df.columns)
        return None

    final_df = copy.deepcopy(quant_df)
    final_df.columns = pd.MultiIndex.from_product([['Peaks'], final_df.columns])

    # Defined before the loop so the plotting call below still works when the
    # config lists no samples; otherwise it keeps the value of the last genotype.
    W = False
    for genotype, samples in bam_dict.items():
        # Determine if whole cell extract samples are present
        # (more than one sample name containing "W")
        Ws = [x for x in samples.keys() if "W" in x]
        W = len(Ws) > 1

        # Quantitate all samples with genotype
        new_df = quantitate_junction_df(samples, quant_df, gff3, W=W)

        # Remove original columns and rename new ones with multiindex
        new_columns = [x for x in new_df.columns if x not in columns]
        new_df = new_df[new_columns]
        new_df.columns = pd.MultiIndex.from_product([[genotype], new_df.columns])
        final_df = final_df.join(new_df, how='inner')

    final_df.to_csv(prefix + '_quant_df.csv')
    final_df.to_pickle(prefix + '_quant_df.pickle')

    SP.SP_quant_scatters(final_df.dropna(how='any'), bam_dict, W=W)
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism

    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Config lines are classified in this order: "junctions.bed" (junction bed
    file), "branch" (branch bam file), untagged_sample_name (untagged peaks),
    "changepoint"/"peak" (changepoint output), "quant" (bam file for
    quantitation, written as file,quant,sample).

    Intermediate results are cached next to the config file and reused on the
    next run. Returns None early, after printing an error, when the config has
    no untagged line, fewer than two changepoint/peak lines, or a number of
    branch bam files other than 0 or 2.
    '''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Take the directory as everything before the file name. Splitting the
    # path on the prefix instead picks the wrong directory whenever the prefix
    # also occurs in a directory name (e.g. "run1/run1_config.txt").
    config_name = sys.argv[1].split('/')[-1]
    name = config_name.split('_config')[0]
    base_dir = sys.argv[1][:len(sys.argv[1]) - len(config_name)]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    print("\nJunction bed files")
    print(junc_beds)

    print("\nBranch bam files")
    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Previously fell through and failed later with a NameError
        print("\n Error: expected 0 or 2 branch bam files, found " + str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def cached(suffix):
        # True if an earlier run already wrote <base_dir><name><suffix>
        return os.path.exists(base_dir + name + suffix)

    #### Generate peak df
    peak_df = None
    if not cached('_peaks_w_branch.csv') or not cached('_peaks_w_junc.csv'):
        if not cached('_all_peaks.pickle'):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not cached('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism,
                                              base_dir, name)
    else:
        peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): None is passed on when there are no branch bam files;
    # confirm SP.make_quant_df and SP.find_score_branches_ppy accept it.
    peaks_w_branch = None
    if use_branches is True:
        if not cached('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism,
                                                  base_dir, name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not cached('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict)

        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
def igv_plots_general(bam_list, gene_list, organism, colors=None, names=None, save_dir=None,
                      unstranded=False, end_only=False, same_yaxis=False, specific_range=None,
                      transcript_direction=True, log_scale=False, rpm=True, PE=False,
                      plot_junctions=False):
    '''Plot read coverage tracks (one panel per bam file) above a gene diagram.

    One figure is created and shown for every entry of gene_list.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if a dict is passed, it is used as the transcript dictionary and its keys are plotted
            if dataframe passed instead of list, will plot introns (must have intron information
            in dataframe: 'strand', 'chromosome', 'position', 'intron size' and 'transcript';
            the plotted window is the intron plus 100 bases on each side)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation
            for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            NOTE(review): not used anywhere in this function.
    save_dir : str, default `None`
            directory to save eps files. If None, does not save files. The file name is built
            as save_dir + transcript + '.eps', so save_dir needs its trailing separator.
    unstranded : bool, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list
            of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to
            total number of aligned reads
    specific_range : sequence of two items, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
            The plotted window is the anchor position +/- window.
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of the DNA
    log_scale : bool, default `False`
            If True, the normalized read series is log2 transformed before plotting
    rpm : bool, default `True`
            If True, reads are divided by the number of mapped reads (samtools view -F 0x04 -c)
            in millions; otherwise they are divided by 1
    PE : bool, default `False`
            If True (and end_only is False), SP.generate_read_series_PE is used to build
            the read series instead of SP.generate_read_series_B
    plot_junctions : bool, default `False`
            If True, adds one extra panel per bam file showing the output of get_junctions
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)

    # Lookup used to swap chromosome naming style (roman numeral <-> chrN) when a bam
    # file does not know the chromosome, and to flip the strand for unstranded data.
    # NOTE(review): 'chr1' and 'chr2' appear twice (same values), and only I-III have
    # a roman -> chrN entry.
    fix_info = {
        'I': 'chr1', 'II': 'chr2', 'III': 'chr3', 'chr1': 'I', 'chr2': 'II', 'chr4': 'IV',
        'chr5': 'V', 'chr6': 'VI', 'chr7': 'VII', 'chr8': 'VIII', 'chr9': 'IX', 'chr10': 'X',
        'chr11': 'XI', 'chr12': 'XII', 'chr13': 'XIII', 'chr14': 'XIV', 'chr15': 'XV',
        'chr16': 'XVI', '-': '+', '+': '-', 'chr1': 'I', 'chr2': 'II', 'chr3': 'III'
    }

    # Suffix appended to a gene name to get the transcript name used in tx_dict
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    # NOTE(review): this iterates gene_list before the dict/dataframe handling below,
    # so it only suits a plain list of gene names.
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene + tx_suffix]
            if specific_range[0] == 'end':
                # Transcript end: info[1] on the plus strand, info[0] on the minus strand
                if info[2] == '+':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            elif specific_range[0] == 'start':
                if info[2] == '-':
                    start = info[1] - window
                    end = info[1] + window
                else:
                    start = info[0] - window
                    end = info[0] + window
            else:
                # Explicit coordinate
                start = int(specific_range[0]) - window
                end = int(specific_range[0]) + window
            new_tx_dict[gene + tx_suffix] = [start, end, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    # Open bam files and count reads if rpm is True
    # NOTE(review): the opened bam files are never closed in this function.
    open_bams = {}
    total_list = []
    for bam in bam_list:
        open_bams[bam] = pysam.Samfile(bam)
        if rpm is True:
            total = check_output(['samtools', 'view', '-F 0x04', '-c', bam]).strip()
            total = float(total) / 1000000.
            total_list.append(total)
        else:
            total_list.append(1.)

    # Expand optional arguments to lists if necessary
    colors = list_from_arg(colors, len(bam_list))
    end_only = list_from_arg(end_only, len(bam_list))
    log_scale = list_from_arg(log_scale, len(bam_list))
    unstranded = list_from_arg(unstranded, len(bam_list))

    # Get gene_list from dataframe if gene_list is not a list
    df = None
    if type(gene_list) == dict:
        new_tx_dict = gene_list
        gene_list = gene_list.keys()
    elif type(gene_list) != list:
        df = gene_list
        gene_list = df.index

    for tx in gene_list:
        # One panel per bam file, plus one for the gene diagram
        # (and one more per bam file for junctions)
        num_ax = len(bam_list) + 1
        if plot_junctions is True:
            num_ax += len(bam_list)
        fig, ax = plt.subplots(num_ax, figsize=(10, num_ax), sharex=True)
        fig.subplots_adjust(hspace=0)

        # Get transcript info from transcript_dictionary
        if df is None:
            try:
                info = new_tx_dict[tx + tx_suffix]
            except KeyError:
                info = new_tx_dict[tx]
            chrom = info[3]
            start = info[0]
            end = info[1]
            strand = info[2]

        # If dataframe was passed, get plotting information from dataframe instead
        else:
            # Keep only the 'Peaks' columns of a multiindexed dataframe
            if isinstance(df.columns, pd.core.index.MultiIndex):
                new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                df = df[[x for x in df.columns if x[0] == 'Peaks']]
                df.columns = new_columns
            strand = df.loc[tx, 'strand']
            chrom = df.loc[tx, 'chromosome']
            if strand == '+':
                start = df.loc[tx, 'position'] - 100
                end = df.loc[tx, 'position'] + df.loc[tx, 'intron size'] + 100
            elif strand == '-':
                start = df.loc[tx, 'position'] - df.loc[tx, 'intron size'] - 100
                end = df.loc[tx, 'position'] + 100
            start = int(start)
            end = int(end)
            # From here on tx is the transcript name, not the dataframe index
            tx = df.loc[tx, 'transcript']

        # Generate read series for each transcript
        max_y = 0
        junc_ymax = 0
        for n, bam in enumerate(bam_list):
            try:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            except ValueError:
                # Retry with the other chromosome naming style
                chrom = fix_info[chrom]
                bam_iter = open_bams[bam].fetch(chrom, start, end)
            if end_only[n] is not False:
                s = SP.generate_read_series_A(bam_iter, chrom, start, end, strand)
                linewidth = 2
            else:
                if PE is False:
                    s = SP.generate_read_series_B(bam_iter, chrom, start, end, strand)
                else:
                    s = SP.generate_read_series_PE(bam_iter, chrom, start, end, strand)
                linewidth = 1

            # Get reads from otherstrand if the library type is unstranded
            if unstranded[n] is True:
                bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s2 = SP.generate_read_series_A(bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 2
                else:
                    if PE is False:
                        s2 = SP.generate_read_series_B(bam_iter, chrom, start, end,
                                                       fix_info[strand])
                    else:
                        s2 = SP.generate_read_series_PE(
                            bam_iter, chrom, start, end, fix_info[strand])
                    linewidth = 1
                s = s.add(s2)

            # Normalize to rpm (will just divide by 1 if rpm is False)
            s = s.divide(total_list[n])
            if log_scale[n] is True:
                s = s.apply(np.log2)

            # Plot!
            ax[n].bar(s.index, s, linewidth=linewidth, color=colors[n], edgecolor=colors[n],
                      zorder=2)
            ax[n].tick_params(axis='both', which='major', labelsize=14)

            max_y = max([max_y, max(s)])

            if plot_junctions is True:
                # Junction panel for this bam file sits below all read panels
                m = n + len(bam_list)
                intron_dict = get_junctions(open_bams[bam], chrom, start, end, strand)
                ax[m].plot((start, end), (0, 0), '-', c='k')
                for coords, heights in intron_dict.iteritems():
                    ax[m].plot(coords, heights, '-', linewidth=2, color=colors[n])
                    ax[m].fill_between(coords, 0, heights, facecolor=colors[n], interpolate=True,
                                       alpha=0.5)
                if same_yaxis is True:
                    junc_ymax = max(
                        [junc_ymax, max(zip(*intron_dict.values())[1])])

        # Add diagram of gene below traces
        if tx in tx_dict:
            strand = gene_patches(tx, tx_dict, ax[-1])
            ax[-1].set_xlim(start, end)
        else:
            try:
                # Use the first word of the name and drop a two character isoform suffix
                new_tx = tx.split(' ')[0]
                if new_tx[-2] == 'T' or new_tx[-2] == '.':
                    new_tx = new_tx[:-2]
                strand = gene_patches(new_tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            except KeyError:
                print "Transcript unknown"

        # Flip minus strand transcripts if indicated
        if transcript_direction is True:
            if strand == '-':
                ax[-1].invert_xaxis()

        # Set x and y limits
        for n in range(len(bam_list)):
            ax[n].set_xlim(start, end)
            if same_yaxis is True:
                ax[n].set_ylim(0, max_y + 0.1 * max_y)
                if plot_junctions is True:
                    ax[n + len(bam_list)].set_ylim(0, junc_ymax + 0.1 * junc_ymax)
            # NOTE(review): unlike the gene diagram above, this flip does not check
            # transcript_direction - confirm this is intended.
            if strand == '-':
                ax[n].invert_xaxis()

        ax[0].set_ylabel('RPM', fontsize=16)
        ax[0].set_title(tx, fontsize=16)
        #ax[0].get_xaxis().set_ticks([])
        plt.show()

        # Save if indicated
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            fig.savefig(save_dir + tx + '.eps', format='eps')

        plt.clf()
def main():
    '''Usage: run SP_pipeline.py config_file untagged_sample_name organism

    config file : file that lists all branch, junction and peak files
    untagged_sample_name : prefix for untagged sample
    organism : pombe, crypto or cerevisiae

    Config lines are classified in this order: "junctions.bed" (junction bed
    file), "branch" (branch bam file), untagged_sample_name (untagged peaks),
    "changepoint"/"peak" (changepoint output), "quant" (bam file for
    quantitation, written as file,quant,sample).

    Intermediate results are cached next to the config file and reused on the
    next run. Returns None early, after printing an error, when the config has
    no untagged line, fewer than two changepoint/peak lines, or a number of
    branch bam files other than 0 or 2.
    '''
    junc_beds = []
    branch_bams = []
    CP_out = []
    CP_untagged = None
    quant_bams = {}

    # Read configuration file
    with open(sys.argv[1], 'r') as config:
        for line in config:
            lowered = line.lower()
            if 'junctions.bed' in lowered:
                junc_beds.append(line.strip())
            elif 'branch' in lowered:
                branch_bams.append(line.strip())
            elif sys.argv[2] in line:
                CP_untagged = line.strip()
            elif 'changepoint' in lowered or 'peak' in lowered:
                CP_out.append(line.strip())
            # bam files for quantitation should be file,quant,A1
            elif 'quant' in line:
                fields = line.split(',')
                quant_bams[fields[-1].strip()] = fields[0]

    # Take the directory as everything before the file name. Splitting the
    # path on the prefix instead picks the wrong directory whenever the prefix
    # also occurs in a directory name (e.g. "run1/run1_config.txt").
    config_name = sys.argv[1].split('/')[-1]
    name = config_name.split('_config')[0]
    base_dir = sys.argv[1][:len(sys.argv[1]) - len(config_name)]
    if base_dir == '':
        base_dir = './'
    print("Output file location and prefix: " + base_dir + name)

    print("\nJunction bed files")
    print(junc_beds)

    print("\nBranch bam files")
    if len(branch_bams) == 2:
        print(branch_bams)
        use_branches = True
    elif len(branch_bams) == 0:
        print("No data for branches, continuing with only junctions")
        use_branches = False
    else:
        # Previously fell through and failed later with a NameError
        print("\n Error: expected 0 or 2 branch bam files, found " + str(len(branch_bams)))
        return None

    print("\nUntagged peaks")
    print(CP_untagged)
    print("\nChangepoint peaks")
    print(CP_out)
    print('')

    if CP_untagged is None:
        print("\n Error: no untagged file indicated")
        return None
    if len(CP_out) < 2:
        print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
        return None

    organism = sys.argv[3]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    def cached(suffix):
        # True if an earlier run already wrote <base_dir><name><suffix>
        return os.path.exists(base_dir + name + suffix)

    #### Generate peak df
    peak_df = None
    if not cached('_peaks_w_branch.csv') or not cached('_peaks_w_junc.csv'):
        if not cached('_all_peaks.pickle'):
            peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                              name=name + '_CP_peaks')
            peak_df.to_pickle(base_dir + name + '_all_peaks.pickle')
        else:
            peak_df = pd.read_pickle(base_dir + name + '_all_peaks.pickle')

    #### Junction to peak comparison
    if not cached('_peaks_w_junc.csv'):
        print("Generating peaks vs. junctions dataframe...")
        peaks_w_junc = peak_junction_analysis(peak_df, junc_beds, gff3, fa_dict, organism,
                                              base_dir, name)
    else:
        peaks_w_junc = pd.read_pickle(base_dir + name + '_peaks_w_junc.pickle')
        print("Peaks vs. junction dataframe already exists")

    #### Branch to peak comparison
    # NOTE(review): None is passed on when there are no branch bam files;
    # confirm SP.make_quant_df and SP.find_score_branches_ppy accept it.
    peaks_w_branch = None
    if use_branches is True:
        if not cached('_peaks_w_branch.csv'):
            print("Generating peaks vs. branches dataframe...")
            peaks_w_branch = peak_branch_analysis(peak_df, branch_bams, gff3, fa_dict, organism,
                                                  base_dir, name)
        else:
            peaks_w_branch = pd.read_pickle(base_dir + name + '_peaks_w_branch.pickle')
            print("Peaks vs. branches dataframe already exists")

    #### Clean up dataframe for quantitation
    if not cached('_quantitation.csv'):
        quant_df, lariat_df = SP.make_quant_df(peaks_w_junc, peaks_w_branch, gff3, fa_dict,
                                               organism=organism)
        quant_df = SP.find_score_branches_ppy(quant_df, peaks_w_branch, fa_dict)

        print("Counting reads in transcripts and at peaks...")
        quant_df = SP.quantitate_junction_df(quant_bams, quant_df, gff3, organism=organism)

        quant_df.to_pickle(base_dir + name + '_quantitation.pickle')
        quant_df.to_csv(base_dir + name + '_quantitation.csv')
        lariat_df.to_pickle(base_dir + name + '_lariats.pickle')
        lariat_df.to_csv(base_dir + name + '_lariats.csv')
    else:
        quant_df = pd.read_pickle(base_dir + name + '_quantitation.pickle')

    SP.SP_pipeline_scatters(quant_df, base_dir, name)

    print("\n****Finished****")
def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position in sample against population

    Parameters
    ----------
    seq5_list : list
        5' splice site sequences of the sample
    seq3_list : list
        3' splice site sequences of the sample
    organism : str
        Organism name, passed to SP.find_organism_files
    title : str, default 'Intron position strength'
        Title of the upper (5' splice site) panel

    Returns
    -------
    fig : matplotlib figure with two bar plots of -log10(p-value) per position
          (5' splice site on top, 3' splice site below)
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    def neg_log10_p(samp, pop, skip):
        # One score per column: -log10 of the chi2 contingency p-value of
        # sample vs. population counts. Columns in `skip` get a fixed score of 1.
        # NOTE(review): the skipped columns are presumably the invariant
        # splice-site dinucleotide - confirm.
        scores = []
        for n in range(samp.shape[1]):
            if n in skip:
                scores.append(1)
            else:
                conting = np.array([samp[:, n], pop[:, n]])
                chi2, p, dof, expected = stats.chi2_contingency(conting)
                scores.append(np.log10(p) * -1)
        return scores

    p5 = neg_log10_p(samp_5p, pop_5p, (2, 3))
    p3 = neg_log10_p(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4, 4))
    width = 0.7
    max_y = max(p5 + p3) + 0.1 * max(p5 + p3)

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    # Dashed line at -log10(p) = 2, i.e. p = 0.01
    ax[0].plot([0, len(p5)], [2, 2], '--', color='0.7')
    ax[0].set_xlim([0, len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0, max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0, len(p3)], [2, 2], '--', color='0.7')
    ax[1].set_xlim([0, len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0, max_y])

    # Each panel gets ticks from its own positions (the 5' panel previously
    # used the 3' indices) and one label per tick; for the usual 8 positions
    # the labels are -2..5 and -5..2 as before.
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    ax[0].set_xticklabels(np.arange(-2, len(p5) - 2))
    ax[1].set_xticklabels(np.arange(-5, len(p3) - 5))

    fig.tight_layout()
    plt.show()
    return fig
def main():
    '''Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.
    Ann_branches_sorted.bam is always written; Peak_branches_sorted.bam is
    written as well when config_file and untagged are given. Intermediate
    files (unmapped_all.fq, *.sam, unsorted *.bam) are written to the current
    directory.
    '''
    import shutil  # only needed here, to concatenate the fastq files

    unmapped1 = sys.argv[1]
    unmapped2 = sys.argv[2]
    threads = int(sys.argv[3])

    # Convert bam input to fastq, written next to the bam as <stem>.fq
    fastqs = []
    for unmapped in (unmapped1, unmapped2):
        if unmapped.endswith('bam'):
            # The stem is the part before ".bam" (the part after it is empty)
            fastq = unmapped.rsplit('.bam', 1)[0] + '.fq'
            # Without a shell the command must be passed as an argument list
            call(['bamToFastq', '-i', unmapped, '-fq', fastq])
            unmapped = fastq
        fastqs.append(unmapped)

    # Concatenate both read files without passing command line file names
    # through a shell
    with open('unmapped_all.fq', 'wb') as merged:
        for fastq in fastqs:
            with open(fastq, 'rb') as part:
                shutil.copyfileobj(part, merged)

    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peaks = False
    peak_df = None
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())

        if CP_untagged is None:
            print("\n Error: no untagged file indicated")
            return None
        if len(CP_out) < 2:
            print("\n Error: two changepoint/peak files are required, found " + str(len(CP_out)))
            return None

        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                          name='CP_peaks')

    def align_sort_index(prefix):
        # Align <prefix>_split.fa with Bowtie, then convert the sam output to
        # a sorted, indexed bam file (<prefix>_sorted.bam)
        print("Aligning split reads to the genome with Bowtie")
        bowtie_args = 'bowtie -p{0} -v1 -M1 --best {1} -f {2}_split.fa --sam {2}.sam'.format(
            threads, bowtie_index, prefix)
        call(bowtie_args, shell=True)

        print("Sorting and indexing bam file")
        call('samtools view -Sbo {0}.bam {0}.sam'.format(prefix), shell=True)
        call('samtools sort {0}.bam -o {0}_sorted.bam'.format(prefix), shell=True)
        call('samtools index {0}_sorted.bam'.format(prefix), shell=True)

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches', threads=threads)
    align_sort_index('Ann_branches')

    if peaks is True:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        # Only reads left unsplit by the annotated search are searched again
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs, 'Peak_branches', threads=threads)
        align_sort_index('Peak_branches')
def igv_plots_general(bam_list, gene_list, organism, colors=None, names=None, save_dir=None,
                      unstranded=False, end_only=False, same_yaxis=False, specific_range=None,
                      transcript_direction=True, log_scale=False, rpm=True, PE=False,
                      plot_junctions=False):
    '''Plot one figure per transcript: a read track per bam file above a gene diagram.

    Parameters
    ----------
    bam_list : list, bam files in order of plotting (top to bottom)
    gene_list : list of transcripts to plot (should be genes not transcript isoforms)
            if a dict is passed, it is used as the transcript dictionary
            (values indexed as [start, end, strand, chromosome]) and its keys are plotted
            if dataframe passed instead of list, will plot introns (must have intron
            information in dataframe: 'strand', 'chromosome', 'position', 'intron size'
            and 'transcript' columns, optionally under a 'Peaks' top-level column)
    organism : str, pombe or crypto
    colors : list, default `None`
            list of colors to use, same length as bam_list, check matplotlib documentation
            for valid color names
    names : list, default `None`
            list of sample names to use instead of bam file names. Same length as bam_files
            (accepted but not used by this function)
    save_dir : str, default `None`
            directory to save eps files (created if missing). If None, does not save files
    unstranded : bool or list, default `False`
            Use True for ChIP or DNA sequencing data (or unstranded RNAseq)
    end_only : bool or list, default `False`
            Whether to plot only the ends of reads. If different for each bam, make a list
            of bools same length as bam_list
    same_yaxis : bool, default `False`
            Whether all samples should be plotted on the same axis after normalizing to
            total number of aligned reads
    specific_range : tuple, default `None`
            Options: ('end', window)
                     ('start', window)
                     ([coordinate], window)
    transcript_direction : bool, default `True`
            If True, will plot in the direction of transcription, not in the direction of
            the DNA
    log_scale : bool or list, default `False`
            If True, np.log2 is applied to the read series before plotting
    rpm : bool, default `True`
            If True, divide each track by the number of mapped reads (in millions) counted
            with `samtools view -F 0x04 -c`
    PE : bool, default `False`
            If True, build full-read tracks with SP.generate_read_series_PE instead of
            SP.generate_read_series_B
    plot_junctions : bool, default `False`
            If True, add one extra axis per bam file showing the output of get_junctions

    Returns
    -------
    None. Figures are shown with plt.show() and optionally saved as <save_dir>/<transcript>.eps
    '''

    # Get all organism information (annotation etc.)
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    # Lookup used to retry a fetch under the other chromosome naming scheme
    # and to get the opposite strand for unstranded libraries.
    fix_info = {'I':'chr1','II':'chr2','III':'chr3',
                'chr1':'I','chr2':'II','chr3':'III','chr4':'IV','chr5':'V','chr6':'VI',
                'chr7':'VII','chr8':'VIII','chr9':'IX','chr10':'X','chr11':'XI','chr12':'XII',
                'chr13':'XIII','chr14':'XIV','chr15':'XV','chr16':'XVI',
                '-':'+','+':'-'}
    if organism == 'pombe':
        tx_suffix = '.1'
    else:
        tx_suffix = 'T0'

    # Set up range parameters if specific range is indicated
    if specific_range is not None:
        window = int(specific_range[1])
        new_tx_dict = {}
        for gene in gene_list:
            info = tx_dict[gene+tx_suffix]
            # info is [start, end, strand, chromosome]; the transcript "end" is
            # the larger coordinate on '+' and the smaller one on '-'.
            if specific_range[0] == 'end':
                center = info[1] if info[2] == '+' else info[0]
            elif specific_range[0] == 'start':
                center = info[1] if info[2] == '-' else info[0]
            else:
                center = int(specific_range[0])
            new_tx_dict[gene+tx_suffix] = [center-window, center+window, info[2], info[3]]
    else:
        new_tx_dict = tx_dict

    # Open bam files and count reads if rpm is True
    open_bams = {}
    total_list = []
    try:
        for bam in bam_list:
            open_bams[bam] = pysam.Samfile(bam)
            if rpm is True:
                total = check_output(['samtools','view','-F 0x04','-c',bam]).strip()
                total = float(total)/1000000.
                total_list.append(total)
            else:
                total_list.append(1.)

        # Expand optional arguments to lists if necessary
        colors = list_from_arg(colors, len(bam_list))
        end_only = list_from_arg(end_only, len(bam_list))
        log_scale = list_from_arg(log_scale, len(bam_list))
        unstranded = list_from_arg(unstranded, len(bam_list))

        # Get gene_list from dataframe if gene_list is not a list
        df = None
        if isinstance(gene_list, dict):
            new_tx_dict = gene_list
            gene_list = list(gene_list)
        elif not isinstance(gene_list, list):
            df = gene_list
            gene_list = df.index

        for tx in gene_list:
            num_ax = len(bam_list)+1
            if plot_junctions is True:
                num_ax += len(bam_list)

            fig, ax = plt.subplots(num_ax, figsize=(10,num_ax), sharex=True)
            fig.subplots_adjust(hspace=0)

            # Get transcript info from transcript_dictionary
            if df is None:
                try:
                    info = new_tx_dict[tx+tx_suffix]
                except KeyError:
                    info = new_tx_dict[tx]
                chrom = info[3]
                start = info[0]
                end = info[1]
                strand = info[2]

            # If dataframe was passed, get plotting information from dataframe instead
            else:
                # pd.MultiIndex is the public name of the class.
                if isinstance(df.columns, pd.MultiIndex):
                    new_columns = [x[1] for x in df.columns if x[0] == 'Peaks']
                    df = df[[x for x in df.columns if x[0] == 'Peaks']]
                    df.columns = new_columns
                strand = df.loc[tx,'strand']
                chrom = df.loc[tx,'chromosome']
                # Pad the intron with 100 positions on each side
                if strand == '+':
                    start = df.loc[tx,'position']-100
                    end = df.loc[tx,'position'] + df.loc[tx,'intron size']+100
                elif strand == '-':
                    start = df.loc[tx,'position']-df.loc[tx,'intron size']-100
                    end = df.loc[tx,'position']+100
                start = int(start)
                end = int(end)
                tx = df.loc[tx,'transcript']

            # Generate read series for each transcript
            max_y = 0
            junc_ymax = 0
            for n, bam in enumerate(bam_list):
                try:
                    bam_iter = open_bams[bam].fetch(chrom, start, end)
                except ValueError:
                    # Retry under the alternative chromosome name
                    chrom = fix_info[chrom]
                    bam_iter = open_bams[bam].fetch(chrom, start, end)
                if end_only[n] is not False:
                    s = SP.generate_read_series_A(bam_iter, chrom, start, end, strand)
                    linewidth = 2
                else:
                    if PE is False:
                        s = SP.generate_read_series_B(bam_iter, chrom, start, end, strand)
                    else:
                        s = SP.generate_read_series_PE(bam_iter, chrom, start, end, strand)
                    linewidth = 1

                # Get reads from otherstrand if the library type is unstranded
                if unstranded[n] is True:
                    bam_iter = open_bams[bam].fetch(chrom, start, end)
                    if end_only[n] is not False:
                        s2 = SP.generate_read_series_A(bam_iter, chrom, start, end, fix_info[strand])
                        linewidth = 2
                    else:
                        if PE is False:
                            s2 = SP.generate_read_series_B(bam_iter, chrom, start, end, fix_info[strand])
                        else:
                            s2 = SP.generate_read_series_PE(bam_iter, chrom, start, end, fix_info[strand])
                        linewidth = 1
                    s = s.add(s2)

                # Normalize to rpm (will just divide by 1 if rpm is False)
                s = s.divide(total_list[n])
                if log_scale[n] is True:
                    s = s.apply(np.log2)

                # Plot!
                ax[n].bar(s.index, s, linewidth=linewidth, color=colors[n], edgecolor=colors[n], zorder=2)
                ax[n].tick_params(axis='both', which='major', labelsize=14)

                max_y = max([max_y,max(s)])

                if plot_junctions is True:
                    # Junction axes sit below all of the read-track axes
                    m = n+len(bam_list)
                    intron_dict = get_junctions(open_bams[bam], chrom, start, end, strand)
                    ax[m].plot((start, end),(0,0),'-',c='k')
                    for coords, heights in intron_dict.items():
                        ax[m].plot(coords, heights, '-', linewidth=2, color=colors[n])
                        ax[m].fill_between(coords, 0, heights, facecolor=colors[n], interpolate=True, alpha=0.5)
                    # A region without junctions gives an empty dict; skip it
                    # rather than taking max() of nothing.
                    if same_yaxis is True and intron_dict:
                        junc_ymax = max([junc_ymax] + [heights[1] for heights in intron_dict.values()])

            # Add diagram of gene below traces
            if tx in tx_dict:
                strand = gene_patches(tx, tx_dict, ax[-1])
                ax[-1].set_xlim(start, end)
            else:
                try:
                    # Strip anything after a space and a trailing 2-character
                    # isoform suffix ('T0', '.1') to recover the gene name
                    new_tx = tx.split(' ')[0]
                    if new_tx[-2] == 'T' or new_tx[-2] == '.':
                        new_tx = new_tx[:-2]
                    strand = gene_patches(new_tx, tx_dict, ax[-1])
                    ax[-1].set_xlim(start, end)
                except KeyError:
                    print("Transcript unknown")

            # Flip minus strand transcripts if indicated
            if transcript_direction is True:
                if strand == '-':
                    ax[-1].invert_xaxis()

            # Set x and y limits
            for n in range(len(bam_list)):
                ax[n].set_xlim(start, end)
                if same_yaxis is True:
                    ax[n].set_ylim(0,max_y+0.1*max_y)
                    if plot_junctions is True:
                        ax[n+len(bam_list)].set_ylim(0,junc_ymax+0.1*junc_ymax)
                # NOTE(review): read tracks are flipped for '-' strand without
                # checking transcript_direction -- confirm whether that is intended.
                if strand == '-':
                    ax[n].invert_xaxis()

            ax[0].set_ylabel('RPM', fontsize=16)
            ax[0].set_title(tx, fontsize=16)
            #ax[0].get_xaxis().set_ticks([])
            plt.show()

            # Save if indicated
            if save_dir is not None:
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                # os.path.join works whether or not save_dir ends with a separator
                fig.savefig(os.path.join(save_dir, tx+'.eps'), format='eps')

            plt.clf()
    finally:
        # Release the bam file handles even if plotting failed part-way
        for handle in open_bams.values():
            handle.close()
def main():
    '''Command-line entry point: pool unmapped reads, pull out split reads and align them.

    Usage: run SPBranch.py unmapped1 unmapped2 threads organism [config_file] [untagged]

    Parameters
    -----------
    unmapped1 : bam or fastq file of unmapped reads from tophat or bowtie
    unmapped2 : bam or fastq file of unmapped reads from tophat or bowtie
    threads : number of processors to use
    organism : 'pombe' or 'crypto'
    config_file : if using peaks to call - list of changepoint output file names and where to find them
    untagged : untagged sample name (must be in file name)

    Both optional arguments must be given together (peak mode is enabled only
    when exactly six arguments follow the script name).

    Output
    ------
    bam files with aligned reads. Will be interpreted by SP_pipeline.
    All intermediate and final files (unmapped_all.fq, Ann_branches.*,
    Peak_branches.*) are written to the current working directory.

    Raises
    ------
    ValueError
        In peak mode, if the config file has no line containing `untagged`
        or fewer than two changepoint/peak lines.
    '''
    # Local import keeps this entry point self-contained.
    import os

    def as_fastq(path):
        # Convert a bam input to a fastq file next to it; fastq input is passed through.
        if not path.endswith('bam'):
            return path
        # 'reads.bam' -> 'reads.fq'
        fq_path = os.path.splitext(path)[0] + '.fq'
        # Argument list, because shell=False does not split a command string.
        call(['bamToFastq', '-i', path, '-fq', fq_path])
        return fq_path

    unmapped1 = as_fastq(sys.argv[1])
    unmapped2 = as_fastq(sys.argv[2])
    threads = int(sys.argv[3])

    cat_args = 'cat {0} {1} > unmapped_all.fq'.format(unmapped1, unmapped2)
    call(cat_args, shell=True)

    organism = sys.argv[4]
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)

    peaks = False
    if len(sys.argv) == 7:
        peaks = True
        CP_untagged = None
        CP_out = []
        with open(sys.argv[5], 'r') as config:
            for line in config:
                # The untagged check comes first, so an untagged line is never
                # also collected as a changepoint/peak file.
                if sys.argv[6] in line:
                    CP_untagged = line.strip()
                elif 'changepoint' in line.lower() or 'peak' in line.lower():
                    CP_out.append(line.strip())
        if CP_untagged is None or len(CP_out) < 2:
            raise ValueError('Config file must contain a line matching the untagged sample '
                             'and at least two changepoint/peak lines')
        peak_df = SP.peak_to_seq_pipeline(CP_untagged, CP_out[0], CP_out[1], gff3, fa_dict,
                                          name='CP_peaks')

    ann_seqs = collect_intron_seq(gff3, fa_dict)

    print("Finding unaligned reads with annotated 5' splice sites")
    find_split_reads('unmapped_all.fq', ann_seqs, 'Ann_branches', threads=threads)

    print("Aligning split reads to the genome with Bowtie")
    bowtie_args = 'bowtie -p{0} -v1 -M1 --best {1} -f Ann_branches_split.fa --sam Ann_branches.sam'.format(
        threads, bowtie_index)
    call(bowtie_args, shell=True)

    # sort and index
    print("Sorting and indexing bam file")
    samtools1 = 'samtools view -Sbo Ann_branches.bam Ann_branches.sam'
    call(samtools1, shell=True)

    samtools2 = 'samtools sort Ann_branches.bam -o Ann_branches_sorted.bam'
    call(samtools2, shell=True)

    samtools3 = 'samtools index Ann_branches_sorted.bam'
    call(samtools3, shell=True)

    if peaks is True:
        print("Finding unaligned reads with unpredicted splicing events")
        peak_seqs = collect_intron_seq(gff3, fa_dict, peak_df=peak_df)
        # Second pass reads Ann_branches_unsplit.fa, presumably written by the
        # first find_split_reads call above -- confirm.
        find_split_reads('Ann_branches_unsplit.fa', peak_seqs, 'Peak_branches', threads=threads)

        print("Aligning split reads to the genome with Bowtie")
        bowtie_args = 'bowtie -p{0} -v1 -M1 --best {1} -f Peak_branches_split.fa --sam Peak_branches.sam'.format(
            threads, bowtie_index)
        call(bowtie_args, shell=True)

        print("Sorting and indexing bam file")
        samtools1 = 'samtools view -Sbo Peak_branches.bam Peak_branches.sam'
        call(samtools1, shell=True)

        samtools2 = 'samtools sort Peak_branches.bam -o Peak_branches_sorted.bam'
        call(samtools2, shell=True)

        samtools3 = 'samtools index Peak_branches_sorted.bam'
        call(samtools3, shell=True)
def peak_seq_enrichment(df, organism):
    '''Plot log-odds enrichment of every dinucleotide at two fixed positions of peak sequences.

    Only rows whose 'type' is 'other' or 'intronic' are used. For each of the 16
    dinucleotides, the observed fraction of sequences carrying it at characters
    [4:6] (plotted as 'Before peak') and [6:8] (plotted as 'After peak') of the
    'sequence' column is compared with the product of the two single-nucleotide
    frequencies from SP.gc_content, using a z-score for a proportion. The
    upper-tail p-value p is plotted as ln((1 - p) / p).

    Parameters
    ----------
    df : pandas.DataFrame
            must have 'type' and 'sequence' columns
    organism : str
            passed to SP.find_organism_files

    Returns
    -------
    fig : matplotlib figure with the grouped bar chart
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    nuc_prob = SP.gc_content(fa_dict)
    # NOTE(review): indexing assumes SP.gc_content returns frequencies in the
    # order A, C, T, G -- confirm against that helper.
    p_dict = {
        'A': nuc_prob[0],
        'T': nuc_prob[2],
        'C': nuc_prob[1],
        'G': nuc_prob[3]
    }

    # Single boolean mask instead of DataFrame.append; only counts are used
    # below, so row order does not matter.
    unpeaks = df[df['type'].isin(['other', 'intronic'])]
    n_peaks = len(unpeaks)
    print("Number of unpredicted peaks:")
    print(n_peaks)

    nucs = ['G', 'A', 'C', 'T']
    # One explicit ordering drives both bar series and the tick labels, so they
    # cannot drift apart.
    dinucs = [first + second for first in nucs for second in nucs]

    # Slice once; the loop below only compares against these two columns
    before_peak = unpeaks['sequence'].str[4:6]
    after_peak = unpeaks['sequence'].str[6:8]

    def log_odds(count, expected):
        # z-score of the observed proportion against `expected`, converted to
        # the log odds of the upper-tail p-value
        phat = count / float(n_peaks)
        SE = np.sqrt(phat * (1 - phat) / n_peaks)
        Z = (phat - expected) / SE
        pvalue = stats.norm.sf(Z)
        return np.log((1 - pvalue) / pvalue)

    five_LO = []
    three_LO = []
    for dinuc in dinucs:
        p_dinuc = p_dict[dinuc[0]] * p_dict[dinuc[1]]
        five_LO.append(log_odds(int((after_peak == dinuc).sum()), p_dinuc))
        three_LO.append(log_odds(int((before_peak == dinuc).sum()), p_dinuc))

    fig, ax = plt.subplots(figsize=(12, 6))

    width = 0.35
    ind = np.arange(len(dinucs))
    ax.bar(ind, three_LO, width, color='crimson', edgecolor='crimson', label='Before peak')
    ax.bar(ind + width, five_LO, width, color='indigo', edgecolor='indigo', label='After peak')
    ax.plot([-1, 17], [0, 0], '-', color='black')
    # +/-2.94 is ln(0.95 / 0.05)
    ax.plot([-1, 17], [2.94, 2.94], '--', color='0.7', label='95% CI')
    ax.plot([-1, 17], [-2.94, -2.94], '--', color='0.7')
    ax.set_xlim([-1, 17])
    # Tick positions must be fixed before the labels are attached to them
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(dinucs, fontsize=12)
    ax.set_ylabel('Log odds dinucleotide enrichment', fontsize=14)
    ax.set_title('Unpredicted peaks', fontsize=14)
    ax.legend(fontsize=12)

    return fig