def base_fraction_plot(
    base_count_position_list_dict,
    flank_size=10,
    normalize_to_GC_contents=1,
    overall_GC_content=0.5,
    genome_info="",
    add_markers=True,
    ytick_scale=1,
    bases_plotstyles=None,
):
    """Plot the base fractions at each position, with given flanksize, normalized to GC content or not.

    Base_count_position_list_dict should be the output of base_count_dict.

    Normalize_to_GC_contents can be:
     - 0 - no normalization (show raw base fractions)
     - 1 - difference between real and expected base contents
            (so the difference between 0.1 and 0.3, and between 0.3 and 0.5, will be the same 0.2),
     - 2 - ratio between real and expected base contents
            (the 0.1 to 0.3 ratio is same as 0.3 to 0.9, bigger than 0.3 to 0.5)
     - 3 - ratio on a log-scale (so that ratios of 1/4, 1/2, 1, 2, 4 are all equidistant, which makes sense,
            instead of 1, 2, 3 being equidistant and 1/2, 1/3, ..., 1/10 being all squished
            between 0 and 1 like on the linear scale)

    Ytick_scale is only applicable when normalize_to_GC_contents is 3:
     - if it's 1, the ticks will be ..., 2, 1, 1/2, ...
     - if it's 2, the ticks will be ..., 2, 3/2, 1, 2/3, 1/2, ...
     - if it's 3, the ticks will be ..., 2, 5/3, 4/3, 1, 3/4, 3/5, 1/2, ...

    Other arguments:
     - flank_size - number of positions plotted on each side of the insertion position
        (the middle of the per-base lists).
     - overall_GC_content - passed to base_fractions_from_GC_content to get the expected fractions.
     - genome_info - text inserted into the y-axis label for the normalized modes.
     - add_markers - if true, plot with the full style string (line and markers);
        otherwise use only the first character of the style string.
     - bases_plotstyles - base:style-string dict;
        None means {"A": "g^-", "T": "rv-", "C": "bs-", "G": "yo-"}.

    Draws on the current matplotlib figure; returns nothing.

    Raises Exception if normalize_to_GC_contents isn't one of 0/1/2/3;
    ValueError if the per-base lists don't contain flank_size positions on each side of the middle,
    or if log-scale is requested and some plotted value is <= 0.
    """
    # function-scope import: math.ceil replaces scipy.ceil, a deprecated NumPy alias in the scipy namespace
    import math

    # membership test rather than a range test, so that values like 2.5 are rejected
    # instead of being silently treated as "ratio"
    if normalize_to_GC_contents not in (0, 1, 2, 3):
        raise Exception("normalize_to_GC_contents must be 0/1/2/3, not %s!" % (normalize_to_GC_contents,))
    if bases_plotstyles is None:
        bases_plotstyles = {"A": "g^-", "T": "rv-", "C": "bs-", "G": "yo-"}

    real_base_fraction_list_dict = base_fraction_dict_from_count_dict(base_count_position_list_dict)
    pos_after_insertion = len(real_base_fraction_list_dict["A"]) // 2
    expected_base_fractions = base_fractions_from_GC_content(overall_GC_content)

    all_plot_data = []
    for base in NORMAL_DNA_BASES:
        all_fractions = real_base_fraction_list_dict[base]
        raw_plot_data = all_fractions[pos_after_insertion - flank_size : pos_after_insertion + flank_size]
        # a too-large flank_size makes the slice start negative and the result too short;
        # raise explicitly (a plain assert would be skipped under python -O)
        if len(raw_plot_data) != flank_size * 2:
            raise ValueError(
                "flank_size %s doesn't fit the data for base %s (%s positions total)!"
                % (flank_size, base, len(all_fractions))
            )
        if normalize_to_GC_contents == 0:
            plot_data = raw_plot_data
        elif normalize_to_GC_contents == 1:
            plot_data = [x - expected_base_fractions[base] for x in raw_plot_data]
        else:
            plot_data = [x / expected_base_fractions[base] for x in raw_plot_data]
        all_plot_data.extend(plot_data)
        if add_markers:
            mplt.plot(plot_data, bases_plotstyles[base], label=base, markeredgecolor="none")
        else:
            # first character of the style string only - no marker/line-style characters
            mplt.plot(plot_data, bases_plotstyles[base][0], label=base)
    mplt.legend(loc=2, prop=FontProperties(size="smaller"))

    ylabel = "fraction of bases in given position"
    if normalize_to_GC_contents == 0:
        ylabel = "raw " + ylabel
    elif normalize_to_GC_contents == 1:
        ylabel += ",\nas a difference from %s GC content" % genome_info
    else:
        ylabel += ",\nas a ratio to %s GC content" % genome_info
        if normalize_to_GC_contents == 3:
            ylabel += " (log scale)"

    # make y logscale if desired; in that case I have to do the min/max/ticks sort of by hand...
    if normalize_to_GC_contents == 3:
        if min(all_plot_data) <= 0:
            raise ValueError("some bases have 0 fraction - can't plot log-scale!")
        # MAYBE-TODO plot it symlog if needed?  But then all my work with limits/ticks needs to be redone...
        mplt.yscale("log")
        # symmetric limits around 1: the smallest integer N such that all the data fits in [1/N, N]
        y_max = int(max(math.ceil(max(all_plot_data)), math.ceil(1.0 / min(all_plot_data))))
        # explicit floats below, so the values don't depend on whether true division is enabled in this module
        mplt.ylim(1.0 / y_max, y_max)
        half_yticks_x = list(range(ytick_scale + 1, ytick_scale * y_max + 1))
        ticks_below_one = [float(ytick_scale) / x for x in half_yticks_x]
        ticks_above_one = [x / float(ytick_scale) for x in half_yticks_x]
        yticks = ticks_below_one + [1] + ticks_above_one
        # Fraction labels show up as reduced fractions (e.g. Fraction(2, 4) is 1/2)
        yticklabels = (
            [Fraction(ytick_scale, x) for x in half_yticks_x]
            + [1]
            + [Fraction(x, ytick_scale) for x in half_yticks_x]
        )
        mplt.yticks(yticks, yticklabels)
        mplt.minorticks_off()

    # change the xticks to use -1 before the insertion position and 1 after, no 0
    xticks = list(range(flank_size * 2))
    mplt.xlim(0, flank_size * 2 - 1)
    mplt.xticks(xticks, [_relative_position_vs_cut(x, flank_size) for x in xticks])
    # NOTE(review): the label says "dotted" but the line below is drawn dashed - confirm which is intended
    mplt.xlabel("relative genome position (dotted line is the insertion position)")
    mplt.ylabel(ylabel, ha="right")

    # put a dashed line at the insertion position;
    # remember the y limits first and restore them afterwards so that drawing the line can't change them
    y_low, y_high = mplt.ylim()
    mplt.vlines(flank_size - 0.5, y_low, y_high, linestyles="dashed")
    mplt.ylim(y_low, y_high)
def base_fraction_stats(
    base_count_position_list_dict,
    overall_GC_content=0.5,
    print_single_pvalues=False,
    print_summary=True,
    pvalue_cutoffs=None,
    cutoff_marks=None,
):
    """Given the base counts at each position, give p-values for whether they're different from the overall GC content.

    Base_count_position_list_dict should be the output of base_count_dict.

    Statistical method: according to the Handbook of Biological Statistics, what we want is a goodness-of-fit test
    of the results vs the expected distribution (i.e. the GC content) - exact test, G-test, or Chi-square test.
    Scipy has the chi-square test, so we're using that.
    (MAYBE-TODO could also get more GoF tests from statsmodels -
     http://statsmodels.sourceforge.net/stable/stats.html#goodness-of-fit-tests-and-measures.)

    Optionally print details and/or summary, based on the pvalue cutoffs given:
     - print_single_pvalues - print one line for each position with raw p-value <= the largest cutoff,
        marked with the cutoff_marks entry matching the LOWEST cutoff the p-value passes.
        Cutoffs and marks are paired up in order; cutoffs past the end of cutoff_marks are ignored here.
     - print_summary - print, for every cutoff, how many raw and FDR-adjusted p-values are <= that cutoff
        (CUMULATIVE - a pvalue of 0 is counted for all cutoffs).
     - pvalue_cutoffs - any sequence sorted largest first; None means [0.05, 1e-10, 1e-99].
     - cutoff_marks - one mark per cutoff; None means ["*", "**", "***"].

    Return a (raw_position_pvalues, FDRadj_position_pvalues) tuple: the list of per-position p-values,
    and the result of the BH adjustment of that list by statistics_utilities.FDR_adjust_pvalues.

    Raises ValueError if pvalue_cutoffs isn't sorted largest-first,
    or if the count lists for different bases have different lengths.
    """
    if pvalue_cutoffs is None:
        pvalue_cutoffs = [0.05, 1e-10, 1e-99]
    if cutoff_marks is None:
        cutoff_marks = ["*", "**", "***"]
    # normalize to a list first - sorted() always returns a list,
    # so comparing a (correctly sorted) tuple to it directly would always fail
    pvalue_cutoffs = list(pvalue_cutoffs)
    if pvalue_cutoffs != sorted(pvalue_cutoffs, reverse=True):
        raise ValueError("pvalue_cutoffs must be sorted, largest first!")
    lengths = set(len(counts) for counts in base_count_position_list_dict.values())
    if len(lengths) > 1:
        raise ValueError("Different bases have different count list lengths! %s" % lengths)
    length = lengths.pop()

    expected_base_fractions = base_fractions_from_GC_content(overall_GC_content)
    # same for every position, so build it once, in NORMAL_DNA_BASES order to match base_counts below
    expected_base_fractions_list = [expected_base_fractions[base] for base in NORMAL_DNA_BASES]
    raw_position_pvalues = []
    base_fractions_by_pos = []
    # zip(*lists) turns the per-base count lists into per-position tuples of counts
    for base_counts in zip(*[base_count_position_list_dict[base] for base in NORMAL_DNA_BASES]):
        base_total = sum(base_counts)
        # explicit float division, so the fractions don't depend on true division being enabled in this module
        base_fractions = [float(count) / base_total for count in base_counts]
        base_fractions_by_pos.append(dict(zip(NORMAL_DNA_BASES, base_fractions)))
        pvalue = statistics_utilities.chisquare_goodness_of_fit(base_counts, expected_base_fractions_list)
        raw_position_pvalues.append(pvalue)

    # adjust p-values for multiple testing - although it's not clear this is really needed,
    #  since we EXPECT the significant parts to be right around the cut site, we're only checking a longer region just in case,
    #  and how long a region we're checking is pretty arbitrary...
    FDRadj_position_pvalues = statistics_utilities.FDR_adjust_pvalues(raw_position_pvalues, method="BH")

    def base_fractions_string(base_fraction_dict):
        return ", ".join(["%.2f %s" % (base_fraction_dict[base], base) for base in NORMAL_DNA_BASES])

    # All printing below uses print(<single string>), which gives the same output
    # whether print is a statement (python2) or a function (python3).
    if print_single_pvalues or print_summary:
        print("expected base fractions: %s" % base_fractions_string(expected_base_fractions))

    if print_single_pvalues:
        # NOTE(review): length / 2 is a float if true division is enabled in this module -
        #  confirm that _relative_position_vs_cut is fine with that.
        cut_position = length / 2
        # materialized as a list so that it can be reversed on python3 as well
        cutoffs_with_marks = list(zip(pvalue_cutoffs, cutoff_marks))
        # print info for only the LOWEST cutoff matched by the pvalue
        print("single positions with raw p-value <= %s:" % max(pvalue_cutoffs))
        for position, (pvalue, adj_pvalue, base_fractions) in enumerate(
            zip(raw_position_pvalues, FDRadj_position_pvalues, base_fractions_by_pos)
        ):
            # smallest cutoff first, stop at the first one the p-value passes
            for cutoff, mark in reversed(cutoffs_with_marks):
                if pvalue <= cutoff:
                    print(
                        " %s pvalue %.2g (FDR-adjusted %.2g) for base %s (base fractions %s)"
                        % (
                            mark,
                            pvalue,
                            adj_pvalue,
                            _relative_position_vs_cut(position, cut_position),
                            base_fractions_string(base_fractions),
                        )
                    )
                    break

    if print_summary:
        # counts of raw and adjusted p-values over cutoffs (CUMULATIVE - a pvalue of 0 is counted for all cutoffs);
        #  done for EVERY cutoff, regardless of how many cutoff_marks there are.
        def pvalue_cutoff_count_list(pvalues):
            return ", ".join(
                ["%s <= %s" % (sum(1 for pvalue in pvalues if pvalue <= cutoff), cutoff) for cutoff in pvalue_cutoffs]
            )

        print(
            "out of %s positions:\n raw p-values: %s\n FDR-adjusted p-values: %s"
            % (
                length,
                pvalue_cutoff_count_list(raw_position_pvalues),
                pvalue_cutoff_count_list(FDRadj_position_pvalues),
            )
        )
    return raw_position_pvalues, FDRadj_position_pvalues