コード例 #1
0
def base_fraction_plot(
    base_count_position_list_dict,
    flank_size=10,
    normalize_to_GC_contents=1,
    overall_GC_content=0.5,
    genome_info="",
    add_markers=True,
    ytick_scale=1,
    bases_plotstyles={"A": "g^-", "T": "rv-", "C": "bs-", "G": "yo-"},
):
    """Plot the base fractions at each position around the insertion site, optionally normalized to GC content.

    Draws onto the current figure through the module-level ``mplt`` plotting interface; returns nothing.

    Base_count_position_list_dict should be the output of base_count_dict.

    Flank_size is the number of positions shown on EACH side of the insertion position
    (the insertion position is taken to be the middle of the per-base position lists).

    Normalize_to_GC_contents can be:
     - 0 - no normalization (show raw base fractions)
     - 1 - difference between real and expected base contents
            (so the difference between 0.1 and 0.3, and between 0.3 and 0.5, will be the same 0.2),
     - 2 - ratio between real and expected base contents (the 0.1 to 0.3 ratio is same as 0.3 to 0.9,
            bigger than 0.3 to 0.5)
     - 3 - ratio on a log-scale (so that ratios of 1/4, 1/2, 1, 2, 4 are all equidistant, which makes sense,
            instead of 1, 2, 3 being equidistant and 1/2, 1/3, ..., 1/10 being all squished between 0 and 1
            like on the linear scale)

    Overall_GC_content is passed to base_fractions_from_GC_content to get the expected fraction of each base.

    Genome_info is only used as text inside the y-axis label.

    If add_markers is true, each base is plotted with its full style string from bases_plotstyles;
    otherwise only the first character of the style string is used (the color, for the default styles).

    Ytick_scale is only applicable when normalize_to_GC_contents is 3:
     - if it's 1, the ticks will be ..., 2, 1, 1/2, ...
     - if it's 2, the ticks will be ..., 2, 3/2, 1, 2/3, 1/2, ...
     - if it's 3, the ticks will be ..., 2, 5/3, 4/3, 1, 3/4, 3/5, 1/2, ...

    Raises Exception if normalize_to_GC_contents is not one of 0/1/2/3, and ValueError if the data is too short
    for the requested flank_size, or if log-scale is requested while some plotted value is <= 0.
    """
    # Local import: math.ceil replaces the deprecated scipy.ceil alias without touching file-level imports.
    import math

    # Membership test rather than a range test, so that values like 1.5 are rejected instead of being
    #  silently treated as "ratio" mode.
    if normalize_to_GC_contents not in (0, 1, 2, 3):
        raise Exception("normalize_to_GC_contents must be 0/1/2/3, not %s!" % normalize_to_GC_contents)

    real_base_fraction_list_dict = base_fraction_dict_from_count_dict(base_count_position_list_dict)
    pos_after_insertion = len(real_base_fraction_list_dict["A"]) // 2
    expected_base_fractions = base_fractions_from_GC_content(overall_GC_content)
    window_size = flank_size * 2

    all_plot_data = []
    for base in NORMAL_DNA_BASES:
        raw_plot_data = real_base_fraction_list_dict[base][
            pos_after_insertion - flank_size : pos_after_insertion + flank_size
        ]
        # Explicit check instead of assert (asserts vanish under -O); a too-large flank_size would otherwise
        #  produce a short or wrapped-around slice and a silently wrong plot.
        if len(raw_plot_data) != window_size:
            raise ValueError(
                "Can't plot %s positions on each side of the insertion position for base %s - got %s data points, "
                "expected %s!" % (flank_size, base, len(raw_plot_data), window_size)
            )
        expected = expected_base_fractions[base]
        if normalize_to_GC_contents == 0:
            plot_data = raw_plot_data
        elif normalize_to_GC_contents == 1:
            plot_data = [x - expected for x in raw_plot_data]
        else:
            # modes 2 and 3 plot the same ratio data; mode 3 only changes the y axis scale below
            plot_data = [x / expected for x in raw_plot_data]
        all_plot_data.extend(plot_data)
        # bases_plotstyles is only ever read here, never modified, so the shared default dict is safe.
        if add_markers:
            mplt.plot(plot_data, bases_plotstyles[base], label=base, markeredgecolor="none")
        else:
            mplt.plot(plot_data, bases_plotstyles[base][0], label=base)
    mplt.legend(loc=2, prop=FontProperties(size="smaller"))

    ylabel = "fraction of bases in given position"
    if normalize_to_GC_contents == 0:
        ylabel = "raw " + ylabel
    elif normalize_to_GC_contents == 1:
        ylabel += ",\nas a difference from %s GC content" % genome_info
    else:
        ylabel += ",\nas a ratio to %s GC content" % genome_info

    # make y logscale if desired; in that case I have to do the min/max/ticks sort of by hand...
    if normalize_to_GC_contents == 3:
        ylabel += " (log scale)"
        lowest_value = min(all_plot_data)
        if lowest_value <= 0:
            raise ValueError("some bases have 0 fraction - can't plot log-scale!")
            # MAYBE-TODO plot it symlog if needed? But then all my work with limits/ticks needs to be redone...
        mplt.yscale("log")
        # symmetric limits (1/y_max to y_max) big enough to contain both the largest and the smallest ratio
        y_max = int(max(math.ceil(max(all_plot_data)), math.ceil(1.0 / lowest_value)))
        # if every ratio is exactly 1, y_max would be 1 and the axis range would be empty - widen it
        y_max = max(y_max, 2)
        mplt.ylim(1.0 / y_max, y_max)
        # ticks at x/ytick_scale above 1 and ytick_scale/x below 1; explicit floats so that the values are
        #  right regardless of whether true division is enabled for the file
        half_yticks_x = list(range(ytick_scale + 1, ytick_scale * y_max + 1))
        yticks = (
            [float(ytick_scale) / x for x in half_yticks_x] + [1] + [x / float(ytick_scale) for x in half_yticks_x]
        )
        yticklabels = (
            [Fraction(ytick_scale, x) for x in half_yticks_x] + [1] + [Fraction(x, ytick_scale) for x in half_yticks_x]
        )
        mplt.yticks(yticks, yticklabels)
        mplt.minorticks_off()

    # change the xticks to use -1 before the insertion position and 1 after, no 0
    xticks = list(range(window_size))
    mplt.xlim(0, window_size - 1)
    mplt.xticks(xticks, [_relative_position_vs_cut(x, flank_size) for x in xticks])
    mplt.xlabel("relative genome position (dotted line is the insertion position)")
    mplt.ylabel(ylabel, ha="right")
    # put a dashed line at the insertion position; save and restore the y limits so that drawing the line
    #  doesn't change them
    ylim = mplt.ylim()
    mplt.vlines(flank_size - 0.5, *ylim, linestyles="dashed")
    mplt.ylim(*ylim)
コード例 #2
0
def base_fraction_stats(
    base_count_position_list_dict,
    overall_GC_content=0.5,
    print_single_pvalues=False,
    print_summary=True,
    pvalue_cutoffs=[0.05, 1e-10, 1e-99],
    cutoff_marks=["*", "**", "***"],
):
    """Given the base counts at each position, give p-values for whether they're different from the overall GC content.

    Base_count_position_list_dict should be the output of base_count_dict.

    Statistical method: according to the Handbook of Biological Statistics, what we want is a goodness-of-fit test
      of the results vs the expected distribution (i.e. the GC content) - exact test, G-test, or Chi-square test.
     Scipy has the chi-square test, so we're using that.  (MAYBE-TODO could also get more GoF tests from statsmodels -
      http://statsmodels.sourceforge.net/stable/stats.html#goodness-of-fit-tests-and-measures.)

    Optionally print details and/or summary, based on the pvalue cutoffs given:
     - pvalue_cutoffs must be sorted largest-first (any sequence - list or tuple - is accepted);
     - cutoff_marks are the marks printed for the corresponding cutoffs in the per-position details.

    Return a (raw_pvalues, FDR_adjusted_pvalues) tuple, each with one value per position.

    Raises ValueError if pvalue_cutoffs isn't sorted largest-first, if base_count_position_list_dict is empty,
    or if the count lists for different bases have different lengths.
    """
    # Compare as a list, so that a correctly sorted tuple isn't rejected just because tuple != list.
    pvalue_cutoffs = list(pvalue_cutoffs)
    if pvalue_cutoffs != sorted(pvalue_cutoffs, reverse=True):
        raise ValueError("pvalue_cutoffs must be sorted, largest first!")

    lengths = set(len(counts) for counts in base_count_position_list_dict.values())
    if not lengths:
        raise ValueError("base_count_position_list_dict is empty - no base counts to test!")
    if len(lengths) > 1:
        raise ValueError("Different bases have different count list lengths! %s" % lengths)
    length = lengths.pop()

    expected_base_fractions = base_fractions_from_GC_content(overall_GC_content)
    # same for every position, so computed once outside the loop
    expected_base_fractions_list = [expected_base_fractions[base] for base in NORMAL_DNA_BASES]

    raw_position_pvalues = []
    base_fractions_by_pos = []
    # zip(*lists) regroups the per-base count lists into one (A,T,C,G-ordered) count tuple per position
    for base_counts in zip(*[base_count_position_list_dict[base] for base in NORMAL_DNA_BASES]):
        base_total = sum(base_counts)
        # float() so that the fractions are right even if true division isn't enabled for the file
        base_fractions = [float(count) / base_total for count in base_counts]
        base_fractions_by_pos.append(dict(zip(NORMAL_DNA_BASES, base_fractions)))
        pvalue = statistics_utilities.chisquare_goodness_of_fit(base_counts, expected_base_fractions_list)
        raw_position_pvalues.append(pvalue)
    # adjust p-values for multiple testing - although it's not clear this is really needed,
    #  since we EXPECT the significant parts to be right around the cut site, we're only checking a longer region just in case,
    #  and how long a region we're checking is pretty arbitrary...
    FDRadj_position_pvalues = statistics_utilities.FDR_adjust_pvalues(raw_position_pvalues, method="BH")

    if print_single_pvalues or print_summary:

        def base_fractions_string(base_fraction_list_dict):
            return ", ".join(["%.2f %s" % (base_fraction_list_dict[base], base) for base in NORMAL_DNA_BASES])

        print("expected base fractions: %s" % base_fractions_string(expected_base_fractions))

    if print_single_pvalues:
        # integer cut position (middle of the region), matching how the plotting code calls
        #  _relative_position_vs_cut; plain "/" would give a float under true division
        cut_position = length // 2
        # smallest cutoff first, so that only the LOWEST cutoff matched by the pvalue is reported;
        #  list() because reversed() can't take a zip iterator
        marks_smallest_cutoff_first = list(reversed(list(zip(pvalue_cutoffs, cutoff_marks))))
        print("single positions with raw p-value <= %s:" % max(pvalue_cutoffs))
        for position, (pvalue, adj_pvalue, base_fractions) in enumerate(
            zip(raw_position_pvalues, FDRadj_position_pvalues, base_fractions_by_pos)
        ):
            for cutoff, mark in marks_smallest_cutoff_first:
                if pvalue <= cutoff:
                    print(
                        " %s pvalue %.2g (FDR-adjusted %.2g) for base %s (base fractions %s)"
                        % (
                            mark,
                            pvalue,
                            adj_pvalue,
                            _relative_position_vs_cut(position, cut_position),
                            base_fractions_string(base_fractions),
                        )
                    )
                    break

    if print_summary:
        # grab the counts of raw and adjusted p-values over cutoffs (CUMULATIVE - a pvalue of 0 is counted for all cutoffs)
        raw_pvalue_cutoff_counts = dict((cutoff, 0) for cutoff in pvalue_cutoffs)
        adj_pvalue_cutoff_counts = dict((cutoff, 0) for cutoff in pvalue_cutoffs)
        for pvalue, adj_pvalue in zip(raw_position_pvalues, FDRadj_position_pvalues):
            # iterate over the cutoffs alone: the marks aren't needed here, and zipping with a shorter
            #  cutoff_marks would silently leave some cutoffs with a count of 0
            for cutoff in pvalue_cutoffs:
                if adj_pvalue <= cutoff:
                    adj_pvalue_cutoff_counts[cutoff] += 1
                if pvalue <= cutoff:
                    raw_pvalue_cutoff_counts[cutoff] += 1

        def pvalue_cutoff_count_list(pvalue_count_dict, cutoffs):
            return ", ".join(["%s <= %s" % (pvalue_count_dict[cutoff], cutoff) for cutoff in cutoffs])

        print(
            "out of %s positions:\n raw p-values: %s\n FDR-adjusted p-values: %s"
            % (
                length,
                pvalue_cutoff_count_list(raw_pvalue_cutoff_counts, pvalue_cutoffs),
                pvalue_cutoff_count_list(adj_pvalue_cutoff_counts, pvalue_cutoffs),
            )
        )
    return raw_position_pvalues, FDRadj_position_pvalues