Ejemplo n.º 1
0
def gcdepth(args):
    """
    %prog gcdepth sample_name tag

    Plot GC content vs depth vs genomic bins. Inputs are mosdepth output:
    - NA12878_S1.mosdepth.global.dist.txt
    - NA12878_S1.mosdepth.region.dist.txt
    - NA12878_S1.regions.bed.gz
    - NA12878_S1.regions.bed.gz.csi
    - NA12878_S1.regions.gc.bed.gz

    A sample mosdepth.sh script might look like:
    ```
    #!/bin/bash
    LD_LIBRARY_PATH=mosdepth/htslib/ mosdepth/mosdepth $1 \\
        bams/$1.bam -t 4 -c chr1 -n --by 1000

    bedtools nuc -fi GRCh38/WholeGenomeFasta/genome.fa \\
        -bed $1.regions.bed.gz \\
        | pigz -c > $1.regions.gc.bed.gz
    ```

    Only `<sample_name>.regions.gc.bed.gz` is read by this command. The
    figure is written through savefig() as `<sample_name>.gcdepth.png`.
    """
    import hashlib
    from jcvi.algorithms.formula import MAD_interval as confidence_interval
    from jcvi.graphics.base import latex, plt, savefig, set2

    p = OptionParser(gcdepth.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    sample_name, tag = args
    # The tag is used to add to title, also provide a random (hashed) color.
    # hashlib only accepts bytes, so text tags must be encoded first; byte
    # strings are passed through untouched.
    tag_bytes = tag if isinstance(tag, bytes) else tag.encode("utf-8")
    coloridx = int(hashlib.sha256(tag_bytes).hexdigest(), 16) % len(set2)
    color = set2[coloridx]

    # mosdepth outputs a table that we can use to plot relationship
    gcbedgz = sample_name + ".regions.gc.bed.gz"
    df = pd.read_csv(gcbedgz, delimiter="\t")
    # A list (not a tuple) makes this unambiguously a multi-column selection
    mf = df.loc[:, ["4_usercol", "6_pct_gc"]]
    mf.columns = ["depth", "gc"]

    # We discard any bins that are gaps, i.e. bins where both the depth and
    # the GC fraction are (close to) zero
    mf = mf[(mf["depth"] > 0.001) | (mf["gc"] > 0.001)]

    # Create GC bins: group depths by GC content rounded to a whole percent.
    # Walking the two columns in lockstep avoids building a Series per row.
    gcbins = defaultdict(list)
    for gc, depth in zip(mf["gc"], mf["depth"]):
        gcp = int(round(gc * 100))
        gcbins[gcp].append(depth)
    # Each entry is (GC fraction, (center, lower, upper)) as unpacked below
    gcd = sorted(
        (k * 0.01, confidence_interval(v)) for (k, v) in gcbins.items())
    gcd_x, gcd_y = zip(*gcd)
    m, lo, hi = zip(*gcd_y)

    # Plot the raw bins as a faint scatter, then the per-GC summary on top
    plt.plot(
        mf["gc"],
        mf["depth"],
        ".",
        color="lightslategray",
        ms=2,
        mec="lightslategray",
        alpha=0.1,
    )
    patch = plt.fill_between(
        gcd_x,
        lo,
        hi,
        facecolor=color,
        alpha=0.25,
        zorder=10,
        linewidth=0.0,
        label="Median +/- MAD band",
    )
    plt.plot(gcd_x, m, "-", color=color, lw=2, zorder=20)

    ax = plt.gca()
    ax.legend(handles=[patch], loc="best")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 100)
    ax.set_title("{} ({})".format(latex(sample_name), tag))
    ax.set_xlabel("GC content")
    ax.set_ylabel("Depth")
    savefig(sample_name + ".gcdepth.png")
Ejemplo n.º 2
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def gcdepth(args):
    """
    %prog gcdepth sample_name tag

    Plot GC content vs depth vs genomic bins. Inputs are mosdepth output:
    - NA12878_S1.mosdepth.global.dist.txt
    - NA12878_S1.mosdepth.region.dist.txt
    - NA12878_S1.regions.bed.gz
    - NA12878_S1.regions.bed.gz.csi
    - NA12878_S1.regions.gc.bed.gz

    A sample mosdepth.sh script might look like:
    ```
    #!/bin/bash
    LD_LIBRARY_PATH=mosdepth/htslib/ mosdepth/mosdepth $1 \\
        bams/$1.bam -t 4 -c chr1 -n --by 1000

    bedtools nuc -fi GRCh38/WholeGenomeFasta/genome.fa \\
        -bed $1.regions.bed.gz \\
        | pigz -c > $1.regions.gc.bed.gz
    ```

    Only `<sample_name>.regions.gc.bed.gz` is read by this command. The
    figure is written through savefig() as `<sample_name>.gcdepth.png`.
    """
    import hashlib
    from jcvi.algorithms.formula import MAD_interval as confidence_interval
    from jcvi.graphics.base import latex, plt, savefig, set2

    p = OptionParser(gcdepth.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    sample_name, tag = args
    # The tag is used to add to title, also provide a random (hashed) color.
    # hashlib only accepts bytes, so text tags must be encoded first; byte
    # strings are passed through untouched.
    tag_bytes = tag if isinstance(tag, bytes) else tag.encode("utf-8")
    coloridx = int(hashlib.sha1(tag_bytes).hexdigest(), 16) % len(set2)
    color = set2[coloridx]

    # mosdepth outputs a table that we can use to plot relationship
    gcbedgz = sample_name + ".regions.gc.bed.gz"
    df = pd.read_csv(gcbedgz, delimiter="\t")
    # A list (not a tuple) makes this unambiguously a multi-column selection
    mf = df.loc[:, ["4_usercol", "6_pct_gc"]]
    mf.columns = ["depth", "gc"]

    # We discard any bins that are gaps, i.e. bins where both the depth and
    # the GC fraction are (close to) zero
    mf = mf[(mf["depth"] > .001) | (mf["gc"] > .001)]

    # Create GC bins: group depths by GC content rounded to a whole percent.
    # Walking the two columns in lockstep avoids building a Series per row.
    gcbins = defaultdict(list)
    for gc, depth in zip(mf["gc"], mf["depth"]):
        gcp = int(round(gc * 100))
        gcbins[gcp].append(depth)
    # Each entry is (GC fraction, (center, lower, upper)) as unpacked below
    gcd = sorted((k * .01, confidence_interval(v))
                 for (k, v) in gcbins.items())
    gcd_x, gcd_y = zip(*gcd)
    m, lo, hi = zip(*gcd_y)

    # Plot the raw bins as a faint scatter, then the per-GC summary on top
    plt.plot(mf["gc"], mf["depth"], ".", color="lightslategray", ms=2,
             mec="lightslategray", alpha=.1)
    patch = plt.fill_between(gcd_x, lo, hi,
                             facecolor=color, alpha=.25, zorder=10,
                             linewidth=0.0, label="Median +/- MAD band")
    plt.plot(gcd_x, m, "-", color=color, lw=2, zorder=20)

    ax = plt.gca()
    ax.legend(handles=[patch], loc="best")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 100)
    ax.set_title("{} ({})".format(latex(sample_name), tag))
    ax.set_xlabel("GC content")
    ax.set_ylabel("Depth")
    savefig(sample_name + ".gcdepth.png")