Example #1
0
def plotCisToTransHotFragments(
        dataset="../../../mouse/data/combined/mouse1_merged.frag",
        workingFile="../../../tcc/working/workingMouse.frag",
        cacheFile="../../../tcc/working/workingMouseFiltered.frag",
        genomeFolder="../../../data/mm9",
        label=None):
    """Plot, per bin of fragment counts, the mean ratio of cis to all counts.

    For every fragment the fragment sum is taken twice: once on the cached
    dataset as loaded, and once after keeping only reads with
    chrms1 == chrms2. The ratio of the two is averaged inside logarithmic
    bins of the first sum and plotted as dots. Vertical lines mark the 99,
    99.5 and 99.9 percentiles of the first sum.

    dataset      -- fragment file parsed when cacheFile does not exist yet
    workingFile  -- first argument handed to HiCdataset
    cacheFile    -- filtered dataset; created on the first call, reused later
    genomeFolder -- second argument handed to HiCdataset
    label        -- label attached to the plotted points
    """
    mirnylib.systemutils.setExceptionHook()

    cacheMissing = not os.path.exists(cacheFile)
    if cacheMissing:
        print("caching parsed data")
        frags = HiCdataset(workingFile, genomeFolder)
        frags.load(dataset)
        frags.filterRsiteStart(offset=5)
        frags.filterDuplicates()
        frags.filterLarge()
        frags.maskFilter(frags.DS)
        frags.save(cacheFile)

    # The same name is rebound on purpose, so the object used for caching
    # is released exactly when it was in the original code.
    frags = HiCdataset(workingFile, genomeFolder)
    frags.load(cacheFile)

    totalPerFragment = frags.fragmentSum()

    # NOTE(review): saveFragments()/originalFragments() presumably keep the
    # fragment set unchanged across the cis-only filter -- confirm in
    # HiCdataset.
    frags.saveFragments()
    frags.maskFilter(frags.chrms1 == frags.chrms2)
    frags.originalFragments()
    cisPerFragment = frags.fragmentSum()

    # Order both sums by increasing total count; "1. *" converts to float.
    order = numpy.argsort(totalPerFragment)
    totalSorted = 1. * totalPerFragment[order]
    cisSorted = 1. * cisPerFragment[order]
    cisShare = cisSorted / totalSorted

    percentileMarks = numpy.percentile(totalSorted, [99, 99.5, 99.9])

    # Unweighted histogram counts fragments per bin; the weighted one sums
    # the ratios per bin, so their quotient is the mean ratio per bin.
    edges = mirnylib.numutils.logbins(1, totalSorted.max(), 1.08)
    fragmentsPerBin = numpy.histogram(totalSorted, edges)[0]
    ratioSumPerBin, usedEdges = numpy.histogram(
        totalSorted, edges, weights=cisShare)
    binCenters = 0.5 * (usedEdges[:-1] + usedEdges[1:])

    plt.plot(binCenters, ratioSumPerBin / fragmentsPerBin, '.', label=label)

    for mark in percentileMarks:
        plt.vlines(mark, 0, 1)

    plt.xlabel("Counts per fragment")
    plt.ylabel("Cis-to-trans ratio")
    plt.title("Vertical lines are at 99%,99.5% and 99.9% reads per fragment")

    niceShow()
def plotFigure2c():
    """Draw a two-panel figure of normalised coverage for "GM-all.refined".

    Two heatmaps are built from the dataset: one with the last two flags
    False, and one with weights=True after the weights have been replaced
    by ones. For each heatmap three profiles are plotted, each divided by
    its own mean: the column sums of the heatmap itself, of correct(heatmap)
    and of ultracorrect(heatmap, 40). Only columns whose sum in the first
    heatmap is positive are used. The first heatmap goes to the top panel,
    the second to the bottom panel.
    """
    data = HiCdataset()
    data.load("GM-all.refined")
    plainMap = data.buildHeatmap(1, 1, 1000000, False, False)
    data.calculateWeights()
    # if you want to correct just by fragment density, not by length dependence
    data.weights = np.ones(len(data.weights), float)
    weightedMap = data.buildHeatmap(1, 1, 1000000, False, weights=True)
    weightedMap[np.isnan(weightedMap)] = 0
    covered = np.sum(plainMap, axis=0) > 0

    def coverage(heatmap):
        # column sums restricted to the covered columns
        return np.sum(heatmap, axis=0)[covered]

    # The six profiles to be plotted; everything below is plotting only.
    # The evaluation order of the original code is kept.
    plainRaw = coverage(plainMap)
    plainSingle = coverage(correct(plainMap))
    plainIterative = coverage(ultracorrect(plainMap, 40))
    weightedSingle = coverage(correct(weightedMap))
    weightedRaw = coverage(weightedMap)
    weightedIterative = coverage(ultracorrect(weightedMap, 40))

    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    dashPattern = (3, 3)
    plt.figure(figsize=(4, 4))

    # ---- top panel ----
    topAxes = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)

    topRaw, = plt.plot(
        plainRaw / plainRaw.mean(), "-", linewidth=1, color="#e5a826")
    topSingle, = plt.plot(
        plainSingle / plainSingle.mean(), "--", linewidth=1, color="#e5a826")
    topSingle.set_dashes(dashPattern)
    topIterative, = plt.plot(
        plainIterative / plainIterative.mean(), linewidth=1, color="grey")

    for fetchLabels in (topAxes.get_xticklabels, topAxes.get_yticklabels):
        for tickLabel in fetchLabels():
            tickLabel.set_fontsize(8)

    topLegend = plt.legend(
        [topRaw, topSingle, topIterative],
        ["Raw data", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    topLegend.draw_frame(False)
    removeAxes(shift=0, ax=topAxes)

    # hide the spines and draw plain black axis lines instead
    for spine in topAxes.spines.values():
        spine.set_color('none')
    topAxes.axhline(linewidth=1, color='black')
    topAxes.axvline(linewidth=1, color='black')

    # ---- bottom panel ----
    bottomAxes = plt.subplot(2, 1, 2, sharex=topAxes)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)

    bottomSingle, = plt.plot(
        weightedSingle / weightedSingle.mean(), "--",
        color="#9b3811", linewidth=1)
    bottomSingle.set_dashes(dashPattern)
    bottomRaw, = plt.plot(
        weightedRaw / weightedRaw.mean(), "-", color="#9b3811", linewidth=1)
    bottomIterative, = plt.plot(
        weightedIterative / weightedIterative.mean(),
        linewidth=1, color="grey")

    for fetchLabels in (bottomAxes.get_xticklabels,
                        bottomAxes.get_yticklabels):
        for tickLabel in fetchLabels():
            tickLabel.set_fontsize(8)

    bottomLegend = plt.legend(
        [bottomRaw, bottomSingle, bottomIterative],
        ["HindIII corrected", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    bottomLegend.draw_frame(False)
    removeAxes(shift=0, ax=bottomAxes)
    plotting.niceShow()
def plotFigure2c():
    """Plot mean-normalised coverage profiles of "GM-all.refined" in two panels.

    Upper panel: column sums of the first heatmap, of correct() applied to
    it and of ultracorrect(..., 40) applied to it. Lower panel: the same
    three profiles for the heatmap built with weights=True (after the
    weights are overwritten with ones). Columns with a zero sum in the
    first heatmap are left out, and each profile is divided by its mean.
    """

    def columnTotals(matrix, keep):
        # sum over axis 0, then keep the selected columns only
        return np.sum(matrix, axis=0)[keep]

    def overMean(profile):
        return profile / profile.mean()

    def setTickFontSize(axes, size):
        for tickLabel in axes.get_xticklabels():
            tickLabel.set_fontsize(size)
        for tickLabel in axes.get_yticklabels():
            tickLabel.set_fontsize(size)

    def framelessLegend(handles, names):
        box = plt.legend(
            handles, names, prop={"size": 6}, loc=1, handlelength=2)
        box.draw_frame(False)

    source = HiCdataset()
    source.load("GM-all.refined")
    hmPlain = source.buildHeatmap(1, 1, 1000000, False, False)
    source.calculateWeights()
    # if you want to correct just by fragment density, not by length dependence
    source.weights = np.ones(len(source.weights), float)
    hmWeighted = source.buildHeatmap(1, 1, 1000000, False, weights=True)
    hmWeighted[np.isnan(hmWeighted)] = 0
    nonEmpty = np.sum(hmPlain, axis=0) > 0

    # Six profiles, computed in the same order as before; the rest of the
    # function only draws them.
    upperRaw = columnTotals(hmPlain, nonEmpty)
    upperSingle = columnTotals(correct(hmPlain), nonEmpty)
    upperIter = columnTotals(ultracorrect(hmPlain, 40), nonEmpty)
    lowerSingle = columnTotals(correct(hmWeighted), nonEmpty)
    lowerRaw = columnTotals(hmWeighted, nonEmpty)
    lowerIter = columnTotals(ultracorrect(hmWeighted, 40), nonEmpty)

    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    dashstyle = (3, 3)
    plt.figure(figsize=(4, 4))

    upper = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)

    curveRaw = plt.plot(
        overMean(upperRaw), "-", linewidth=1, color="#e5a826")[0]
    curveSingle = plt.plot(
        overMean(upperSingle), "--", linewidth=1, color="#e5a826")[0]
    curveSingle.set_dashes(dashstyle)
    curveIter = plt.plot(overMean(upperIter), linewidth=1, color="grey")[0]

    setTickFontSize(upper, 8)
    framelessLegend(
        [curveRaw, curveSingle, curveIter],
        ["Raw data", "Single correction", "Iterative correction"])
    removeAxes(shift=0, ax=upper)

    # replace the box around the upper panel by two black axis lines
    for edge in upper.spines.values():
        edge.set_color('none')
    upper.axhline(linewidth=1, color='black')
    upper.axvline(linewidth=1, color='black')

    lower = plt.subplot(2, 1, 2, sharex=upper)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)

    curveSingleW = plt.plot(
        overMean(lowerSingle), "--", color="#9b3811", linewidth=1)[0]
    curveSingleW.set_dashes(dashstyle)
    curveRawW = plt.plot(
        overMean(lowerRaw), "-", color="#9b3811", linewidth=1)[0]
    curveIterW = plt.plot(overMean(lowerIter), linewidth=1, color="grey")[0]

    setTickFontSize(lower, 8)
    framelessLegend(
        [curveRawW, curveSingleW, curveIterW],
        ["HindIII corrected", "Single correction", "Iterative correction"])
    removeAxes(shift=0, ax=lower)
    plotting.niceShow()
Example #4
0
# Top-level script: for every entry of filelist, compute scalings with
# give_slices and plot them on a logarithmic y axis, then add a reference
# power-law curve and show the figure.
# NOTE(review): filelist, give_slices, polymerutils, logbins, niceShow and
# setExceptionHook are not defined in this fragment.
plt.yscale("log")
for filename in filelist:
    scalings = give_slices(
        base="{0}/blockDATA2.dat".format(filename),
        tosave=None,
        nproc=4,
        slices=[250],
        # NOTE(review): (200) is the plain int 200, not a one-element tuple;
        # confirm whether give_slices expects (200,) here.
        sliceParams=(200),
        multipliers=numpy.arange(0.850001, 1.0001, 0.001),
        # multipliers=[1],
        mode="chain",
        loadFunction=polymerutils.load)
    # called again on every loop iteration
    setExceptionHook()

    # element 0 of every give_slices result, stacked into one array
    values = [i[0] for i in scalings]
    values = numpy.array(values)
    # in-place division of the index-0 slice along axis 1 by 2500
    # (presumably converts the x values to the MB units of the x label --
    # TODO confirm)
    values[:, 0, :] /= 2500

    # one label per result, built from the file name and the "slice" entry
    # of element 1 of the result
    labels = [
        "{0}; time = ".format(filename) + str(i[1]["slice"]) for i in scalings
    ]
    # Python 2 only: map(None, a, b) pairs items like zip, padding the
    # shorter sequence with None.
    for scaling, label in map(None, values, labels):
        plt.plot(*scaling[0], label=label)

# Reference curve proportional to a**(-0.5), x divided by 2500 like the data.
# values and labels still hold the results of the last file at this point.
a = logbins(10, 10000, 1.2)
a = numpy.array(a)
plt.plot(a / 2500., 1e-5 * a**(-0.5), label="Proposed -0.5 scaling")
plt.xlabel("distance (MB)")
niceShow()
# ends the script here (assuming exit is the interpreter's builtin)
exit()
            # NOTE(review): this call is indented although the statement
            # before it is at top level; the enclosing loop(s) that would
            # define value, color, linestyle and num are not part of this
            # fragment. The label formats the name len, which is the builtin
            # unless an outer scope rebinds it -- verify.
            plt.plot(*value[0],
                     color=color,
                     linestyle=linestyle,
                     label="L={0}; num={1}".format(len, num))

#values1[:, 0, :] /= (1000000. / 600)
#cPickle.dump((scalings, values), open("consScaff", 'wb'))
#exit()

setExceptionHook()

# a is not used again before it is reassigned further down
a = logbins(10, 10000, 1.2)
a = numpy.array(a)

plt.xlabel("distance (MB)")
niceShow("log")

# ends the script (assuming exit is the interpreter's builtin); when run
# top to bottom, the statements below are never reached
exit()

# plot element 1 of every scaling, plus a reference curve 2 * a**(1/6)
for scaling, label in zip(values, labels):
    plt.plot(*scaling[1], label=label)
a = logbins(10, 10000, 1.2)
a = numpy.array(a)
plt.plot(a, 2 * a**(1 / 6.))
niceShow("log")
# plot element 2 of every scaling
for scaling, label in zip(values, labels):
    plt.plot(*scaling[2], label=label)
niceShow("log")

# picked out of the first scalings entry; neither name is used afterwards
# in this fragment
pc = scalings[0][0][0][1]
rg = scalings[0][0][1][1]
Example #6
0
def plotCorrelationAtDifferentBinning():
    """Plots figure with correlation at different binning.
    Note the caching and creating of binned heatmaps flags below.
    Suppplementary paper figure
    """

    sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    setExceptionHook()

    cache = False
    create = False

    if create == True:
        if cache == True:
            #-------------------standard version code-----------------
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../ErezPaperData/hg18/GM-HindIII-hg18_refined.frag")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../ErezPaperData/hg18/GM-HindIII-hg18"\
                     "_refined.frag")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")

            #----------------------cross-check code----------------
#            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                        override=False, inMemory=True)
#            FR.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR3.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR2.load("../../../ErezPaperData/hg18/G"\
#                    "M-HindIII-hg18_refined.frag")
            #-------end corss-check code ---------------------------------
            #--------Filter only trans DS reads-----------------
            FR.maskFilter(FR.DS * (FR.chrms1 != FR.chrms2))
            FR2.maskFilter(FR2.DS * (FR2.chrms1 != FR2.chrms2))
            FR3.maskFilter(FR3.DS * (FR3.chrms1 != FR3.chrms2))

            #Now create two halfs of one dataset and down-sample second dataset
            #----------------------standard version code--------
            fraction = 0.5 * len(FR.DS) / float(len(FR2.DS))

            rarray = numpy.random.random(len(FR.DS))
            mask1 = rarray < 0.5
            mask3 = rarray >= 0.5
            mask2 = numpy.random.random(len(FR2.DS)) < fraction

            #-------------------- cross-check code---------
            #fraction = 0.5 * len(FR2.DS) / float(len(FR.DS))

            #rarray = numpy.random.random(len(FR.DS))
            #mask1 =  rarray  < fraction
            #mask3 = (rarray > fraction) * (rarray < fraction * 2)
            #mask2 =  numpy.random.random(len(FR2.DS)) > 0.5
            #-----------------------------------------

            FR.maskFilter(mask1)
            FR2.maskFilter(mask2)
            FR3.maskFilter(mask3)

            FR.save("../../../tcc/working/cache1")
            FR2.save("../../../tcc/working/cache2")
            FR3.save("../../../tcc/working/cache3")
        else:
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../tcc/working/cache1")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../tcc/working/cache3")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../tcc/working/cache2")

        for size in sizes:
            FR.saveHeatmap("../../../tcc/working/HindIII_%d.hm" %
                           size, size * 1000000)
            FR2.saveHeatmap("../../../tcc/working/NcoI_%d.hm" %
                            size, size * 1000000)
            FR3.saveHeatmap("../../../tcc/working/control_%d.hm" %
                            size, size * 1000000)

    p1 = []
    p2 = []
    p3 = []
    p4 = []
    evs = []
    for size in sizes:

        BD = binnedDataAnalysis(size * 1000000, "../../../data/hg18")
        BD.simpleLoad("../../../tcc/working/HindIII_%d.hm" % size, "HindIII")
        BD.simpleLoad("../../../tcc/working/NcoI_%d.hm" % size, "NcoI")
        BD.simpleLoad("../../../tcc/working/control_%d.hm" % size, "control")
        BD.removeDiagonal()
        BD.removePoorRegions(cutoff=2)
        BD.removeCis()

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]

        mask = (numpy.sum(
            data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
        validMask = mask[:, None] * mask[None, :]
        transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
        cormask = transmask * validMask

        c1 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c4 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
        p4.append(c4)
        p1.append(c1)

        print "size\t%d\traw:" % size, c1,
        BD.removeZeros()
        BD.fakeCis()  # does iterative correction as well
        BD.restoreZeros(value=0)

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        c2 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c3 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
            print evs

        p3.append(c3)
        p2.append(c2)

        print "\tcorrected:", c2, "\tcontrol", c3

    plt.plot(sizes, p1, label="Raw data, between enzymes")
    plt.plot(sizes, p2, label="Iteratively corrected, between")
    plt.plot(sizes, p3, label="IC, within")
    plt.xlabel("Bin size, MB")
    plt.xticks(range(1, 11))
    plt.ylabel("Spearman correlation coefficient")
    plt.legend()
    niceShow()

    setExceptionHook()
    0 / 0
Example #7
0
def plotTanayGenomicFeature():
    """Scatter a genomic feature track against Eig1, coloured by Tanay domain.

    Paper supplementary figure comparing how well the feature is described
    by Eig1 versus by the Tanay domains. Prints, in this order:

    * the Spearman correlation of the feature track with Eig1,
    * the squared Pearson correlation of the feature track with Eig1,
    * the squared Pearson correlation of Eig1 with the feature replaced by
      its mean inside each Tanay domain.
    """
    # The track file and its axis label are defined together so that the
    # x axis always names the track that is actually loaded.
    featureFile = ("../../../histoneMarks/hg18/wgEncodeUwDnaseSeqRawSignal"
                   "Rep1Gm06990.bigWig")
    featureLabel = "UWashington DNAse"
    # Alternative tracks used earlier; these were loaded with
    # control="../../../histoneMarks/hg18/wgEncodeBroadChipSeqSignal"
    #         "Gm12878Control.wig":
    #   "../../../histoneMarks/hg18/wgEncodeBroadChipSeqSignal"
    #   "Gm12878H3k9ac.wig"    -> label "H3K9ac ChIP-seq"
    #   "../../../histoneMarks/hg18/wgEncodeBroadChipSeqSignal"
    #   "Gm12878H3k4me3.wig"   -> label "H3K4me3 ChIP-seq"

    Tanay = experimentalBinnedData(1000000, myGenome)
    Tanay.simpleLoad(GM1M, "GM-all")
    Tanay.loadTanayDomains()
    Tanay.loadWigFile(featureFile, label="feature")

    Tanay.removeDiagonal()
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    Tanay.fakeCis()

    Tanay.doEig()
    E1 = Tanay.EigDict["GM-all"][0]
    GC = Tanay.trackDict["GC"]

    # Flip the sign of the eigenvector if it correlates negatively with GC.
    if scipy.stats.spearmanr(E1, GC)[0] < 0:
        E1 = -E1

    TD = Tanay.trackDict["TanayDomains"]
    feature = Tanay.trackDict["feature"]

    # The prints below use a single parenthesised argument, so they produce
    # the same output under Python 2 and Python 3.
    print(scipy.stats.spearmanr(feature, E1))

    plt.scatter(feature, E1, c=TD, s=4, linewidth=0)
    cm = plt.cm.get_cmap("jet")

    print("Our r2 is %s" % ((numpy.corrcoef(feature, E1)[0, 1]) ** 2,))

    # Replace the feature by its mean inside every Tanay domain.
    tfeature = numpy.zeros_like(TD, dtype=float)
    for domain in set(TD):
        inDomain = TD == domain
        tfeature[inDomain] = numpy.mean(feature[inDomain])
    print("Tanay r2 is %s" % ((numpy.corrcoef(tfeature, E1)[0, 1]) ** 2,))

    # Legend markers take their colours from the colormap at 0.333, 0.666
    # and 0.999.
    plt.legend([matplotlib.lines.Line2D([0], [0], color=cm(i), marker="o",
                                        markersize=8, linewidth=0)
                for i in [0.333, 0.666, 0.999]],
               ["Active", "Centromere proximal", "Centromere distal"], loc=2)

    plt.ylabel("Eig1GW")
    plt.xlabel(featureLabel)
    plt.title("Color represents domain from (Yaffe 2011)")
    niceShow(subplotAdjust=(0.13, 0.11, 0.97, 0.92))