Ejemplo n.º 1
0
def diamondScore(dataset, size=10):
    """
    Extract a so-called "diamond score" - inspired by Suzana Hadjur talks,
    see Sevil Sofueva, EMBO 2013 - Supp Figure 11
    (but this is a bit different from Supp Figure 11!!!)

    The heatmap of ``dataset`` is loaded, gap-filled, corrected with
    ``ultracorrect`` and normalized by its mean column sum. It is then tiled
    3x3 so that windows anchored near the matrix border do not run out of
    the array. For every bin of the central tile, a ``size`` x ``size``
    window anchored at the diagonal is taken and the entries on one side of
    the window's anti-diagonal are summed.

    Parameters
    ----------
    dataset :
        Dataset identifier; passed to ``hm()`` to locate the heatmap file.
    size : int
        Side of the window, in heatmap bins.
        NOTE(review): presumably must not exceed the heatmap size, otherwise
        ``mon - size`` becomes negative and the reversed slice changes
        meaning - confirm.

    Returns
    -------
    numpy.ndarray
        One value per heatmap bin: the window sum minus the result of
        ``gaussian_filter(sums, 30)`` applied to the same values.
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]

    # Fill empty bins (zero column sum) with a copy of the preceding
    # row/column. Index -1 wraps around to the last bin.
    zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Blank the three central diagonals before truncation and correction.
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)

    # Re-fill the central diagonals with values derived from the second one.
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # 3x3 tiling; only the central tile is scanned below.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()

    nBins = len(heatmap)
    ratios = []
    for mon in range(nBins, 2 * nBins):
        # Rows go forward from `mon`, columns go backward from `mon`.
        diamond = tiledHeatmap[mon:mon + size, mon:mon - size:-1]
        side = np.arange(len(diamond))
        # Keep entries with (row + column) < window side.
        inds = (side[:, None] + side[None, :]) < len(diamond)
        ratios.append(diamond[inds].sum())

    ratios = np.array(ratios)
    return ratios - gaussian_filter(ratios, 30)
Ejemplo n.º 2
0
def plotScalingsForDifferentDatasets():
    """
    Plot P(s) scalings for every entry of ``datasets`` and pickle each one.

    For each dataset, the fragment-level data is loaded into an in-memory
    ``HiCStatistics`` object, filtered to same-chromosome/same-strand read
    pairs, corrected with fragment-based IC, and its scaling is plotted with
    ``plotScaling``. The value returned by ``plotScaling`` is pickled to
    ``scalings/<last path component of dataset>``. After all datasets are
    processed, the accumulated plot is shown.

    Returns
    -------
    None
    """
    setExceptionHook()  # for debug purposes only

    for dataset in datasets:

        # Creating one inMemory dataset to get scaling
        FR = hiclib.fragmentHiC.HiCStatistics("bla",
                                              getGenome(),
                                              inMemory=True)

        # Load data. Do not build fragments - maskFilter will do it anyway
        FR.load(frag(dataset), buildFragments=False)

        # Keep read pairs from the same strand - see (Imakaev 2012)
        FR.maskFilter((FR.chrms1 == FR.chrms2) * (FR.strands1 == FR.strands2))

        # Perform fragment-based IC
        FR.iterativeCorrectionFromMax()

        # These are coordinates of chromosomal arms
        regions = [(0, 200000, 1750000), (0, 2300000, 3800000)]

        # Main P(s) code. Do not consider direct neighbors, but keep second
        # neighbors. Caulobacter is not badly affected by inefficient
        # restriction, so we set excludeNeighbors to 1. In eukaryotes it
        # would be 2-4.
        # useWeights uses weights written by fragment-based IC
        scaling = FR.plotScaling(excludeNeighbors=1,
                                 enzyme=enzyme(dataset),
                                 normalize=True,
                                 useWeights=True,
                                 regions=regions,
                                 appendReadCount=True,
                                 mindist=5000,
                                 label=dataset,
                                 linewidth=2)

        # Saving P(s). The file is opened in a `with` block so the handle is
        # flushed and closed deterministically instead of being leaked.
        # Text mode 'w' is kept on purpose: the default (ASCII) pickle
        # protocol is used, and readers must open the file in the same mode.
        outPath = os.path.join("scalings", os.path.split(dataset)[-1])
        with open(outPath, 'w') as outFile:
            cPickle.dump(scaling, outFile)

    # Plotting the result
    plt.xlim((5000, 2000000))
    plt.legend()
    plt.show()
Ejemplo n.º 3
0
def directionalityRatio(dataset, size=20):
    """
    Compute a per-bin directionality score from the heatmap of ``dataset``.

    The heatmap is gap-filled, corrected with ``ultracorrect``, normalized by
    its mean column sum and tiled 3x3 (Caulobacter chromosome is a ring, and
    tiling is a cheap way to get wrap-around windows). For every bin ``mon``
    of the central tile two sums are taken:

    * ``upstream``   - row ``mon``, columns ``mon .. mon + size - 1``
    * ``downstream`` - column ``mon``, rows ``mon - size .. mon - 1``

    and the score is ``upstream / (upstream + downstream)``.

    Parameters
    ----------
    dataset :
        Dataset identifier; passed to ``hm()`` to locate the heatmap file.
    size : int
        Window length, in heatmap bins.

    Returns
    -------
    list
        One score per heatmap bin, in bin order.
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Filling in the gaps in the heatmap. Not really needed as heatmaps are
    # with overlaps, so they have no gaps. Index -1 wraps to the last bin.
    zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Following regular IC protocol (see 033_....py)
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # Put 9 copies of the heatmap in a huge square - Caulobacter is a ring.
    # This is a cheap-and-dirty way to account for that.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only

    nBins = len(heatmap)
    ratios = []
    for mon in range(nBins, 2 * nBins):  # going through the central square
        upstream = tiledHeatmap[mon, mon:mon + size].sum()
        downstream = tiledHeatmap[mon - size:mon, mon].sum()
        # Fraction of the windowed signal that is upstream. This is
        # upstream / (upstream + downstream), not upstream / downstream.
        ratios.append(upstream / (upstream + downstream))

    return ratios
Ejemplo n.º 4
0
def directionalityRatio(dataset, size=20):
    """
    Compute a per-bin directionality score from the heatmap of ``dataset``.

    The heatmap is gap-filled, corrected with ``ultracorrect``, normalized by
    its mean column sum and tiled 3x3 (Caulobacter chromosome is a ring, and
    tiling is a cheap way to get wrap-around windows). For every bin ``mon``
    of the central tile two sums are taken:

    * ``upstream``   - row ``mon``, columns ``mon .. mon + size - 1``
    * ``downstream`` - column ``mon``, rows ``mon - size .. mon - 1``

    and the score is ``upstream / (upstream + downstream)``.

    Parameters
    ----------
    dataset :
        Dataset identifier; passed to ``hm()`` to locate the heatmap file.
    size : int
        Window length, in heatmap bins.

    Returns
    -------
    list
        One score per heatmap bin, in bin order.
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Filling in the gaps in the heatmap. Not really needed as heatmaps are
    # with overlaps, so they have no gaps. Index -1 wraps to the last bin.
    zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Following regular IC protocol (see 033_....py)
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # Put 9 copies of the heatmap in a huge square - Caulobacter is a ring.
    # This is a cheap-and-dirty way to account for that.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only

    nBins = len(heatmap)
    ratios = []
    for mon in range(nBins, 2 * nBins):  # going through the central square
        upstream = tiledHeatmap[mon, mon:mon + size].sum()
        downstream = tiledHeatmap[mon - size:mon, mon].sum()
        # Fraction of the windowed signal that is upstream. This is
        # upstream / (upstream + downstream), not upstream / downstream.
        ratios.append(upstream / (upstream + downstream))

    return ratios
Ejemplo n.º 5
0
def plotScalingsForDifferentDatasets():
    """
    Plot P(s) scalings for every entry of ``datasets`` and pickle each one.

    For each dataset, the fragment-level data is loaded into an in-memory
    ``HiCStatistics`` object, filtered to same-chromosome/same-strand read
    pairs, corrected with fragment-based IC, and its scaling is plotted with
    ``plotScaling``. The value returned by ``plotScaling`` is pickled to
    ``scalings/<last path component of dataset>``. After all datasets are
    processed, the accumulated plot is shown.

    Returns
    -------
    None
    """
    setExceptionHook()  # for debug purposes only

    for dataset in datasets:

        # Creating one inMemory dataset to get scaling
        FR = hiclib.fragmentHiC.HiCStatistics("bla", getGenome(), inMemory=True)

        # Load data. Do not build fragments - maskFilter will do it anyway
        FR.load(frag(dataset), buildFragments=False)

        # Keep read pairs from the same strand - see (Imakaev 2012)
        FR.maskFilter((FR.chrms1 == FR.chrms2) * (FR.strands1 == FR.strands2))

        # Perform fragment-based IC
        FR.iterativeCorrectionFromMax()

        # These are coordinates of chromosomal arms
        regions = [(0, 200000, 1750000), (0, 2300000, 3800000)]

        # Main P(s) code. Do not consider direct neighbors, but keep second
        # neighbors. Caulobacter is not badly affected by inefficient
        # restriction, so we set excludeNeighbors to 1. In eukaryotes it
        # would be 2-4.
        # useWeights uses weights written by fragment-based IC
        scaling = FR.plotScaling(excludeNeighbors=1, enzyme=enzyme(dataset),
                                 normalize=True,
                                 useWeights=True,
                                 regions=regions,
                                 appendReadCount=True, mindist=5000,
                                 label=dataset, linewidth=2)

        # Saving P(s). The file is opened in a `with` block so the handle is
        # flushed and closed deterministically instead of being leaked.
        # Text mode 'w' is kept on purpose: the default (ASCII) pickle
        # protocol is used, and readers must open the file in the same mode.
        outPath = os.path.join("scalings", os.path.split(dataset)[-1])
        with open(outPath, 'w') as outFile:
            cPickle.dump(scaling, outFile)

    # Plotting the result
    plt.xlim((5000, 2000000))
    plt.legend()
    plt.show()
Ejemplo n.º 6
0
def diamondScore(dataset, size=10):
    """
    Extract a so-called "diamond score" - inspired by Suzana Hadjur talks,
    see Sevil Sofueva, EMBO 2013 - Supp Figure 11
    (but this is a bit different from Supp Figure 11!!!)

    The heatmap of ``dataset`` is loaded, gap-filled, corrected with
    ``ultracorrect`` and normalized by its mean column sum. It is then tiled
    3x3 so that windows anchored near the matrix border do not run out of
    the array. For every bin of the central tile, a ``size`` x ``size``
    window anchored at the diagonal is taken and the entries on one side of
    the window's anti-diagonal are summed.

    Parameters
    ----------
    dataset :
        Dataset identifier; passed to ``hm()`` to locate the heatmap file.
    size : int
        Side of the window, in heatmap bins.
        NOTE(review): presumably must not exceed the heatmap size, otherwise
        ``mon - size`` becomes negative and the reversed slice changes
        meaning - confirm.

    Returns
    -------
    numpy.ndarray
        One value per heatmap bin: the window sum minus the result of
        ``gaussian_filter(sums, 30)`` applied to the same values.
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]

    # Fill empty bins (zero column sum) with a copy of the preceding
    # row/column. Index -1 wraps around to the last bin.
    zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Blank the three central diagonals before truncation and correction.
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)

    # Re-fill the central diagonals with values derived from the second one.
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # 3x3 tiling; only the central tile is scanned below.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()

    nBins = len(heatmap)
    ratios = []
    for mon in range(nBins, 2 * nBins):
        # Rows go forward from `mon`, columns go backward from `mon`.
        diamond = tiledHeatmap[mon:mon + size, mon:mon - size:-1]
        side = np.arange(len(diamond))
        # Keep entries with (row + column) < window side.
        inds = (side[:, None] + side[None, :]) < len(diamond)
        ratios.append(diamond[inds].sum())

    ratios = np.array(ratios)
    return ratios - gaussian_filter(ratios, 30)
Ejemplo n.º 7
0
from mirnylib import plotting
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from mirnylib import genome
from mirnylib import h5dict
from hiclib import binnedData
import numpy as np
from mirnylib.systemutils import setExceptionHook

# Debugging aid imported from mirnylib.systemutils; called once at import time.
setExceptionHook()

######## Define file names and other parameters
# mirnylib genome parameters
genomeName = "GalGal5filtered"
# Genome object built from the filtered contig FASTA folder. Chromosome files
# are matched with the "N%s.fa" template; readChrms is left empty.
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
    readChrms=[],
    chrmFileTemplate="N%s.fa")

# Where to find the hiclib heatmap (folder and file name)
basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5.hm-res-1000kb"

# Resulting file name
out_file = "all.glm"

### Parameters required by LACHESIS to be in the header
header_string = """# GenomeLinkMatrix file - see GenomeLinkMatrix.h for documentation of this object type
# Species = chick
# N_bins = 524
Ejemplo n.º 8
0
def refine_paper(filename, create=True):
    """Run the fragment-level pipeline on test data and check its outputs.

    filename[0] is a list of filenames of incoming files.
    filename[1] is used as a path prefix for all outgoing files
    (e.g. filename[1] + "_merged.frag", filename[1] + "-1M.hm").

    When ``create`` is true, every input file is parsed and saved as
    "<input>_parsed.frag", the parsed files are merged, and the merged
    dataset is loaded and filtered. After that, metadata is compared with
    the pickled "sampleMetadata" file, and several heatmap-saving methods
    are checked against each other with assertions.

    NOTE(review): ``TR`` is only assigned inside the ``if create`` branch;
    with a falsy ``create`` the code after the branch needs a ``TR`` from an
    enclosing scope - confirm that this call pattern is ever used.

    Raises StandardError if an input path does not exist, ValueError if the
    metadata differs from "sampleMetadata", and AssertionError if a
    consistency check fails.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename,
                              enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files altogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",enzymeName = "HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        #Fresh in-memory dataset that receives the merged data and is filtered
        TR = HiCdataset("refined", genome=genomeFolder,enzymeName = "HindIII",
                        mode="w", inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize = 30000)        

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845


    #-------------------------------------------
    #Compare every key of the stored reference metadata with TR.metadata
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
            stop = True
    #All mismatches are printed first; the error is raised once afterwards
    if stop == True:
        print ("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")


    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    #Bin ranges of the chromosomes with index 1 and 2 in the all-by-all map
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    #Note: written to the same file name as the all-by-all heatmap above
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="twice")

    #The by-chromosome blocks must match the slices of the all-by-all map
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print  a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="once")
    
    TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm", resolution=5000,chromosomes = [14], countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    #Adding the diagonal of the "once" map once more must give the "twice" map
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    
    #Double the diagonal of the "once" map to compare with the high-res map,
    #which was saved with countDiagonalReads="twice"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i,i] = 2 * newchrom1[i,i]
    
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    #50 kb map coarsegrained by a factor of 20 is compared with the 1 Mb map
    assert np.sum(np.abs(coarsegrain(Tb,20,True) - newchrom1)) < 500
    

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    #Flag array indexed by chromosome ID; 1 marks the chromosomes counted in t
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    #Expected size after updateGenome: both sides on flagged chromosomes, or
    #side 1 on a flagged chromosome and chrms2 == -1
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2",
                                                           "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert  TR.N == t

    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Ejemplo n.º 9
0
Hi-C at 10kb resolution. It took 20 minutes to load the data, 4 minutes to
remove poor bins, and a couple of hours to perform IC, at about 10 minutes per
pass. I also note that the data was in fact stored in memory while doing that,
and it never used more than 16GB of RAM... in fact, creating this dataset used
more, but this will be optimized later.
"""


from mirnylib.genome import Genome
import numpy as np
import warnings
from mirnylib.h5dict import h5dict
from mirnylib.numutils import removeDiagonals
from mirnylib.systemutils import setExceptionHook

setExceptionHook()


class defaultMatrix(object):
    """
    This is a template object which stores matrix in memory.
    Alternatively, matrix can be stored in an h5dict, either normally
    or in a sparse mode.

    All the methods should be first implemented here using getData() and setData()
    Then they shold be translated to sparse subclasses of this class.
    """
    def __init__(self, data=None, dictToSave=None, key=""):
        """
        Initializes the object that stores the Hi-C matrix.
Ejemplo n.º 10
0
def displayHeatmap():
    """
    Show a simulated contact map and a Hi-C map side by side in one image.

    Simulations (``doSim``) are run through ``fmap`` and accumulate counts
    into a shared ``N`` x ``N`` array. The Hi-C map for "13 13" is read from
    an h5dict, corrected with ``completeIC`` and rescaled with ``zoomArray``.
    Both maps are clipped at a high percentile and normalized by their mean
    row sum. Entries with row > column are then taken from the Hi-C map,
    the rest from the simulation, and the log of the result is displayed.

    Relies on the module-level names ``N``, ``low`` and ``high``.

    Returns
    -------
    None
    """
    plt.figure(figsize=(5, 5))

    # Shared buffer that all simulation workers add their counts to.
    shared_arr = mp.Array(ctypes.c_double, N**2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        """Run one simulation and add its SMC pair counts to shared_arr."""
        nparr = tonumpyarray(shared_arr)
        SMCTran = initModel(i)

        SMC = []
        N1 = 10000
        # Random number of sampling rounds, between N1 // 2 and N1 - 1.
        for _ in range(np.random.randint(N1 // 2, N1)):
            SMCTran.steps(150)
            SMC.append(SMCTran.getSMCs())
        SMC = np.concatenate(SMC, axis=1)

        # Encode each (SMC[0], SMC[1]) pair as one flat index of an N x N
        # array, then count how many times each index occurs.
        SMC1D = SMC[0] * N + SMC[1]
        position, counts = np.unique(SMC1D, return_counts=True)

        # The lock serializes updates coming from different workers.
        with shared_arr.get_lock():
            nparr[position] += counts
        print("Finished!")

    setExceptionHook()

    # NOTE(review): presumably converts low/high to bins of the 10k-resolution
    # Hi-C dataset named in the path below - confirm units of low/high.
    lowBin = low // 10
    highBin = high // 10
    mydict = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')

    hicdata = mydict.get_dataset("13 13")[lowBin:highBin, lowBin:highBin]
    hicdata = completeIC(hicdata)
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    fmap(doSim, range(30),
         n=20)  # number of threads to use.  On a 20-core machine I use 20.

    arr = coarsegrain(arr, 20)
    arr = np.clip(arr, 0, np.percentile(arr, 99.9))
    arr /= np.mean(np.sum(arr, axis=1))

    # Entries with row > column come from Hi-C; the rest stay simulated.
    ran = np.arange(len(arr))
    mask = ran[:, None] > ran[None, :]
    arr[mask] = hicdata[mask]

    # Small offset avoids log(0).
    logarr = np.log(arr + 0.0001)
    plt.imshow(logarr,
               vmax=np.percentile(logarr, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
Ejemplo n.º 11
0
def showAllDatasets(showGenes=False):
    """
    Draw the smoothed heatmap of every dataset with a P(s) inset on top.

    One subplot is created per entry of ``datasets``. Each heatmap is
    gap-filled, corrected with ``ultracorrect``, smoothed with
    ``adaptiveSmoothing`` and shown with ``imshow``. A semi-transparent grey
    triangle is drawn over one side of the diagonal, and a log-log inset
    axes placed there shows the output of ``scaling()`` for the average of
    two diagonal sub-blocks of the map (labelled "intra-arm"). The scaling
    of the reference datasets ('Wildtype_0min_BglII_rep1', "ML2000_0hr") is
    remembered and re-plotted in grey in that and all later insets.

    Parameters
    ----------
    showGenes : bool, optional
        If true, draw thin horizontal/vertical lines at the positions of
        highly expressed genes. Positions assume 10kb bins and the lines
        span 0..500. Defaults to False, which matches the previous behavior
        (the overlay was disabled in code).

    Returns
    -------
    None
    """
    setExceptionHook()

    #plt.figure(figsize=(25, 15))
    fig = plt.figure()

    # Size of the figure in pixels; used below to convert display
    # coordinates into figure fractions.
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    # Get subplot configuration
    sx, sy = subplots(len(datasets))

    # Top highly expressed genes (genomic coordinates)
    geneCoor = [
        1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524,
        1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707,
        3480870, 3829656, 1424678, 901855, 1439056, 3678537
    ]
    # Here we committed to 10kb resolution - change below if you're not
    genePos = [i / 10000. for i in geneCoor] if showGenes else []

    referenceDatasets = ('Wildtype_0min_BglII_rep1', "ML2000_0hr")
    # Scaling of the most recent reference dataset; None until one is seen.
    myscaling = None

    for j, dataset in enumerate(datasets):
        plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        # Fill in gaps - obsolete, as heatmaps are with overlaps
        zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
        heatmap[zeros] = heatmap[zeros - 1]
        heatmap[:, zeros] = heatmap[:, zeros - 1]

        # Regular IC protocol
        for offset in (0, 1, -1):
            mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)

        # Putting lines at highly expressed genes (only if requested)
        for lpos in genePos:
            plt.hlines(lpos, 0, 500, linewidth=0.7, color="black",
                       alpha=0.2, zorder=1)
            plt.vlines(lpos, 0, 500, linewidth=0.7, color="black",
                       alpha=0.2, zorder=1)

        # Performing adaptive smoothing
        smoothedHeatmap = adaptiveSmoothing(heatmap, 20)
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035,
                   cmap="acidblues", zorder=0)
        plt.xticks([])
        plt.yticks([])

        plt.subplots_adjust(
            left=0.05,  # the left side of the subplots of the figure
            right=0.95,  # the right side of the subplots of the figure
            bottom=0.05,  # the bottom of the subplots of the figure
            top=0.95,  # the top of the subplots of the figure
            wspace=0.1,  # width reserved for blank space between subplots
            hspace=0.2)

        # Code below just puts the P(s) over the heatmap
        N = len(smoothedHeatmap)
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8),
                    linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        # Inset rectangle given in axes coordinates, transformed to display
        # coordinates and then divided by the figure size in pixels.
        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(
            Bbox, ax.transAxes).get_points()
        left = tBbox[0, 0] / fw
        bottom = tBbox[0, 1] / fh
        width = (tBbox[1, 0] - tBbox[0, 0]) / fw
        height = (tBbox[1, 1] - tBbox[0, 1]) / fh
        axins = fig.add_axes([left, bottom, width, height],
                             axisbg=(0, 0, 0, 0),
                             xscale="log", yscale="log")
        removeAxes(ax=axins)
        for tickLabel in axins.get_xticklabels():
            tickLabel.set_fontsize(6)
        for tickLabel in axins.get_yticklabels():
            tickLabel.set_fontsize(6)

        # Average of two diagonal sub-blocks covering 5-45% and 55-95% of
        # the map.
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        curScaling = scaling(0.5 * (smoothedHeatmap[st:end, st:end] +
                                    smoothedHeatmap[st2:end2, st2:end2]))
        axins.plot(*curScaling, color="blue", label="intra-arm")
        if dataset in referenceDatasets:
            # Computed once and reused; the input is the same as above.
            myscaling = curScaling
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        if myscaling is not None:
            axins.plot(*myscaling, color="grey")

        # Hide every second tick line (odd indices)
        for i, line in enumerate(axins.get_xticklines() +
                                 axins.get_yticklines()):
            if i % 2 == 1:
                line.set_visible(False)

    plt.show()
Ejemplo n.º 12
0
def showAllDatasets(showGenes=False):
    """
    Draw the smoothed heatmap of every dataset with a P(s) inset on top.

    One subplot is created per entry of ``datasets``. Each heatmap is
    gap-filled, corrected with ``ultracorrect``, smoothed with
    ``adaptiveSmoothing`` and shown with ``imshow``. A semi-transparent grey
    triangle is drawn over one side of the diagonal, and a log-log inset
    axes placed there shows the output of ``scaling()`` for the average of
    two diagonal sub-blocks of the map (labelled "intra-arm"). The scaling
    of the reference datasets ('Wildtype_0min_BglII_rep1', "ML2000_0hr") is
    remembered and re-plotted in grey in that and all later insets.

    Parameters
    ----------
    showGenes : bool, optional
        If true, draw thin horizontal/vertical lines at the positions of
        highly expressed genes. Positions assume 10kb bins and the lines
        span 0..500. Defaults to False, which matches the previous behavior
        (the overlay was disabled in code).

    Returns
    -------
    None
    """
    setExceptionHook()

    #plt.figure(figsize=(25, 15))
    fig = plt.figure()

    # Size of the figure in pixels; used below to convert display
    # coordinates into figure fractions.
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    # Get subplot configuration
    sx, sy = subplots(len(datasets))

    # Top highly expressed genes (genomic coordinates)
    geneCoor = [
        1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524,
        1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707,
        3480870, 3829656, 1424678, 901855, 1439056, 3678537
    ]
    # Here we committed to 10kb resolution - change below if you're not
    genePos = [i / 10000. for i in geneCoor] if showGenes else []

    referenceDatasets = ('Wildtype_0min_BglII_rep1', "ML2000_0hr")
    # Scaling of the most recent reference dataset; None until one is seen.
    myscaling = None

    for j, dataset in enumerate(datasets):
        plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        # Fill in gaps - obsolete, as heatmaps are with overlaps
        zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
        heatmap[zeros] = heatmap[zeros - 1]
        heatmap[:, zeros] = heatmap[:, zeros - 1]

        # Regular IC protocol
        for offset in (0, 1, -1):
            mirnylib.numutils.fillDiagonal(heatmap, 0, offset)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)

        # Putting lines at highly expressed genes (only if requested)
        for lpos in genePos:
            plt.hlines(lpos, 0, 500, linewidth=0.7, color="black",
                       alpha=0.2, zorder=1)
            plt.vlines(lpos, 0, 500, linewidth=0.7, color="black",
                       alpha=0.2, zorder=1)

        # Performing adaptive smoothing
        smoothedHeatmap = adaptiveSmoothing(heatmap, 20)
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035,
                   cmap="acidblues", zorder=0)
        plt.xticks([])
        plt.yticks([])

        plt.subplots_adjust(
            left=0.05,  # the left side of the subplots of the figure
            right=0.95,  # the right side of the subplots of the figure
            bottom=0.05,  # the bottom of the subplots of the figure
            top=0.95,  # the top of the subplots of the figure
            wspace=0.1,  # width reserved for blank space between subplots
            hspace=0.2)

        # Code below just puts the P(s) over the heatmap
        N = len(smoothedHeatmap)
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8),
                    linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        # Inset rectangle given in axes coordinates, transformed to display
        # coordinates and then divided by the figure size in pixels.
        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(
            Bbox, ax.transAxes).get_points()
        left = tBbox[0, 0] / fw
        bottom = tBbox[0, 1] / fh
        width = (tBbox[1, 0] - tBbox[0, 0]) / fw
        height = (tBbox[1, 1] - tBbox[0, 1]) / fh
        axins = fig.add_axes([left, bottom, width, height],
                             axisbg=(0, 0, 0, 0),
                             xscale="log", yscale="log")
        removeAxes(ax=axins)
        for tickLabel in axins.get_xticklabels():
            tickLabel.set_fontsize(6)
        for tickLabel in axins.get_yticklabels():
            tickLabel.set_fontsize(6)

        # Average of two diagonal sub-blocks covering 5-45% and 55-95% of
        # the map.
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        curScaling = scaling(0.5 * (smoothedHeatmap[st:end, st:end] +
                                    smoothedHeatmap[st2:end2, st2:end2]))
        axins.plot(*curScaling, color="blue", label="intra-arm")
        if dataset in referenceDatasets:
            # Computed once and reused; the input is the same as above.
            myscaling = curScaling
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        if myscaling is not None:
            axins.plot(*myscaling, color="grey")

        # Hide every second tick line (odd indices)
        for i, line in enumerate(axins.get_xticklines() +
                                 axins.get_yticklines()):
            if i % 2 == 1:
                line.set_visible(False)

    plt.show()
Ejemplo n.º 13
0
def refine_paper(filename, create=True):
    """Build a filtered Hi-C dataset from raw files and self-check the results.

    filename[0] is a list of filenames of incoming files,
    filename[1] is a folder (path prefix) for outgoing files.

    Stages, in order:
      1. parse every incoming file and save it as "<name>_parsed.frag";
      2. merge the parsed files into "<prefix>_merged.frag";
      3. load the merged file into an in-memory dataset and run
         filterRsiteStart, filterDuplicates, filterLarge and filterExtreme;
      4. compare the dataset metadata with the pickled "sampleMetadata" file;
      5. check that all-by-all, by-chromosome and high-resolution heatmaps
         are consistent with each other;
      6. check the read count after updateGenome().

    create must be true. Stages 4-6 operate on the in-memory dataset that is
    only produced by stages 1-3, so there is nothing to test otherwise.

    Raises:
        StandardError: an incoming file does not exist.
        ValueError: create is false, or the metadata differs from the
            reference stored in "sampleMetadata".
        AssertionError: one of the consistency checks failed.
    """
    if not create:
        # Previously this case crashed later with an UnboundLocalError on TR;
        # fail immediately with a message that explains why.
        raise ValueError("refine_paper needs create=True: the refined "
                         "dataset is only built during the creation stage")

    for onename in filename[0]:
        #Parsing individual files
        if not os.path.exists(onename):
            raise StandardError("path not found: %s" % onename)
        TR = HiCdataset("bla",
                        genome=genomeFolder,
                        enzymeName="HindIII",
                        maximumMoleculeLength=500,
                        inMemory=True)
        print("\nTesting loading new data without rsite information    ")
        TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
        #assert len(TR.DS) == 856143

        #assert len(TR.ufragments) == 634572
        TR.save(onename + "_parsed.frag")

    #Merging files alltogether, applying filters
    mergedPath = filename[1] + "_merged.frag"
    TR = HiCdataset(mergedPath,
                    enzymeName="HindIII",
                    genome=genomeFolder,
                    mode="w")
    TR.merge([i + "_parsed.frag" for i in filename[0]])

    TR = HiCdataset("refined",
                    genome=genomeFolder,
                    enzymeName="HindIII",
                    mode="w",
                    inMemory=True)

    print("\nTesting chunking during all tests")
    TR.chunksize = 30000
    TR.load(mergedPath)

    print("\nTesting Rsite filter")
    TR.filterRsiteStart(offset=5)

    #assert len(TR.DS) == 832110

    print("\nTesting duplicate filter")
    TR.filterDuplicates(chunkSize=30000)

    #assert len(TR.DS) == 830275

    print("\nTesting small/large and extreme fragment filter")
    TR.filterLarge()

    #assert len(TR.DS) == 825442
    TR.filterExtreme(cutH=0.005, cutL=0)
    TR.writeFilteringStats()

    #assert len(TR.DS) == 803845

    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle

    # Close the reference file deterministically instead of leaking the handle.
    with open("sampleMetadata") as metadataFile:
        mdata = cPickle.load(metadataFile)

    inconsistent = False
    for i in sorted(mdata):
        if TR.metadata[i] != mdata[i]:
            print("Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i]))
            inconsistent = True
    if inconsistent:
        print("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print("Testing allxall and by-chromosome heatmap counting diagonal twice")

    print("----> saving allxall heatmap")
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # Bin ranges of the chromosomes with indices 1 and 2 in the all-by-all map.
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print("----> saving by chromosome heatmap")
    # NOTE(review): this writes to the same "-1M.hm" path that `a` was opened
    # from; the "once" section below uses a separate "-1M-byChr.hm" file.
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # Compare element-wise: a plain sum of signed differences can be zero even
    # when individual bins disagree.
    assert ((b - chrom1) == 0).all()
    print("Cis heatmap consistent")
    assert ((bb - chrom12) == 0).all()
    print('Trans heatmap consistent')
    print(a["heatmap"][::10, ::10].sum())
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print("Heatmap sum correct\n")

    #---------------------------------
    print("Testing allxall and by-chromosome heatmap counting diagonal once")

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print("----> saving by chromosome heatmap")
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="once")

    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000,
                                    countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000,
                                       chromosomes=[14],
                                       countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # Adding the diagonal a second time to the "once" map must give the
    # "twice" map saved earlier.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print("Diagonal counting methods are consistent\n")

    # The high-resolution map was saved with countDiagonalReads="twice", so
    # double the diagonal of the "once" map before comparing against it.
    newchrom1 = chrom1 + np.diag(np.diag(chrom1))

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print("Testing updateGenome method")
    from mirnylib.genome import Genome
    # Despite its name, this table flags (with 1) the chromosome indices whose
    # reads are counted below: positions 1-4 and 22 of 24 entries. Presumably
    # these correspond to the chromosomes named in readChrms - TODO confirm.
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    firstSideFlagged = removeChromIDs[TR.chrms1] == 1
    secondSideFlagged = removeChromIDs[TR.chrms2] == 1
    # Expected count: reads with both sides flagged, plus reads whose first
    # side is flagged and whose second side has chromosome index -1.
    t = (firstSideFlagged * secondSideFlagged).sum() + (
        firstSideFlagged * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    # NOTE(review): this value is not used by any visible code after it.
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Ejemplo n.º 14
0
def plotCorrelationAtDifferentBinning():
    """Plots figure with correlation at different binning.
    Note the caching and creating of binned heatmaps flags below.
    Suppplementary paper figure
    """

    sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    setExceptionHook()

    cache = False
    create = False

    if create == True:
        if cache == True:
            #-------------------standard version code-----------------
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../ErezPaperData/hg18/GM-HindIII-hg18_refined.frag")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../ErezPaperData/hg18/GM-HindIII-hg18"\
                     "_refined.frag")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")

            #----------------------cross-check code----------------
#            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                        override=False, inMemory=True)
#            FR.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR3.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR2.load("../../../ErezPaperData/hg18/G"\
#                    "M-HindIII-hg18_refined.frag")
            #-------end corss-check code ---------------------------------
            #--------Filter only trans DS reads-----------------
            FR.maskFilter(FR.DS * (FR.chrms1 != FR.chrms2))
            FR2.maskFilter(FR2.DS * (FR2.chrms1 != FR2.chrms2))
            FR3.maskFilter(FR3.DS * (FR3.chrms1 != FR3.chrms2))

            #Now create two halfs of one dataset and down-sample second dataset
            #----------------------standard version code--------
            fraction = 0.5 * len(FR.DS) / float(len(FR2.DS))

            rarray = numpy.random.random(len(FR.DS))
            mask1 = rarray < 0.5
            mask3 = rarray >= 0.5
            mask2 = numpy.random.random(len(FR2.DS)) < fraction

            #-------------------- cross-check code---------
            #fraction = 0.5 * len(FR2.DS) / float(len(FR.DS))

            #rarray = numpy.random.random(len(FR.DS))
            #mask1 =  rarray  < fraction
            #mask3 = (rarray > fraction) * (rarray < fraction * 2)
            #mask2 =  numpy.random.random(len(FR2.DS)) > 0.5
            #-----------------------------------------

            FR.maskFilter(mask1)
            FR2.maskFilter(mask2)
            FR3.maskFilter(mask3)

            FR.save("../../../tcc/working/cache1")
            FR2.save("../../../tcc/working/cache2")
            FR3.save("../../../tcc/working/cache3")
        else:
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../tcc/working/cache1")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../tcc/working/cache3")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../tcc/working/cache2")

        for size in sizes:
            FR.saveHeatmap("../../../tcc/working/HindIII_%d.hm" %
                           size, size * 1000000)
            FR2.saveHeatmap("../../../tcc/working/NcoI_%d.hm" %
                            size, size * 1000000)
            FR3.saveHeatmap("../../../tcc/working/control_%d.hm" %
                            size, size * 1000000)

    p1 = []
    p2 = []
    p3 = []
    p4 = []
    evs = []
    for size in sizes:

        BD = binnedDataAnalysis(size * 1000000, "../../../data/hg18")
        BD.simpleLoad("../../../tcc/working/HindIII_%d.hm" % size, "HindIII")
        BD.simpleLoad("../../../tcc/working/NcoI_%d.hm" % size, "NcoI")
        BD.simpleLoad("../../../tcc/working/control_%d.hm" % size, "control")
        BD.removeDiagonal()
        BD.removePoorRegions(cutoff=2)
        BD.removeCis()

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]

        mask = (numpy.sum(
            data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
        validMask = mask[:, None] * mask[None, :]
        transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
        cormask = transmask * validMask

        c1 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c4 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
        p4.append(c4)
        p1.append(c1)

        print "size\t%d\traw:" % size, c1,
        BD.removeZeros()
        BD.fakeCis()  # does iterative correction as well
        BD.restoreZeros(value=0)

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        c2 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c3 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
            print evs

        p3.append(c3)
        p2.append(c2)

        print "\tcorrected:", c2, "\tcontrol", c3

    plt.plot(sizes, p1, label="Raw data, between enzymes")
    plt.plot(sizes, p2, label="Iteratively corrected, between")
    plt.plot(sizes, p3, label="IC, within")
    plt.xlabel("Bin size, MB")
    plt.xticks(range(1, 11))
    plt.ylabel("Spearman correlation coefficient")
    plt.legend()
    niceShow()

    setExceptionHook()
    0 / 0