def diamondScore(dataset, size=10):
    """
    Extract a so-called "diamond score" for every bin of a heatmap.

    Inspired by Suzana Hadjur talks - see Sevil Sofueva, EMBO 2013,
    Supp Figure 11 (but this is a bit different from Supp Figure 11!!!)

    The matrix stored under the "heatmap" key of ``h5dict(hm(dataset))`` is
    gap-filled, passed through ``trunc`` and ``ultracorrect``, normalized to
    a mean column sum of one and tiled 3x3 so that windows may run over the
    matrix edges. For every bin ``mon`` the contacts between bins
    ``mon + i`` and ``mon - j`` (``i, j >= 0``, ``i + j < size``) are summed.

    Parameters
    ----------
    dataset
        Dataset identifier; handed to ``hm`` to locate the heatmap file.
    size : int
        Side of the window, in bins.

    Returns
    -------
    numpy.ndarray
        One value per heatmap bin: the window sum minus
        ``gaussian_filter(sums, 30)`` (presumably scipy.ndimage's filter with
        sigma=30 bins - confirm against the file's imports).
    """
    heatmap = 1. * h5dict(hm(dataset))["heatmap"]

    # Fill empty bins (zero column sum) with a copy of the preceding bin;
    # index -1 wraps to the last bin when bin 0 is empty.
    zeros = np.nonzero(np.sum(heatmap, axis=0) == 0)[0]
    heatmap[zeros] = heatmap[zeros - 1]
    heatmap[:, zeros] = heatmap[:, zeros - 1]

    # Blank the main diagonal and its two neighbours before correction
    # (fillDiagonal is assumed to take (matrix, value, diagonal offset)).
    mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
    heatmap = trunc(heatmap, low=0, high=0.0001)
    heatmap = ultracorrect(heatmap)

    # Put back plausible values on the blanked diagonals, scaled from the
    # mean of the second diagonal.
    diag2value = np.mean(np.diagonal(heatmap, 2))
    mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
    mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
    heatmap /= np.mean(np.sum(heatmap, axis=0))

    # 3x3 tiling: the central copy can be scanned with windows that reach
    # past either end of the original matrix.
    tiledHeatmap = np.hstack([heatmap, heatmap, heatmap])
    tiledHeatmap = np.vstack([tiledHeatmap, tiledHeatmap, tiledHeatmap])
    setExceptionHook()  # debug only

    start = len(heatmap)
    end = 2 * len(heatmap)
    ratios = []
    for mon in xrange(start, end):
        # Rows run forward from mon, columns run backward from mon.
        diamond = tiledHeatmap[mon:mon + size, mon:mon - size:-1]
        # Keep the triangle i + j < window side.
        inds = (np.arange(len(diamond))[:, None]
                + np.arange(len(diamond))[None, :]) < len(diamond)
        ratios.append(diamond[inds].sum())

    # Subtract the smooth trend. The original had a second, unreachable
    # "return ratios" after this statement; it has been removed.
    return np.array(ratios) - gaussian_filter(ratios, 30)
def plotScalingsForDifferentDatasets():
    """
    Compute, pickle and plot the P(s) scaling for every entry of ``datasets``.

    For each dataset a ``HiCStatistics`` object is loaded in memory, reduced
    to same-chromosome / same-strand read pairs, corrected with
    ``iterativeCorrectionFromMax`` and plotted with ``plotScaling``. The
    value returned by ``plotScaling`` is pickled to
    ``scalings/<last path component of dataset>``. After all datasets have
    been processed the shared figure is shown with a legend.

    Takes no arguments and returns None; reads the module-level ``datasets``.
    """
    setExceptionHook()  # for debug purposes only

    for dataset in datasets:
        # Creating one inMemory dataset to get scaling
        FR = hiclib.fragmentHiC.HiCStatistics("bla", getGenome(), inMemory=True)

        # Load data. Do not build fragments - maskFilter will do it anyway
        FR.load(frag(dataset), buildFragments=False)

        # Keep read pairs from the same strand - see (Imakaev 2012)
        FR.maskFilter((FR.chrms1 == FR.chrms2) * (FR.strands1 == FR.strands2))

        # Perform fragment-based IC
        FR.iterativeCorrectionFromMax()

        # These are coordinates of chromosomal arms
        regions = [(0, 200000, 1750000), (0, 2300000, 3800000)]

        # Main P(s) code. Do not consider direct neighbors, but keep second
        # neighbors. Caulobacter is not badly affected by inefficient
        # restriction, so we set excludeNeighbors to 1. In eukaryotes it
        # would be 2-4.
        # useWeights uses weights written by fragment-based IC
        scaling = FR.plotScaling(excludeNeighbors=1, enzyme=enzyme(dataset),
                                 normalize=True, useWeights=True,
                                 regions=regions, appendReadCount=True,
                                 mindist=5000, label=dataset, linewidth=2)

        # Saving P(s). The handle is closed by the with-block so the pickle
        # is flushed to disk deterministically (the original leaked it).
        outPath = os.path.join("scalings", os.path.split(dataset)[-1])
        with open(outPath, 'w') as outFile:
            cPickle.dump(scaling, outFile)

    # Plotting the result
    plt.xlim((5000, 2000000))
    plt.legend()
    plt.show()
def directionalityRatio(dataset, size=20):
    """
    Compute a per-bin directionality ratio from a heatmap.

    The matrix under the "heatmap" key of ``h5dict(hm(dataset))`` is
    gap-filled, run through ``trunc`` and ``ultracorrect``, normalized to a
    mean column sum of one and tiled 3x3 (Caulobacter is a ring; tiling is a
    cheap-and-dirty way to account for that). For every bin ``mon``:

    * ``upstream``   = sum of row ``mon`` over columns ``mon .. mon+size-1``
    * ``downstream`` = sum of column ``mon`` over rows ``mon-size .. mon-1``

    and the reported value is ``upstream / (upstream + downstream)``.

    Parameters
    ----------
    dataset
        Dataset identifier; handed to ``hm`` to locate the heatmap file.
    size : int
        Number of bins summed on each side.

    Returns
    -------
    list
        One ratio per heatmap bin.
    """
    contactMap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Filling in the gaps in the heatmap. Not really needed as heatmaps are
    # with overlaps, so they have no gaps.
    emptyBins = np.nonzero(np.sum(contactMap, axis=0) == 0)[0]
    contactMap[emptyBins] = contactMap[emptyBins - 1]
    contactMap[:, emptyBins] = contactMap[:, emptyBins - 1]

    # Following regular IC protocol: blank the three central diagonals,
    # truncate, correct.
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(contactMap, 0, offset)
    contactMap = trunc(contactMap, low=0, high=0.0001)
    contactMap = ultracorrect(contactMap)

    # Re-populate the blanked diagonals from the mean of the second diagonal.
    secondDiagMean = np.mean(np.diagonal(contactMap, 2))
    mirnylib.numutils.fillDiagonal(contactMap, 1.5 * secondDiagMean, 0)
    mirnylib.numutils.fillDiagonal(contactMap, 1.2 * secondDiagMean, 1)
    mirnylib.numutils.fillDiagonal(contactMap, 1.2 * secondDiagMean, -1)
    contactMap /= np.mean(np.sum(contactMap, axis=0))

    # Put 9 copies of the heatmap in a huge square.
    tiledRow = np.hstack([contactMap] * 3)
    tiled = np.vstack([tiledRow] * 3)
    setExceptionHook()  # debug only

    nBins = len(contactMap)
    result = []
    # Going through the central square.
    for pos in xrange(nBins, 2 * nBins):
        forward = tiled[pos, pos:pos + size].sum()
        backward = tiled[pos - size:pos, pos].sum()
        result.append(forward / (forward + backward))
    return result
def directionalityRatio(dataset, size=20):
    """
    Compute a per-bin directionality ratio from a heatmap.

    The matrix under the "heatmap" key of ``h5dict(hm(dataset))`` is
    gap-filled, run through ``trunc`` and ``ultracorrect``, normalized to a
    mean column sum of one and tiled 3x3 (Caulobacter is a ring; tiling is a
    cheap-and-dirty way to account for that). For every bin ``mon``:

    * ``upstream``   = sum of row ``mon`` over columns ``mon .. mon+size-1``
    * ``downstream`` = sum of column ``mon`` over rows ``mon-size .. mon-1``

    and the reported value is ``upstream / (upstream + downstream)``.

    Parameters
    ----------
    dataset
        Dataset identifier; handed to ``hm`` to locate the heatmap file.
    size : int
        Number of bins summed on each side.

    Returns
    -------
    list
        One ratio per heatmap bin.
    """
    contactMap = 1. * h5dict(hm(dataset))["heatmap"]  # extract heatmap

    # Filling in the gaps in the heatmap. Not really needed as heatmaps are
    # with overlaps, so they have no gaps.
    emptyBins = np.nonzero(np.sum(contactMap, axis=0) == 0)[0]
    contactMap[emptyBins] = contactMap[emptyBins - 1]
    contactMap[:, emptyBins] = contactMap[:, emptyBins - 1]

    # Following regular IC protocol: blank the three central diagonals,
    # truncate, correct.
    for offset in (0, 1, -1):
        mirnylib.numutils.fillDiagonal(contactMap, 0, offset)
    contactMap = trunc(contactMap, low=0, high=0.0001)
    contactMap = ultracorrect(contactMap)

    # Re-populate the blanked diagonals from the mean of the second diagonal.
    secondDiagMean = np.mean(np.diagonal(contactMap, 2))
    mirnylib.numutils.fillDiagonal(contactMap, 1.5 * secondDiagMean, 0)
    mirnylib.numutils.fillDiagonal(contactMap, 1.2 * secondDiagMean, 1)
    mirnylib.numutils.fillDiagonal(contactMap, 1.2 * secondDiagMean, -1)
    contactMap /= np.mean(np.sum(contactMap, axis=0))

    # Put 9 copies of the heatmap in a huge square.
    tiledRow = np.hstack([contactMap] * 3)
    tiled = np.vstack([tiledRow] * 3)
    setExceptionHook()  # debug only

    nBins = len(contactMap)
    result = []
    # Going through the central square.
    for pos in xrange(nBins, 2 * nBins):
        forward = tiled[pos, pos:pos + size].sum()
        backward = tiled[pos - size:pos, pos].sum()
        result.append(forward / (forward + backward))
    return result
def plotScalingsForDifferentDatasets():
    """
    Compute, pickle and plot the P(s) scaling for every entry of ``datasets``.

    For each dataset a ``HiCStatistics`` object is loaded in memory, reduced
    to same-chromosome / same-strand read pairs, corrected with
    ``iterativeCorrectionFromMax`` and plotted with ``plotScaling``. The
    value returned by ``plotScaling`` is pickled to
    ``scalings/<last path component of dataset>``. After all datasets have
    been processed the shared figure is shown with a legend.

    Takes no arguments and returns None; reads the module-level ``datasets``.
    """
    setExceptionHook()  # for debug purposes only

    for dataset in datasets:
        # Creating one inMemory dataset to get scaling
        FR = hiclib.fragmentHiC.HiCStatistics("bla", getGenome(), inMemory=True)

        # Load data. Do not build fragments - maskFilter will do it anyway
        FR.load(frag(dataset), buildFragments=False)

        # Keep read pairs from the same strand - see (Imakaev 2012)
        FR.maskFilter((FR.chrms1 == FR.chrms2) * (FR.strands1 == FR.strands2))

        # Perform fragment-based IC
        FR.iterativeCorrectionFromMax()

        # These are coordinates of chromosomal arms
        regions = [(0, 200000, 1750000), (0, 2300000, 3800000)]

        # Main P(s) code. Do not consider direct neighbors, but keep second
        # neighbors. Caulobacter is not badly affected by inefficient
        # restriction, so we set excludeNeighbors to 1. In eukaryotes it
        # would be 2-4.
        # useWeights uses weights written by fragment-based IC
        scaling = FR.plotScaling(excludeNeighbors=1, enzyme=enzyme(dataset),
                                 normalize=True, useWeights=True,
                                 regions=regions, appendReadCount=True,
                                 mindist=5000, label=dataset, linewidth=2)

        # Saving P(s). The handle is closed by the with-block so the pickle
        # is flushed to disk deterministically (the original leaked it).
        outPath = os.path.join("scalings", os.path.split(dataset)[-1])
        with open(outPath, 'w') as outFile:
            cPickle.dump(scaling, outFile)

    # Plotting the result
    plt.xlim((5000, 2000000))
    plt.legend()
    plt.show()
from mirnylib import plotting import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from mirnylib import genome from mirnylib import h5dict from hiclib import binnedData import numpy as np from mirnylib.systemutils import setExceptionHook setExceptionHook() ########define file names and other params #mirnylib genome params genomeName = "GalGal5filtered" genome_db = genome.Genome( "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/", readChrms=[], chrmFileTemplate="N%s.fa") #where to find hic lib heatmap basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/" filename = "chunk0001.hdf5.hm-res-1000kb" #resulting file name out_file = "all.glm" ###parameters requerd by LACHESIS to be in the header header_string = """# GenomeLinkMatrix file - see GenomeLinkMatrix.h for documentation of this object type # Species = chick # N_bins = 524
def refine_paper(filename, create=True):
    """
    End-to-end consistency test of the HiCdataset pipeline on one sample.

    filename[0] is a list of filenames of incoming files
    filename[1] is a folder (used as a path prefix) for outgoing files

    When ``create`` is true the function parses every input file, merges the
    parsed files, applies the filters, compares the resulting metadata with
    the pickled reference in "sampleMetadata", and then checks that the
    all-by-all, by-chromosome and high-resolution heatmaps agree with each
    other. Raises ValueError on a metadata mismatch and AssertionError on a
    heatmap mismatch.

    NOTE(review): the source this was recovered from had lost its
    indentation. Everything is placed under ``if create == True`` because all
    later statements use ``TR``, which is only assigned there - confirm.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",
                            maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files all together, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        # Fresh in-memory dataset that the filters below operate on.
        TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII",
                        mode="w", inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)
        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442

        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845

        #-------------------------------------------
        # Compare the metadata accumulated by the filters with the pickled
        # reference; every mismatching key is reported before failing.
        TR.printMetadata(saveTo="metadata")
        import cPickle

        stop = False
        mdata = cPickle.load(open("sampleMetadata"))
        for i in sorted(mdata.keys()):
            if TR.metadata[i] != mdata[i]:
                print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
                stop = True
        if stop == True:
            print ("""------------_ERROR_-------------- Inconsistent metadata: see above ----------------------------------------""")
            raise ValueError("Inconsistent Metadata")

        print "Testing allxall and by-chromosome heatmap counting diagonal twice"
        print "----> saving allxall heatmap"
        TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
        a = h5dict(filename[1] + "-1M.hm")
        # Bin ranges of chromosome indices 1 and 2 in the all-by-all heatmap.
        st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
        st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
        chrom1 = a["heatmap"][st:end, st:end]
        chrom12 = a["heatmap"][st:end, st2:end2]
        setExceptionHook()
        print "----> saving by chromosome heatmap"
        # NOTE(review): written to the same "-1M.hm" path as the all-by-all
        # heatmap above - confirm this is intended.
        TR.saveByChromosomeHeatmap(
            filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
            countDiagonalReads="twice")

        b = h5dict(filename[1] + "-1M.hm")["1 1"]
        bb = h5dict(filename[1] + "-1M.hm")["1 2"]
        # Only the sum of differences is checked, not each element.
        assert (b - chrom1).sum() == 0
        print "Cis heatmap consistent"
        assert (bb - chrom12).sum() == 0
        print 'Trans heatmap consistent'
        print a["heatmap"][::10, ::10].sum()
        #assert a["heatmap"][::10, ::10].sum() == 21800
        print "Heatmap sum correct\n"

        #---------------------------------
        print "Testing allxall and by-chromosome heatmap counting diagonal once"
        TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
        Ta = h5dict(filename[1] + "-1M.hm")
        st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
        st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
        chrom1 = Ta["heatmap"][st:end, st:end]
        chrom12 = Ta["heatmap"][st:end, st2:end2]
        setExceptionHook()
        print "----> saving by chromosome heatmap"
        TR.saveByChromosomeHeatmap(
            filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
            countDiagonalReads="once")

        TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm", resolution=50000, countDiagonalReads="twice")
        TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm", resolution=5000,chromosomes = [14], countDiagonalReads="twice")

        Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
        Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
        assert ((Tb - chrom1) == 0).all()
        assert ((Tbb - chrom12) == 0).all()
        # "twice" must equal "once" with the main diagonal added once more.
        assert ((Tb + np.diag(np.diag(Tb))) == b).all()
        print "Diagonal counting methods are consistent\n"

        # Double the diagonal of the "once" map and compare it with the
        # 50kb map (total sum, then after coarsegrain with factor 20).
        newchrom1 = chrom1.copy()
        for i in xrange(len(newchrom1)):
            newchrom1[i,i] = 2 * newchrom1[i,i]
        Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
        assert np.abs(Tb.sum() - newchrom1.sum()) < 1
        assert np.sum(np.abs(coarsegrain(Tb,20,True) - newchrom1)) < 500

        #------------------------------
        print "Testing updateGenome method"
        from mirnylib.genome import Genome
        # 1 marks the chromosome indices expected to survive updateGenome.
        removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
        #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
        # Expected read count: both ends on kept chromosomes, or first end on
        # a kept chromosome and second end equal to -1.
        t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
        newGenome = Genome(genomePath=genomeFolder, readChrms=["2", "3", "4", "5", "X"])
        TR.updateGenome(newGenome)
        assert TR.N == t

        # Result is not used further within this function.
        a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Hi-C at 10kb resolution. It took it 20 minutes to load the data, 4 mins to remove poor bins, and couple hours to perform IC, about 10 minutes per pass. I also note that the data was in fact stored in memory for doing that, and it never used more than 16GB of RAM... in fact, creating this dataset used more, but this will be optimized later. """ from mirnylib.genome import Genome import numpy as np import warnings from mirnylib.h5dict import h5dict from mirnylib.numutils import removeDiagonals from mirnylib.systemutils import setExceptionHook setExceptionHook() class defaultMatrix(object): """ This is a template object which stores matrix in memory. Alternatively, matrix can be stored in an h5dict, either normally or in a sparse mode. All the methods should be first implemented here using getData() and setData() Then they shold be translated to sparse subclasses of this class. """ def __init__(self, data=None, dictToSave=None, key=""): """ Initializes the object that stores the Hi-C matrix.
def displayHeatmap():
    """
    Show simulated contacts and a Hi-C map in one log-scaled image.

    A shared ``N * N`` double buffer is filled by 30 ``doSim`` runs launched
    through ``fmap`` (``n=20`` workers); each run adds counts of the pairs
    returned by ``getSMCs()``. The accumulated map is coarse-grained by 20,
    clipped at its 99.9th percentile and normalized to a mean row sum of one.
    The Hi-C matrix ("13 13" dataset of the hard-coded h5dict file, cut to
    the module-level ``low``/``high`` window) is corrected with
    ``completeIC``, zoomed, clipped and normalized the same way, and copied
    into the strictly-lower triangle of the simulated map before plotting.

    Takes no arguments and returns None; relies on module-level ``N``,
    ``low`` and ``high``.
    """
    plt.figure(figsize=(5, 5))

    # Buffer shared between workers, viewed here as an (N, N) array.
    shared_arr = mp.Array(ctypes.c_double, N**2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        """Run one simulation and add its pair counts to the shared buffer."""
        flatCounts = tonumpyarray(shared_arr)
        model = initModel(i)

        snapshots = []
        maxBlocks = 10000
        # Random number of 150-step blocks, between maxBlocks/2 and maxBlocks.
        for _ in range(np.random.randint(maxBlocks // 2, maxBlocks)):
            model.steps(150)
            snapshots.append(model.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)

        # Encode each (row, column) pair as an index into the flat buffer.
        flatIndex = pairs[0] * N + pairs[1]
        position, counts = np.unique(flatIndex, return_counts=True)

        with shared_arr.get_lock():
            flatCounts[position] += counts

        print("Finished!")

    setExceptionHook()

    window = slice(low // 10, high // 10)
    store = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')
    hic = completeIC(store.get_dataset("13 13")[window, window])
    side = (1000 * (high - low)) // (600 * 20)
    print(hic.shape, side)
    hic = zoomArray(hic, (side, side))
    hic = np.clip(hic, 0, np.percentile(hic, 99.99))
    hic /= np.mean(np.sum(hic, axis=1))

    # number of threads to use. On a 20-core machine I use 20.
    fmap(doSim, range(30), n=20)

    arr = coarsegrain(arr, 20)
    arr = np.clip(arr, 0, np.percentile(arr, 99.9))
    arr /= np.mean(np.sum(arr, axis=1))

    # Strictly-lower triangle (row index > column index) shows Hi-C data.
    index = np.arange(len(arr))
    below = index[:, None] > index[None, :]
    arr[below] = hic[below]

    logged = np.log(arr + 0.0001)
    plt.imshow(logged,
               vmax=np.percentile(logged, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
def showAllDatasets():
    """
    Draw one smoothed heatmap per dataset with a P(s) inset on each.

    For every entry of the module-level ``datasets`` a subplot is created,
    the heatmap is gap-filled, passed through ``trunc``/``ultracorrect``,
    smoothed with ``adaptiveSmoothing`` and shown with ``imshow``. A grey
    triangle is laid over part of the image and a small log-log inset axes
    is added on top of it, showing ``scaling(...)`` of the averaged
    intra-arm blocks. Takes no arguments and returns None.

    NOTE(review): the source this was recovered from had lost its
    indentation; everything up to ``plt.show()`` is assumed to be inside the
    per-dataset loop - confirm.
    """
    setExceptionHook()

    #plt.figure(figsize=(25, 15))
    fig = plt.figure()

    #size of the figure (in pixels: inches * dpi)
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    #get subplot configuration
    sx, sy = subplots(len(datasets))

    for j, dataset in enumerate(datasets):
        curPlot = plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        #fill in gaps - obsolete, as heatmaps are with overlaps
        for _ in range(1):
            zeros = np.sum(heatmap, axis=0) == 0
            zeros = np.nonzero(zeros)[0]
            heatmap[zeros] = heatmap[zeros - 1]
            heatmap[:, zeros] = heatmap[:, zeros - 1]

        #regular IC protocol
        mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
        newHeatmap = heatmap

        #Top highly expressed genes
        #genePos = [18, 56, 77, 117, 143, 215, 234, 256, 266, 286, 300, 326, 336, 367, 379]
        geneCoor = [1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524, 1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707, 3480870, 3829656, 1424678, 901855, 1439056, 3678537]
        # here we committed to 10kb resolution - change below if you're not
        genePos = [i / 10000. for i in geneCoor]

        # The positions are immediately discarded, so the loop below draws
        # nothing; this acts as an off-switch for the gene lines.
        genePos = []

        #putting lines at highly expressed genes
        for lpos in genePos:
            plt.hlines(lpos , 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            plt.vlines(lpos , 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            pass

        #performing adaptive smoothing
        smoothedHeatmap = adaptiveSmoothing(newHeatmap, 20)
        # Normalized by the mean column sum of the unsmoothed heatmap.
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))

        #print dataset, sum([np.diagonal(smoothedHeatmap, i).sum() for i in range(60, 140)])
        #maps = [[smoothedHeatmap, smoothedHeatmap[:30]],
        #         [smoothedHeatmap[:, :30], smoothedHeatmap[:30, :30]]]
        #smoothedHeatmap = np.hstack([np.vstack(i) for i in maps])

        # allx / ally are not used anywhere below.
        allx = []
        ally = []

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035, cmap="acidblues", zorder=0)
        #plt.imshow((smoothedHeatmap), interpolation="nearest", vmin=0, vmax=np.exp(-4.5), cmap="fall", zorder=0)
        plt.xticks([])
        plt.yticks([])

        plt.subplots_adjust(left=0.05,  # the left side of the subplots of the figure
                            right=0.95,  # the right side of the subplots of the figure
                            bottom=0.05,  # the bottom of the subplots of the figure
                            top=0.95 ,  # the top of the subplots of the figure
                            wspace=0.1,  # the amount of width reserved for blank space between subplots
                            hspace=0.2)

        #cPickle.dump(scaling, open(dataset.split("/")[-1] + "scaling", 'w'))
        #plt.ylim((400, 200))
        #plt.xlim((0, 200))

        #code below just puts the P(s) over the heatmap
        N = len(smoothedHeatmap)
        # Semi-transparent grey triangle with vertices (1, 0), (N, N), (N, 0)
        # in data coordinates; the inset is drawn over it.
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8), linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        # Inset rectangle given in axes fractions, transformed through
        # ax.transAxes and divided by the figure size in pixels to obtain
        # figure fractions for fig.add_axes.
        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(Bbox, ax.transAxes).get_points()
        l, b, w, h = tBbox[0, 0] / fw, tBbox[0, 1] / fh, (tBbox[1, 0] - tBbox[0, 0]) / fw, (tBbox[1, 1] - tBbox[0, 1]) / fh
        # NOTE(review): "axisbg" was replaced by "facecolor" in matplotlib
        # 2.0 - confirm which matplotlib version this script targets.
        axins = fig.add_axes([l, b, w, h], axisbg=(0, 0, 0, 0), xscale="log", yscale="log")
        removeAxes(ax=axins)
        for xlabel_i in axins.get_xticklabels():
            xlabel_i.set_fontsize(6)
        for xlabel_i in axins.get_yticklabels():
            xlabel_i.set_fontsize(6)

        # Two diagonal blocks covering 5-45% and 55-95% of the bins are
        # averaged and labelled "intra-arm".
        N = len(smoothedHeatmap)
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        axins.plot(*scaling(0.5 * (smoothedHeatmap[st:end, st:end] + smoothedHeatmap[st2:end2, st2:end2])), color="blue", label="intra-arm")
        # Remember the curve of a reference dataset; myscaling persists
        # across later loop iterations.
        if (dataset in ['Wildtype_0min_BglII_rep1', "ML2000_0hr"]):
            myscaling = scaling(0.5 * (smoothedHeatmap[st:end, st:end] + smoothedHeatmap[st2:end2, st2:end2]))
        #axins.plot(*scaling(smoothedHeatmap[st:end, end2:st2:-1]), color="green", label="inter-arm")
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        # Once a reference curve exists it is drawn in grey on every inset.
        if "myscaling" in locals():
            axins.plot(*myscaling, color="grey")

        #axins.set_xticks([])
        #axins.set_yticks([])
        #axins.tick_params(color="red")

        #axins.set_xlabel("Mb")
        #axins.set_ylabel("Pc")
        # Hide every second tick line of the inset.
        for i, line in enumerate(axins.get_xticklines() + axins.get_yticklines()):
            if i % 2 == 1:  # odd indices
                line.set_visible(False)

        #if dataset != "Wildtype_0min_BglII_rep1":
        #    data = cPickle.load(open("scalings/{0}".format(dataset)))
        #    axins.plot(*data, color="blue")

        #axins.xscale("log")
        #axins.yscale("log")

        #end strange code

    plt.show()
def showAllDatasets():
    """
    Draw one smoothed heatmap per dataset with a P(s) inset on each.

    For every entry of the module-level ``datasets`` a subplot is created,
    the heatmap is gap-filled, passed through ``trunc``/``ultracorrect``,
    smoothed with ``adaptiveSmoothing`` and shown with ``imshow``. A grey
    triangle is laid over part of the image and a small log-log inset axes
    is added on top of it, showing ``scaling(...)`` of the averaged
    intra-arm blocks. Takes no arguments and returns None.

    NOTE(review): the source this was recovered from had lost its
    indentation; everything up to ``plt.show()`` is assumed to be inside the
    per-dataset loop - confirm.
    """
    setExceptionHook()

    #plt.figure(figsize=(25, 15))
    fig = plt.figure()

    #size of the figure (in pixels: inches * dpi)
    fw = fig.get_figwidth() * fig.get_dpi()
    fh = fig.get_figheight() * fig.get_dpi()

    #get subplot configuration
    sx, sy = subplots(len(datasets))

    for j, dataset in enumerate(datasets):
        curPlot = plt.subplot(sx, sy, j + 1)
        heatmap = 1. * h5dict(hm(dataset), 'r')["heatmap"]

        #fill in gaps - obsolete, as heatmaps are with overlaps
        for _ in range(1):
            zeros = np.sum(heatmap, axis=0) == 0
            zeros = np.nonzero(zeros)[0]
            heatmap[zeros] = heatmap[zeros - 1]
            heatmap[:, zeros] = heatmap[:, zeros - 1]

        #regular IC protocol
        mirnylib.numutils.fillDiagonal(heatmap, 0, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 0, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 0, -1)
        heatmap = trunc(heatmap, low=0, high=0.0001)
        heatmap = ultracorrect(heatmap)
        diag2value = np.mean(np.diagonal(heatmap, 2))
        mirnylib.numutils.fillDiagonal(heatmap, 1.5 * diag2value, 0)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, 1)
        mirnylib.numutils.fillDiagonal(heatmap, 1.2 * diag2value, -1)
        newHeatmap = heatmap

        #Top highly expressed genes
        #genePos = [18, 56, 77, 117, 143, 215, 234, 256, 266, 286, 300, 326, 336, 367, 379]
        geneCoor = [
            1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524,
            1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707,
            3480870, 3829656, 1424678, 901855, 1439056, 3678537
        ]
        # here we committed to 10kb resolution - change below if you're not
        genePos = [i / 10000. for i in geneCoor]

        # The positions are immediately discarded, so the loop below draws
        # nothing; this acts as an off-switch for the gene lines.
        genePos = []

        #putting lines at highly expressed genes
        for lpos in genePos:
            plt.hlines(lpos, 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            plt.vlines(lpos, 0, 500, linewidth=0.7, color="black", alpha=0.2, zorder=1)
            pass

        #performing adaptive smoothing
        smoothedHeatmap = adaptiveSmoothing(newHeatmap, 20)
        # Normalized by the mean column sum of the unsmoothed heatmap.
        smoothedHeatmap /= np.mean(np.sum(heatmap, axis=0))

        #print dataset, sum([np.diagonal(smoothedHeatmap, i).sum() for i in range(60, 140)])
        #maps = [[smoothedHeatmap, smoothedHeatmap[:30]],
        #         [smoothedHeatmap[:, :30], smoothedHeatmap[:30, :30]]]
        #smoothedHeatmap = np.hstack([np.vstack(i) for i in maps])

        # allx / ally are not used anywhere below.
        allx = []
        ally = []

        plt.title(dataset, fontsize=10)
        plt.imshow((smoothedHeatmap), interpolation="none", vmax=0.035, cmap="acidblues", zorder=0)
        #plt.imshow((smoothedHeatmap), interpolation="nearest", vmin=0, vmax=np.exp(-4.5), cmap="fall", zorder=0)
        plt.xticks([])
        plt.yticks([])

        plt.subplots_adjust(
            left=0.05,  # the left side of the subplots of the figure
            right=0.95,  # the right side of the subplots of the figure
            bottom=0.05,  # the bottom of the subplots of the figure
            top=0.95,  # the top of the subplots of the figure
            wspace=
            0.1,  # the amount of width reserved for blank space between subplots
            hspace=0.2)

        #cPickle.dump(scaling, open(dataset.split("/")[-1] + "scaling", 'w'))
        #plt.ylim((400, 200))
        #plt.xlim((0, 200))

        #code below just puts the P(s) over the heatmap
        N = len(smoothedHeatmap)
        # Semi-transparent grey triangle with vertices (1, 0), (N, N), (N, 0)
        # in data coordinates; the inset is drawn over it.
        pts = np.array([[1, 0], [N, N], [N, 0]])
        p = Polygon(pts, closed=True, facecolor=(0.8, 0.8, 0.8), linewidth=0, alpha=0.7, zorder=2)
        ax = plt.gca()
        ax.add_patch(p)

        # Inset rectangle given in axes fractions, transformed through
        # ax.transAxes and divided by the figure size in pixels to obtain
        # figure fractions for fig.add_axes.
        Bbox = matplotlib.transforms.Bbox.from_bounds(.55, .55, .35, .42)
        tBbox = matplotlib.transforms.TransformedBbox(
            Bbox, ax.transAxes).get_points()
        l, b, w, h = tBbox[0, 0] / fw, tBbox[0, 1] / fh, (
            tBbox[1, 0] - tBbox[0, 0]) / fw, (tBbox[1, 1] - tBbox[0, 1]) / fh
        # NOTE(review): "axisbg" was replaced by "facecolor" in matplotlib
        # 2.0 - confirm which matplotlib version this script targets.
        axins = fig.add_axes([l, b, w, h], axisbg=(0, 0, 0, 0), xscale="log", yscale="log")
        removeAxes(ax=axins)
        for xlabel_i in axins.get_xticklabels():
            xlabel_i.set_fontsize(6)
        for xlabel_i in axins.get_yticklabels():
            xlabel_i.set_fontsize(6)

        # Two diagonal blocks covering 5-45% and 55-95% of the bins are
        # averaged and labelled "intra-arm".
        N = len(smoothedHeatmap)
        st = int(0.05 * N)
        end = int(0.45 * N)
        st2 = int(0.55 * N)
        end2 = int(0.95 * N)
        axins.plot(*scaling(0.5 * (smoothedHeatmap[st:end, st:end] + smoothedHeatmap[st2:end2, st2:end2])), color="blue", label="intra-arm")
        # Remember the curve of a reference dataset; myscaling persists
        # across later loop iterations.
        if (dataset in ['Wildtype_0min_BglII_rep1', "ML2000_0hr"]):
            myscaling = scaling(0.5 * (smoothedHeatmap[st:end, st:end] + smoothedHeatmap[st2:end2, st2:end2]))
        #axins.plot(*scaling(smoothedHeatmap[st:end, end2:st2:-1]), color="green", label="inter-arm")
        axins.set_xlabel("kb", fontsize=6)
        axins.set_ylabel("Pc", fontsize=6)
        axins.grid()

        # Once a reference curve exists it is drawn in grey on every inset.
        if "myscaling" in locals():
            axins.plot(*myscaling, color="grey")

        #axins.set_xticks([])
        #axins.set_yticks([])
        #axins.tick_params(color="red")

        #axins.set_xlabel("Mb")
        #axins.set_ylabel("Pc")
        # Hide every second tick line of the inset.
        for i, line in enumerate(axins.get_xticklines() + axins.get_yticklines()):
            if i % 2 == 1:  # odd indices
                line.set_visible(False)

        #if dataset != "Wildtype_0min_BglII_rep1":
        #    data = cPickle.load(open("scalings/{0}".format(dataset)))
        #    axins.plot(*data, color="blue")

        #axins.xscale("log")
        #axins.yscale("log")

        #end strange code

    plt.show()
def refine_paper(filename, create=True):
    """
    End-to-end consistency test of the HiCdataset pipeline on one sample.

    filename[0] is a list of filenames of incoming files
    filename[1] is a folder (used as a path prefix) for outgoing files

    When ``create`` is true the function parses every input file, merges the
    parsed files, applies the filters, compares the resulting metadata with
    the pickled reference in "sampleMetadata", and then checks that the
    all-by-all, by-chromosome and high-resolution heatmaps agree with each
    other. Raises ValueError on a metadata mismatch and AssertionError on a
    heatmap mismatch.

    NOTE(review): the source this was recovered from had lost its
    indentation. Everything is placed under ``if create == True`` because all
    later statements use ``TR``, which is only assigned there - confirm.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla",
                            genome=genomeFolder,
                            enzymeName="HindIII",
                            maximumMoleculeLength=500,
                            inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files all together, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",
                        enzymeName="HindIII",
                        genome=genomeFolder,
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        # Fresh in-memory dataset that the filters below operate on.
        TR = HiCdataset("refined",
                        genome=genomeFolder,
                        enzymeName="HindIII",
                        mode="w",
                        inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)
        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442

        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845

        #-------------------------------------------
        # Compare the metadata accumulated by the filters with the pickled
        # reference; every mismatching key is reported before failing.
        TR.printMetadata(saveTo="metadata")
        import cPickle

        stop = False
        mdata = cPickle.load(open("sampleMetadata"))
        for i in sorted(mdata.keys()):
            if TR.metadata[i] != mdata[i]:
                print "Key {0} is not consistent: should be {1}, is {2}".format(
                    i, mdata[i], TR.metadata[i])
                stop = True
        if stop == True:
            print("""------------_ERROR_-------------- Inconsistent metadata: see above ----------------------------------------""")
            raise ValueError("Inconsistent Metadata")

        print "Testing allxall and by-chromosome heatmap counting diagonal twice"
        print "----> saving allxall heatmap"
        TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
        a = h5dict(filename[1] + "-1M.hm")
        # Bin ranges of chromosome indices 1 and 2 in the all-by-all heatmap.
        st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
        st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
        chrom1 = a["heatmap"][st:end, st:end]
        chrom12 = a["heatmap"][st:end, st2:end2]
        setExceptionHook()
        print "----> saving by chromosome heatmap"
        # NOTE(review): written to the same "-1M.hm" path as the all-by-all
        # heatmap above - confirm this is intended.
        TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm",
                                   resolution=1000000,
                                   includeTrans=True,
                                   countDiagonalReads="twice")

        b = h5dict(filename[1] + "-1M.hm")["1 1"]
        bb = h5dict(filename[1] + "-1M.hm")["1 2"]
        # Only the sum of differences is checked, not each element.
        assert (b - chrom1).sum() == 0
        print "Cis heatmap consistent"
        assert (bb - chrom12).sum() == 0
        print 'Trans heatmap consistent'
        print a["heatmap"][::10, ::10].sum()
        #assert a["heatmap"][::10, ::10].sum() == 21800
        print "Heatmap sum correct\n"

        #---------------------------------
        print "Testing allxall and by-chromosome heatmap counting diagonal once"
        TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
        Ta = h5dict(filename[1] + "-1M.hm")
        st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
        st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
        chrom1 = Ta["heatmap"][st:end, st:end]
        chrom12 = Ta["heatmap"][st:end, st2:end2]
        setExceptionHook()
        print "----> saving by chromosome heatmap"
        TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                                   resolution=1000000,
                                   includeTrans=True,
                                   countDiagonalReads="once")

        TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                        resolution=50000,
                                        countDiagonalReads="twice")
        TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                           resolution=5000,
                                           chromosomes=[14],
                                           countDiagonalReads="twice")

        Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
        Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
        assert ((Tb - chrom1) == 0).all()
        assert ((Tbb - chrom12) == 0).all()
        # "twice" must equal "once" with the main diagonal added once more.
        assert ((Tb + np.diag(np.diag(Tb))) == b).all()
        print "Diagonal counting methods are consistent\n"

        # Double the diagonal of the "once" map and compare it with the
        # 50kb map (total sum, then after coarsegrain with factor 20).
        newchrom1 = chrom1.copy()
        for i in xrange(len(newchrom1)):
            newchrom1[i, i] = 2 * newchrom1[i, i]
        Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
        assert np.abs(Tb.sum() - newchrom1.sum()) < 1
        assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

        #------------------------------
        print "Testing updateGenome method"
        from mirnylib.genome import Genome
        # 1 marks the chromosome indices expected to survive updateGenome.
        removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
        #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
        # Expected read count: both ends on kept chromosomes, or first end on
        # a kept chromosome and second end equal to -1.
        t = ((removeChromIDs[TR.chrms1] == 1) *
             (removeChromIDs[TR.chrms2] == 1)).sum() + (
                 (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
        newGenome = Genome(genomePath=genomeFolder,
                           readChrms=["2", "3", "4", "5", "X"])
        TR.updateGenome(newGenome)
        assert TR.N == t

        # Result is not used further within this function.
        a = h5dict(filename[1] + "-1M.hm")["heatmap"]
def plotCorrelationAtDifferentBinning():
    """Plot Spearman correlation between datasets at different bin sizes.

    Note the ``cache`` and ``create`` flags below, which control building of
    the cached sub-sampled datasets and of the binned heatmaps.
    Supplementary paper figure.

    For bin sizes of 1..10 Mb the "HindIII", "NcoI" and "control" heatmaps
    are loaded and the Spearman correlation over valid trans bins is
    computed before and after ``removeZeros`` / ``fakeCis`` /
    ``restoreZeros``. Three curves are plotted. The function ends with a
    deliberate ``0 / 0`` and therefore always raises ZeroDivisionError after
    plotting.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is reconstructed from data flow - confirm.
    """

    sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    setExceptionHook()

    cache = False
    create = False

    if create == True:
        if cache == True:
            # cache == True means "build the cached datasets from the
            # refined fragment files"; the else-branch loads them.
            #-------------------standard version code-----------------
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../ErezPaperData/hg18/GM-HindIII-hg18_refined.frag")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../ErezPaperData/hg18/GM-HindIII-hg18"\
                     "_refined.frag")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")

            #----------------------cross-check code----------------
            # FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                             override=False, inMemory=True)
            # FR.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
            #
            # FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                              override=False, inMemory=True)
            # FR3.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
            #
            # FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                              override=False, inMemory=True)
            # FR2.load("../../../ErezPaperData/hg18/G"\
            #          "M-HindIII-hg18_refined.frag")
            #-------end cross-check code ---------------------------------

            #--------Filter only trans DS reads-----------------
            FR.maskFilter(FR.DS * (FR.chrms1 != FR.chrms2))
            FR2.maskFilter(FR2.DS * (FR2.chrms1 != FR2.chrms2))
            FR3.maskFilter(FR3.DS * (FR3.chrms1 != FR3.chrms2))

            #Now create two halves of one dataset and down-sample second dataset
            #----------------------standard version code--------
            # FR and FR3 hold the same HindIII data; one random array splits
            # them into complementary halves. FR2 (NcoI) keeps each read
            # with probability "fraction".
            fraction = 0.5 * len(FR.DS) / float(len(FR2.DS))

            rarray = numpy.random.random(len(FR.DS))
            mask1 = rarray < 0.5
            mask3 = rarray >= 0.5
            mask2 = numpy.random.random(len(FR2.DS)) < fraction

            #-------------------- cross-check code---------
            #fraction = 0.5 * len(FR2.DS) / float(len(FR.DS))

            #rarray = numpy.random.random(len(FR.DS))
            #mask1 = rarray < fraction
            #mask3 = (rarray > fraction) * (rarray < fraction * 2)
            #mask2 = numpy.random.random(len(FR2.DS)) > 0.5
            #-----------------------------------------

            FR.maskFilter(mask1)
            FR2.maskFilter(mask2)
            FR3.maskFilter(mask3)

            FR.save("../../../tcc/working/cache1")
            FR2.save("../../../tcc/working/cache2")
            FR3.save("../../../tcc/working/cache3")
        else:
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../tcc/working/cache1")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../tcc/working/cache3")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../tcc/working/cache2")

        # One heatmap per dataset and bin size (size is in Mb).
        for size in sizes:
            FR.saveHeatmap("../../../tcc/working/HindIII_%d.hm" % size,
                           size * 1000000)
            FR2.saveHeatmap("../../../tcc/working/NcoI_%d.hm" % size,
                            size * 1000000)
            FR3.saveHeatmap("../../../tcc/working/control_%d.hm" % size,
                            size * 1000000)

    # p1: raw HindIII vs NcoI; p2: corrected HindIII vs NcoI;
    # p3: corrected HindIII vs control; p4: raw HindIII vs control
    # (p4 is collected but not plotted below).
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    evs = []
    for size in sizes:

        BD = binnedDataAnalysis(size * 1000000, "../../../data/hg18")
        BD.simpleLoad("../../../tcc/working/HindIII_%d.hm" % size, "HindIII")
        BD.simpleLoad("../../../tcc/working/NcoI_%d.hm" % size, "NcoI")
        BD.simpleLoad("../../../tcc/working/control_%d.hm" % size, "control")
        BD.removeDiagonal()
        BD.removePoorRegions(cutoff=2)
        BD.removeCis()

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]

        # Bins with non-zero column sums in both HindIII and NcoI, restricted
        # to pairs of bins on different chromosomes.
        mask = (numpy.sum(
            data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
        validMask = mask[:, None] * mask[None, :]
        transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
        cormask = transmask * validMask

        c1 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c4 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
        p4.append(c4)
        p1.append(c1)

        # Trailing comma: the "corrected" values are printed on this line.
        print "size\t%d\traw:" % size, c1,
        BD.removeZeros()
        BD.fakeCis()  # does iterative correction as well
        BD.restoreZeros(value=0)

        # Same cormask is reused on the corrected matrices.
        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        c2 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c3 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
            print evs

        p3.append(c3)
        p2.append(c2)

        print "\tcorrected:", c2, "\tcontrol", c3

    plt.plot(sizes, p1, label="Raw data, between enzymes")
    plt.plot(sizes, p2, label="Iteratively corrected, between")
    plt.plot(sizes, p3, label="IC, within")
    plt.xlabel("Bin size, MB")
    plt.xticks(range(1, 11))
    plt.ylabel("Spearman correlation coefficient")
    plt.legend()
    niceShow()

    setExceptionHook()
    # Deliberately raises ZeroDivisionError; presumably to trigger the
    # exception hook installed above for interactive inspection - confirm.
    0 / 0