def coverage(s1, s2, TR):
    """Scatter-plot per-bin coverage computed from two weight vectors (s1 vs s2),
    colored by GC content.

    NOTE(review): relies on a module-level `resolution` and a pickled "GC1M"
    file; written for Python 2 (print statement, integer division of fragment IDs).
    """
    genome = Genome()
    genome.createMapping(resolution)
    # Decode packed fragment IDs (chrom * fragIDmult + position) into genome-wide bin labels.
    label = genome.chromosomeStarts[TR.ufragments / TR.fragIDmult - 1] + (
        TR.ufragments % TR.fragIDmult) / resolution
    counts = np.bincount(label, weights=s1)
    counts2 = np.bincount(label, weights=s2)
    data = cPickle.load(open("GC1M", 'rb'))  # per-chromosome GC track, pickled
    eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
    inds = np.argsort(counts)
    # Drop the 2% of bins with the lowest s1 coverage.
    mask = inds[int(0.02 * len(inds)):]
    for chrom in range(1, 24):  # chromosomes 1..23 -- assumes a human-like genome
        eigenvector[genome.chromosomeStarts[chrom - 1]:
                    genome.chromosomeStarts[chrom - 1] + len(data[chrom - 1])] = data[chrom - 1]
    eigenvector[eigenvector < 35] = 35  # clip low GC values for the color scale
    plt.scatter(counts[mask], counts2[mask], c=eigenvector[mask], s=6,
                linewidth=0)
    print stats.spearmanr(counts[mask], counts2[mask])
    plt.xlabel("Coverage from all reads")
    plt.xticks([0, 5000, 10000, 15000])
    plt.ylabel("Coverage from RBs")
    b = plt.colorbar()
    b.ax.set_xlabel("GC content")
def doSaddle(filename, eig, gen):
    """Compute a 5x5 saddle matrix per chromosome from a cooler file.

    Bins each chromosome's observed/expected cis map by quintiles of the
    `eig` track and accumulates mean contact enrichment per quintile pair.
    Returns a list of 5x5 float arrays, one per chromosome (zeros when a
    chromosome has 5 or fewer usable bins).
    """
    c = cooler.Cooler(filename)
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    gen.setResolution(getResolution(filename))
    saddles = []
    for chrom in range(gen.chrmCount):
        saddle = np.zeros((5, 5), dtype=float)
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = c.matrix(balance=False).fetch(gen.idx2label[chrom])
        cur = observedOverExpected(cur)
        # Keep only bins with nonzero coverage.
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            for i in range(5):
                for j in range(5):
                    # NOTE(review): strict inequalities exclude values equal to
                    # the percentile boundaries (e.g. the minimum never enters
                    # the first quintile) -- confirm this is intended.
                    G1, G2 = np.percentile(GC, [20 * i, 20 * i + 20])
                    mask1 = (GC > G1) * (GC < G2)
                    G1, G2 = np.percentile(GC, [20 * j, 20 * j + 20])
                    mask2 = (GC > G1) * (GC < G2)
                    saddle[i, j] += cur[np.ix_(mask1, mask2)].mean()
        saddles.append(saddle)
    return saddles
def doSaddleError(filename, eig, gen, correct=False):
    """Compute a genome-wide 5x5 saddle plus 100 bootstrap replicates.

    Like doSaddle, but reads a dense heatmap from an h5dict, optionally
    applies iterative correction, accumulates a single saddle over all
    chromosomes, and estimates errors by resampling each quintile-pair's
    pixels with replacement 100 times.

    Returns (saddle, permutted): a 5x5 array and a list of 100 5x5 arrays.
    """
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    cur = 0
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    if eig == "GC":
        # Use concatenated per-chromosome GC content as the binning track.
        eig = np.concatenate(gen.GCBin)
    saddles = []  # NOTE(review): unused in this function
    permutted = []
    saddle = np.zeros((5, 5), dtype=float)
    for i in range(100):
        permutted.append(np.zeros((5, 5), dtype=float))
    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = data[st:end, st:end]
        cur = observedOverExpected(cur)
        # Keep only bins with nonzero coverage.
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            for i in range(5):
                for j in range(5):
                    G1, G2 = np.percentile(GC, [20 * i, 20 * i + 20])
                    mask1 = (GC > G1) * (GC < G2)
                    G1, G2 = np.percentile(GC, [20 * j, 20 * j + 20])
                    mask2 = (GC > G1) * (GC < G2)
                    addition = cur[np.ix_(mask1, mask2)]
                    addition = np.reshape(addition, (-1))
                    for k in range(100):
                        # Bootstrap: resample the pixels with replacement.
                        resampled = np.random.choice(addition, len(addition),
                                                     replace=True)
                        permutted[k][i, j] += resampled.mean()
                    saddle[i, j] += addition.mean()
    return saddle, permutted
def coverage(s1, s2, TR):
    """Scatter-plot per-bin coverage from two weight vectors, colored by GC.

    NOTE(review): duplicate definition -- an identical `coverage` appears
    earlier in this file; the later definition wins at import time.
    Relies on a module-level `resolution` and a pickled "GC1M" file;
    Python 2 code (print statement, integer division).
    """
    genome = Genome()
    genome.createMapping(resolution)
    # Decode packed fragment IDs into genome-wide bin labels.
    label = genome.chromosomeStarts[TR.ufragments / TR.fragIDmult - 1] + (
        TR.ufragments % TR.fragIDmult) / resolution
    counts = np.bincount(label, weights=s1)
    counts2 = np.bincount(label, weights=s2)
    data = cPickle.load(open("GC1M", 'rb'))
    eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
    inds = np.argsort(counts)
    # Drop the 2% of bins with the lowest s1 coverage.
    mask = inds[int(0.02 * len(inds)):]
    for chrom in range(1, 24):  # chromosomes 1..23 -- assumes a human-like genome
        eigenvector[genome.chromosomeStarts[chrom - 1]:
                    genome.chromosomeStarts[chrom - 1] + len(data[chrom - 1])] = data[chrom - 1]
    eigenvector[eigenvector < 35] = 35  # clip low GC values for the color scale
    plt.scatter(counts[mask], counts2[mask], c=eigenvector[mask], s=6,
                linewidth=0)
    print stats.spearmanr(counts[mask], counts2[mask])
    plt.xlabel("Coverage from all reads")
    plt.xticks([0, 5000, 10000, 15000])
    plt.ylabel("Coverage from RBs")
    b = plt.colorbar()
    b.ax.set_xlabel("GC content")
def doEigenvector(filename, genome):
    """Return a genome-wide 1-D track for compartment analysis.

    If filename == "GC", returns the concatenated per-chromosome GC content
    at 1 Mb resolution.  Otherwise runs the standard binnedData filtering
    pipeline on the heatmap and returns the first eigenvector, with zeros
    restored at bins that were masked out.
    """
    if filename == "GC":
        gen = Genome("/home/magus/HiC2011/data/" + genome, readChrms=["#", "X"])
        gen.setResolution(1000000)
        GC = np.concatenate(gen.GCBin)
        return GC
    resolution = getResolution(filename)
    BD = binnedData.binnedData(resolution,
                               "/home/magus/HiC2011/data/" + genome, ["#", "X"])
    BD.simpleLoad(filename, "bla")
    # Standard filtering sequence before the eigen decomposition; order matters.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    # fakeCis: presumably fills cis regions with surrogate data so the
    # eigenvector is driven by trans contacts -- confirm against binnedData docs.
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=2)
    BD.restoreZeros(value=0)
    return BD.EigDict["bla"][0]
def byChrEig(filename, genome, chromosomes="all", resolution="auto", byArm=True, doSmooth=False):
    """Compute per-chromosome (optionally per-arm) eigenvectors from a
    by-chromosome heatmap h5dict.

    Returns a list of 1-D arrays, one per requested chromosome that has a
    cis map ("i i" key) in the file.
    """
    from mirnylib.genome import Genome
    if resolution == "auto":
        resolution = getResolution(filename)
    if type(genome) == str:
        genome = Genome(genome)
    assert isinstance(genome, Genome)
    genome.setResolution(resolution)
    mydict = mirnylib.h5dict.h5dict(filename)
    if chromosomes == "all":
        chromosomes = list(range(genome.chrmCount))
    # Keep only chromosomes whose cis map is actually present in the file.
    chromosomes = [i for i in chromosomes if "{0} {0}".format(i) in mydict]
    if len(chromosomes) == 0:
        raise ValueError("No chromosomes left. Check h5dict file.")
    result = []
    for chrom in chromosomes:
        data = mydict["{0} {0}".format(chrom)]
        if not byArm:
            result.append(
                completeEig(data, genome.GCBin[chrom], doSmooth=doSmooth))
        else:
            GC = genome.GCBin[chrom]
            result.append(np.zeros(len(GC), dtype=float))
            # Split at the centromere bin and compute an eigenvector per arm.
            # NOTE(review): "/" assumes Python 2 integer division for the bin index.
            cent = genome.cntrMids[chrom] / resolution
            result[-1][:cent] = completeEig(data[:cent, :cent],
                                            genome.GCBin[chrom][:cent],
                                            doSmooth=doSmooth)
            result[-1][cent:] = completeEig(data[cent:, cent:],
                                            genome.GCBin[chrom][cent:],
                                            doSmooth=doSmooth)
    return result
def __init__(self, genome, resolution, storageFile="inMemory", mode="w"):
    """
    Initializes the high-resolution Hi-C data storage.

    Parameters
    ----------
    genome : folder or Genome object
        matching Genome object or folder to load it from
    resolution : int
        Resolution (number of bp per bin)
    storageFile : str (optional)
        File to store the h5dict. File will be created.
        By default stores in memory
    mode : "w", "w-" "r+" or "a", optional
        Access mode to h5dict (see h5dict manual)
    """
    # The sentinel filename "inMemory" selects the in-memory h5dict backend.
    inMemory = (storageFile == "inMemory")
    self._h5dict = h5dict(storageFile, mode=mode, in_memory=inMemory)
    if type(genome) == str:
        genome = Genome(genome, readChrms=["#", "X"])
    assert isinstance(genome, Genome)
    self.genome = genome
    self.resolution = resolution
    self.genome.setResolution(resolution)
    if self.genome.numBins < 7000:
        print "Total number of bins in the genome is just %d" % self.genome.numBins
        # NOTE(review): the two string fragments concatenate without a space
        # ("provides" + "more") -- message reads oddly but is original behavior.
        warnings.warn(
            "For low-resolution analysis use binnedData, as it provides"
            "more analysis tools")
    M = self.genome.chrmCount
    # Cis keys are (i, i); trans keys are the strict upper triangle (j > i).
    self.cisKeys = [(i, i) for i in xrange(M)]
    self.transKeys = [(i, j) for i in range(M) for j in range(M) if j > i]
    self.allKeys = self.cisKeys + self.transKeys
    self.data = {}
    self._initChromosomes()
def getGenome():
    """Build and return the Caulobacter genome object used by this script."""
    caul_genome = Genome("../data/caul", chrmFileTemplate="%s.fa", readChrms=[])
    return caul_genome
from mirnylib.h5dict import h5dict
import matplotlib.pyplot as plt
import os
import sys
import numpy as np
from mirnylib.systemutils import setExceptionHook
from mirnylib.numutils import coarsegrain
from mirnylib.genome import Genome

setExceptionHook()

workingGenome = "hg19"
genomeFolder = sys.argv[1]
if not os.path.exists(genomeFolder):
    raise Exception("Please provide hg19 Genome folder in the code or as a first argument")
mygenome = Genome(genomeFolder, readChrms=["#", "X"])


def source(ID):
    # determines path to the parsed file by ID
    return os.path.join("%s-%s.hdf5" % (ID, workingGenome))


# Simulate N random read pairs: half cis (same chromosome), half with a
# re-drawn second chromosome; positions drawn in the middle 80% of each
# chromosome, with log-uniform offsets of either sign for the mate.
N = 2000000
chrms1 = np.random.randint(0, 22, N)
chrms2 = chrms1.copy()
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0, 22, mask.sum())
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]),
                dtype=int)
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * (
    2 * (np.random.random(N) > 0.5) - 1)
pos2 = np.array(pos1 + offset1, dtype=int)
strands1 = np.random.random(N) > 0.5
def parseInputData(self, dictLike, commandArgs, **kwargs):
    """
    Load parsed reads from an h5dict and apply standard read-level filters.

    Parameters
    ----------
    dictLike : str
        Path to an h5dict file with parsed reads (chrms1/2, cuts1/2,
        strands1/2, rsites, uprsites, downrsites, ...).
    commandArgs : NameSpace
        A NameSpace object defined by argparse.  Flags used here:
        `sameFragments` (drop same-fragment reads and extra dangling ends)
        and `RandomBreaks` (drop reads that look like random breaks).

    Side effects
    ------------
    Populates per-read arrays on self, records counts in self.metadata,
    and finishes by calling self.maskFilter(mask).
    """
    ## Necessary Modules
    import numexpr
    if not os.path.exists(dictLike):
        raise IOError('File not found: %s' % dictLike)
    dictLike = h5dict(dictLike, 'r')
    self.chrms1 = dictLike['chrms1']
    self.chrms2 = dictLike['chrms2']
    self.cuts1 = dictLike['cuts1']
    self.cuts2 = dictLike['cuts2']
    self.strands1 = dictLike['strands1']
    self.strands2 = dictLike['strands2']
    # Distance from each cut site to its restriction site.
    self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
    self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
    # Fragment midpoints and lengths from up/downstream restriction sites.
    self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) / 2
    self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) / 2
    self.fraglens1 = np.abs(
        (dictLike['uprsites1'] - dictLike['downrsites1']))
    self.fraglens2 = np.abs(
        (dictLike['uprsites2'] - dictLike['downrsites2']))
    # Globally unique fragment IDs: midpoint + chromosome * fragIDmult.
    self.fragids1 = self.mids1 + np.array(self.chrms1,
                                          dtype='int64') * self.fragIDmult
    self.fragids2 = self.mids2 + np.array(self.chrms2,
                                          dtype='int64') * self.fragIDmult
    distances = np.abs(self.mids1 - self.mids2)
    distances[self.chrms1 != self.chrms2] = -1  # -1 marks trans pairs
    self.distances = distances  # Distances between restriction fragments
    del distances

    # Total Reads
    self.N = len(self.chrms1)
    self.metadata["100_TotalReads"] = self.N

    try:
        dictLike['misc']['genome']['idx2label']
        self.updateGenome(
            self.genome,
            oldGenome=dictLike["misc"]["genome"]["idx2label"],
            putMetadata=True)
    except KeyError:
        # Older files carry no genome metadata; assume the current genome.
        assumedGenome = Genome(self.genome.genomePath)
        self.updateGenome(self.genome, oldGenome=assumedGenome,
                          putMetadata=True)

    # Double-sided reads map to a valid chromosome on both sides.
    DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
    self.metadata["200_totalDSReads"] = DSmask.sum()
    self.metadata["201_DS+SS"] = len(DSmask)
    self.metadata["202_SSReadsRemoved"] = len(DSmask) - DSmask.sum()
    mask = DSmask

    ## Information based on restriction fragments
    sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                                 ["fragids1", "fragids2"]) * DSmask
    cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
    s1 = self.strands1[sameFragMask]
    s2 = self.strands2[sameFragMask]
    SSDE = (s1 != s2)
    SS = SSDE * (cutDifs == s2)  # self-circles
    Dangling = SSDE & (~SS)
    SS_N = SS.sum()
    SSDE_N = SSDE.sum()
    sameFrag_N = sameFragMask.sum()
    # Signed distance between facing cut sites.
    dist = self.evaluate(
        "a = - cuts1 * (2 * strands1 -1) - "
        "cuts2 * (2 * strands2 - 1)",
        ["cuts1", "cuts2", "strands1", "strands2"])
    Dangling_L = dist[sameFragMask][Dangling]
    # Estimate the library insert size as the 95th percentile of
    # dangling-end lengths.
    library_L = int(np.ceil((np.percentile(Dangling_L, 95))))
    self.maximumMoleculeLength = library_L

    readsMolecules = self.evaluate(
        "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) & (dist >=0) &"
        " (dist <= maximumMoleculeLength)')",
        internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
        externalVariables={"dist": dist},
        constants={
            "maximumMoleculeLength": self.maximumMoleculeLength,
            "numexpr": numexpr
        })

    if commandArgs.sameFragments:
        # BUGFIX: unary minus on a boolean array was a deprecated alias for
        # logical not and raises TypeError in NumPy >= 1.13; use `~` instead.
        mask *= (~sameFragMask)
        noSameFrag = mask.sum()
        self.metadata["210_sameFragmentReadsRemoved"] = sameFrag_N
        self.metadata["212_Self-Circles"] = SS_N
        self.metadata["214_DandlingEnds"] = SSDE_N - SS_N
        self.metadata["216_error"] = sameFrag_N - SSDE_N
        # Remove likely un-ligated molecules ("extra dangling ends").
        mask *= (readsMolecules == False)
        extraDE = mask.sum()
        self.metadata[
            "220_extraDandlingEndsRemoved"] = -extraDE + noSameFrag
    if commandArgs.RandomBreaks:
        ini_N = extraDE
        mask *= ((self.dists1 + self.dists2) <= library_L)
        rb_N = ini_N - mask.sum()
        self.metadata["330_removeRandomBreaks"] = rb_N
    if mask.sum() == 0:
        raise Exception(
            'No reads left after filtering. Please, check the input data')
    del DSmask, sameFragMask
    del dist, readsMolecules
    # NOTE(review): records the total read count, not mask.sum() -- confirm.
    self.metadata["300_ValidPairs"] = self.N
    self.maskFilter(mask)
genomeFolder = "../../../data/hg18"
readChrms = [
    "#",  # read all numbered chromosomes
    "X"   # add X chromosome
]

# Validate inputs before doing any work.
for inDataset in inDatasets.values():
    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)
BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)
def parseInputData(self, dictLike, **kwargs):
    """
    Load parsed reads from an h5dict and remove single-sided reads,
    same-fragment reads (self-circles, dangling ends, unknown mechanism)
    and extra dangling ends.

    Side effects: populates per-read arrays on self, records counts in
    self.metadata, and finishes by calling self.maskFilter(mask).
    """
    import numexpr
    if not os.path.exists(dictLike):
        raise IOError('File not found: %s' % dictLike)
    dictLike = h5dict(dictLike, 'r')
    self.chrms1 = dictLike['chrms1']
    self.chrms2 = dictLike['chrms2']
    self.cuts1 = dictLike['cuts1']
    self.cuts2 = dictLike['cuts2']
    self.strands1 = dictLike['strands1']
    self.strands2 = dictLike['strands2']
    # Distance from each cut site to its restriction site.
    self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
    self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
    # Fragment midpoints and lengths from up/downstream restriction sites.
    self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) / 2
    self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) / 2
    self.fraglens1 = np.abs(
        (dictLike['uprsites1'] - dictLike['downrsites1']))
    self.fraglens2 = np.abs(
        (dictLike['uprsites2'] - dictLike['downrsites2']))
    # Globally unique fragment IDs: midpoint + chromosome * fragIDmult.
    self.fragids1 = self.mids1 + np.array(self.chrms1,
                                          dtype='int64') * self.fragIDmult
    self.fragids2 = self.mids2 + np.array(self.chrms2,
                                          dtype='int64') * self.fragIDmult
    distances = np.abs(self.mids1 - self.mids2)
    distances[self.chrms1 != self.chrms2] = -1  # -1 marks trans pairs
    self.distances = distances  # Distances between restriction fragments
    del distances
    self.N = len(self.chrms1)

    try:
        dictLike['misc']['genome']['idx2label']
        self.updateGenome(self.genome,
                          oldGenome=dictLike["misc"]["genome"]["idx2label"])
    except KeyError:
        # Older files carry no genome metadata; assume the current genome.
        assumedGenome = Genome(self.genome.genomePath)
        self.updateGenome(self.genome, oldGenome=assumedGenome)

    # Discard dangling ends and self-circles
    DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
    self.metadata['100_NormalPairs'] = DSmask.sum()

    sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                                 ["fragids1", "fragids2"]) * DSmask
    cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
    s1 = self.strands1[sameFragMask]
    s2 = self.strands2[sameFragMask]
    SSDE = (s1 != s2)
    SS = SSDE * (cutDifs == s2)  # self-circles
    SS_N = SS.sum()
    SSDE_N = SSDE.sum()
    sameFrag_N = sameFragMask.sum()
    self.metadata['120_SameFragmentReads'] = sameFrag_N
    self.metadata['122_SelfLigationReads'] = SS_N
    self.metadata['124_DanglingReads'] = SSDE_N - SS_N
    self.metadata['126_UnknownMechanism'] = sameFrag_N - SSDE_N

    # BUGFIX: unary minus on a boolean array was a deprecated alias for
    # logical not and raises TypeError in NumPy >= 1.13; use `~` instead.
    mask = DSmask * (~sameFragMask)
    del DSmask, sameFragMask
    noSameFrag = mask.sum()

    # distance between sites facing each other
    dist = self.evaluate("a = numexpr.evaluate('- cuts1 * (2 * strands1 -1) - "
                         "cuts2 * (2 * strands2 - 1)')",
                         ["cuts1", "cuts2", "strands1", "strands2"],
                         constants={"numexpr": numexpr})
    readsMolecules = self.evaluate(
        "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) & (dist >=0) &"
        " (dist <= maximumMoleculeLength)')",
        internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
        externalVariables={"dist": dist},
        constants={"maximumMoleculeLength": self.maximumMoleculeLength,
                   "numexpr": numexpr})

    # Remove likely un-ligated molecules ("extra dangling ends").
    mask *= (readsMolecules == False)
    extraDE = mask.sum()
    self.metadata['210_ExtraDanglingReads'] = -extraDE + noSameFrag
    if mask.sum() == 0:
        raise Exception('No reads left after filtering. Please, check the input data')
    del dist, readsMolecules
    self.maskFilter(mask)
from mirnylib.genome import Genome
import pandas as pd
from hiclib import hicShared
from mirnylib.h5dict import h5dict
from mirnylib.numutils import coarsegrain, observedOverExpected, completeIC
import joblib
import pickle
import cooler
from mirnylib.numutils import zoomArray
from hiclib.hicShared import getResolution
from hiclib import binnedData
from mirnylib.systemutils import setExceptionHook

# defining genomes used here
mygenM = Genome("../../data/mm9", readChrms=["#", "X"])
mygenH = Genome("../../data/hg19", readChrms=["#", "X"])
genomDict = {"mm9": mygenM, "hg19": mygenH}
mem = joblib.Memory(".")  # joblib disk cache in the current directory


def getFrags():
    """
    A method that calculates a set of restriction fragments covered in all
    hg19, or in all mm9 datasets.
    Used for the correct calculation of scalings.
    """
    # Representative refined-fragment files for each genome.
    mouse = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/mm9/oocyte_combined_refined.frag"
    human = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/hg19/K562_combined_refined.frag"
and might be in fact pretty fast. General recommendation: if you have 16+GB of RAM, and .sra (.fastq.gz) files were less than 30GB, then you should be fine with parsing things in memory. """
from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome
import numpy as np
import os
import sys
#from defineGenome import getGenome

genomeName = "galGal4_1MB"
# Chicken genome (galGal4); chromosomes read from chr*.fa files with a gap file.
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/galGal4/" + genomeName,
                   readChrms=[], chrmFileTemplate="chr%s.fa",
                   gapFile="galGal4.gap.txt")


def getGenome(workingGenome):
    # Only the single genome defined above is supported.
    # NOTE(review): bare `raise` outside an except block raises RuntimeError.
    if workingGenome != genomeName:
        raise
    return genome_db


setExceptionHook()


def ensure(f):
    # Make sure the directory for file path f exists; return f unchanged.
    d = os.path.dirname(f)
    if os.path.isdir(d):
        return f
    else:
        try:
            os.makedirs(d)
        except:
            raise ValueError("Cannot create directory")
        return f
overhead, and therefore having 23*11 chromosome pairs would slow it down a bit. Note the trick of converting a global heatmap map to a smaller map. It tests only iterative correction now. """
from hiclib.highResBinnedData import HiResHiC
from mirnylib.genome import Genome
from hiclib.binnedData import binnedData
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# Five-chromosome test genome; folder given as the first CLI argument.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# High-resolution (per-chromosome) pipeline: load, filter, iteratively correct.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# Whole-genome binnedData pipeline on the same data, truncated to this
# (smaller) genome's bin count.
b = binnedData(1000000, genome)
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]
b.simpleLoad(data, "data")
b.removeDiagonal()
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file

    End-to-end test: parse/merge raw files (when create is True), refine the
    merged dataset with the standard filters, check metadata against a saved
    sample, then exercise the heatmap-saving methods and verify consistency.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",
                            maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")
        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])
    # Refinement always runs from the merged file on disk.
    TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII",
                    mode="w", inMemory=True)
    print "\nTesting chunking during all tests"
    TR.chunksize = 30000
    #because we do many operations, we disable autoFlush here
    TR.load(filename[1] + "_merged.frag")
    print "\nTesting Rsite filter"
    TR.filterRsiteStart(offset=5)
    #assert len(TR.DS) == 832110
    print "\nTesting duplicate filter"
    TR.filterDuplicates(chunkSize=30000)
    #assert len(TR.DS) == 830275
    print "\nTesting small/large and extreme fragment filter"
    TR.filterLarge()
    #assert len(TR.DS) == 825442
    TR.filterExtreme(cutH=0.005, cutL=0)
    TR.writeFilteringStats()
    #assert len(TR.DS) == 803845
    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle
    # Compare the resulting metadata against a pickled reference.
    stop = False
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_--------------
Inconsistent metadata: see above
----------------------------------------""")
        raise ValueError("Inconsistent Metadata")
    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm", resolution=1000000,
                               includeTrans=True, countDiagonalReads="twice")
    # The by-chromosome maps must agree with the corresponding slices of
    # the all-by-all heatmap.
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"
    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                               resolution=1000000, includeTrans=True,
                               countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000,
                                    countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000, chromosomes=[14],
                                       countDiagonalReads="twice")
    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # Adding the diagonal once converts "once" counting into "twice".
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    # High-res map coarse-grained 20x should reproduce the 1 Mb map.
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500
    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Expected surviving read count after restricting to kept chromosomes.
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + (
        (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
    c2 = hd["chrms2"]
    p1 = hd["cuts1"]
    p2 = hd["cuts2"]
    # Fraction of cis read pairs separated by more than 20 kb.
    mask = c1 == c2
    cis = mask.sum()
    more20kb = (np.abs(p1[mask] - p2[mask]) > 20000).sum()
    return more20kb / cis


dfs = []
for genome in ["mm9", 'hg19']:
    genomeObject = Genome("/home/magus/HiC2011/data/{0}".format(genome),
                          readChrms=["#", "X"])
    # Keep 1 Mb coolers matching the datasets of interest, stripped of the
    # resolution suffix.
    filenames = [i.replace(".1000000.cool", "") for i in os.listdir(genome)
                 if ".1000000.cool" in i and ("DpnII" in i or "combined" in i
                 or "sperm" in i.lower() or "ad" in i.lower())]
    sampleDict = pd.read_csv("samples.csv", index_col=0)
    #nsnDict = sampleDict["Stage"]
    ### Bug fix to read files only contained within sampleDict ###
    # RUN THIS THE FIRST TIME only
    #fnames = [];
    #for f in list(sampleDict.index):
    #    thisFile = [file for file in filenames if str(f) in file]
    #    if len(thisFile)>0:
    #        fnames.append(thisFile[0])
    #filenames = fnames
def get_chrom_arms(c, gen_name):
    # Load the named genome from the shared genomes folder.
    # (Definition continues beyond this view.)
    genome = Genome('/net/levsha/share/lab/genomes/' + gen_name)
try:
    import numpy as np
except:
    # Fall back to the system-wide site-packages when numpy is missing
    # from the active environment.
    sys.path = ["/usr/lib64/python2.7/site-packages"] + sys.path
    import numpy as np
    print "Numpy inported!"
from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome
import os
#from defineGenome import getGenome

genomeName = "mm10"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])
data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

print "Loading HiCdataset"
# Open the refined dataset read-only.
TR = HiCdataset(data_folder + fdataset_fname, enzymeName="HindIII",
                mode='r', genome=genome_db)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print "Saving heatmap"
from hiclib.hicShared import byChrEig
from mirnylib.genome import Genome
import matplotlib.pyplot as plt
from mirnylib.systemutils import setExceptionHook
from mirnylib.plotting import nicePlot, maximumContrastList

setExceptionHook()

gen = Genome('../../../../hg19', readChrms=["#", "X"])
mychroms = [0, 2, 5, 13, 20]  # chromosome indices to plot (0-based)
# Per-chromosome eigenvectors from the by-chromosome test heatmap.
eigs = byChrEig("../fragmentHiC/test-1M-byChr.hm", gen, chromosomes=mychroms)
# Eigenvector vs GC content, one color per chromosome.
for j, chrom in enumerate(mychroms):
    plt.scatter(eigs[j], gen.GCBin[chrom], color=maximumContrastList[j],
                label="Chr {0}".format(chrom + 1))
plt.xlabel("eigenvector")
plt.ylabel("GC content")
nicePlot()
converter.create_agp_dict()

import matplotlib
matplotlib.use('Agg')  # headless backend (cluster use)
import matplotlib.pyplot as plt
import os

sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/mESC")
from getIntraChrHeatmaps import get_chromosomes, extractResolutionFromFileName
sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/mESC")

# Contig-level and chromosome-level GalGal5 genome objects.
genome_db_contig = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
                          readChrms=[],
                          chrmFileTemplate="N%s.fa")
genome_db_chrmLevel = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
                             readChrms=[],
                             chrmFileTemplate="%s.fna")
# NOTE(review): the contig-level genome is immediately overwritten with the
# chromosome-level one, so only the chromosome-level genome is used -- confirm.
genome_db_contig = genome_db_chrmLevel

#hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filtered/ChEF-all-HindIII-100k.hm.IC"
#second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filtered/Blood-all-HindIII-100k.hm.IC"

########################WRITE YOUR HEATMAP HERE########################
hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.IC"
domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.gzipped_matrix/ChEF-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"
second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.IC"
    help='domains file to use for annotation,coud be several files separated with ","')
parser.add_argument(
    "--colors",
    default="",
    help='colors for domains,should be same number of colors as number of domains')
parser.add_argument("--out", default="heatmap_plots",
                    help='output dir for plots')
args = parser.parse_args()

# Choose the genome matching the assembly level of the heatmap.
if args.level == "contig":
    genome_db = Genome(
        "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
        readChrms=[],
        chrmFileTemplate="N%s.fa")
elif args.level == "chr":
    #genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    #                   readChrms=[],
    #                   chrmFileTemplate="%s.fna")
    # NOTE(review): the "chr" level currently loads mm9, not the chicken
    # assembly (the GalGal5 variant is commented out) -- confirm intended.
    genome_db = Genome('/mnt/storage/home/vsfishman/HiC/fasta/mm9',
                       readChrms=['#', 'X'])
hm_file = args.hmap
figure_path = args.out + "/" + hm_file.split("/")[-1]
if args.domains != "":
    domains = args.domains.split(",")
    colors = args.colors.split(",")
else:
    domains = []