def doSaddle(filename, eig, gen):
    """
    Compute one 5x5 "saddle" matrix per chromosome from a cooler file.

    For every chromosome the unbalanced contact matrix is fetched from the
    cooler, passed through ``observedOverExpected``, and stripped of bins
    whose column sum is not positive.  The surviving bins are then grouped
    into five quintiles of ``eig`` and the mean of the matrix over every
    pair of quintiles is stored in the corresponding cell.

    Parameters
    ----------
    filename : str
        Path handed to ``cooler.Cooler`` and to ``getResolution``.
    eig : array-like
        Genome-wide per-bin track, indexed with the genome's continuous bin
        coordinates (``chrmStartsBinCont`` / ``chrmEndsBinCont``).  It must
        support boolean-mask indexing.
    gen : str
        Genome folder name, appended to ``/home/magus/HiC2011/data/``.

    Returns
    -------
    list of numpy.ndarray
        One ``(5, 5)`` float array per chromosome.  Chromosomes with five or
        fewer usable bins keep an all-zero matrix.
    """
    c = cooler.Cooler(filename)
    # Keep the string argument `gen` intact; the Genome object gets its own name.
    genome = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    genome.setResolution(getResolution(filename))

    saddles = []
    for chrom in range(genome.chrmCount):
        saddle = np.zeros((5, 5), dtype=float)
        st = genome.chrmStartsBinCont[chrom]
        end = genome.chrmEndsBinCont[chrom]

        cur = c.matrix(balance=False).fetch(genome.idx2label[chrom])
        cur = observedOverExpected(cur)

        # Drop bins with no signal from both axes and from the track.
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask][:, mask]
        GC = eig[st:end][mask]

        if len(GC) > 5:
            # The quintile masks depend only on GC, so build them once per
            # chromosome instead of once per (i, j) cell.  The comparisons
            # are strict: values equal to a percentile edge fall in no bin.
            quintileMasks = []
            for q in range(5):
                low, high = np.percentile(GC, [20 * q, 20 * q + 20])
                quintileMasks.append((GC > low) * (GC < high))

            for i, mask1 in enumerate(quintileMasks):
                for j, mask2 in enumerate(quintileMasks):
                    saddle[i, j] += cur[np.ix_(mask1, mask2)].mean()

        # NOTE(review): the original indentation was lost; appending for every
        # chromosome (not only those passing the length check) is assumed
        # because `saddle` is allocated before the check -- confirm.
        saddles.append(saddle)
    return saddles
def doSaddleError(filename, eig, gen, correct=False):
    """
    Compute a genome-wide 5x5 saddle matrix plus 100 bootstrap replicates.

    The heatmap stored under the ``"heatmap"`` key of the h5dict is cut into
    per-chromosome blocks.  Each block is passed through
    ``observedOverExpected``, bins with a non-positive column sum are
    removed, and the remaining bins are split into five quintiles of
    ``eig``.  For each pair of quintiles the mean of the selected values is
    added to ``saddle``; the same values are resampled with replacement 100
    times and each resample mean is added to the matching replicate.

    Parameters
    ----------
    filename : str
        Path of the h5dict file; also handed to ``getResolution``.
    eig : array-like or "GC"
        Genome-wide per-bin track.  The literal string ``"GC"`` selects the
        concatenated ``GCBin`` of the genome.
    gen : str
        Genome folder name, appended to ``/home/magus/HiC2011/data/``.
    correct : bool, optional
        If true, the heatmap is passed through ``completeIC`` first.

    Returns
    -------
    saddle : numpy.ndarray
        ``(5, 5)`` array accumulated over all chromosomes.
    permutted : list of numpy.ndarray
        100 ``(5, 5)`` arrays, one per bootstrap replicate.
    """
    genome = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    genome.setResolution(getResolution(filename))

    # Check the type first: comparing an ndarray with a string is not a plain
    # boolean in NumPy and can make the `if` itself fail.
    if isinstance(eig, str) and eig == "GC":
        eig = np.concatenate(genome.GCBin)

    saddle = np.zeros((5, 5), dtype=float)
    permutted = [np.zeros((5, 5), dtype=float) for _ in range(100)]

    for chrom in range(genome.chrmCount):
        st = genome.chrmStartsBinCont[chrom]
        end = genome.chrmEndsBinCont[chrom]

        cur = observedOverExpected(data[st:end, st:end])

        # Drop bins with no signal from both axes and from the track.
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask][:, mask]
        GC = eig[st:end][mask]

        if len(GC) <= 5:
            continue

        # The quintile masks depend only on GC, so build them once per
        # chromosome.  The comparisons are strict: values equal to a
        # percentile edge fall in no bin.
        quintileMasks = []
        for q in range(5):
            low, high = np.percentile(GC, [20 * q, 20 * q + 20])
            quintileMasks.append((GC > low) * (GC < high))

        for i, mask1 in enumerate(quintileMasks):
            for j, mask2 in enumerate(quintileMasks):
                addition = np.reshape(cur[np.ix_(mask1, mask2)], (-1))
                # Bootstrap: resample the cell's values with replacement.
                for k in range(100):
                    resampled = np.random.choice(
                        addition, len(addition), replace=True)
                    permutted[k][i, j] += resampled.mean()
                saddle[i, j] += addition.mean()

    return saddle, permutted
def doEigenvector(filename, genome):
    """
    Return a genome-wide per-bin track for ``filename``.

    If ``filename`` is the literal string ``"GC"``, the concatenated
    ``GCBin`` of the genome at a resolution of 1000000 is returned.
    Otherwise the file is loaded into a ``binnedData`` object, run through
    the filtering steps below, and the first entry of the eigenvector set
    stored under the ``"bla"`` key is returned.

    Parameters
    ----------
    filename : str
        Path of the dataset to load, or ``"GC"``.
    genome : str
        Genome folder name, appended to ``/home/magus/HiC2011/data/``.
    """
    genomeFolder = "/home/magus/HiC2011/data/" + genome

    if filename == "GC":
        gcGenome = Genome(genomeFolder, readChrms=["#", "X"])
        gcGenome.setResolution(1000000)
        return np.concatenate(gcGenome.GCBin)

    binned = binnedData.binnedData(
        getResolution(filename), genomeFolder, ["#", "X"])
    binned.simpleLoad(filename, "bla")

    # Filtering steps, in the order the pipeline relies on.
    binned.removeDiagonal()
    binned.removeBySequencedCount(0.5)
    binned.removeCis()
    binned.truncTrans(high=0.0005)
    binned.removePoorRegions(cutoff=1)
    binned.fakeCis()
    binned.removeZeros()

    binned.doEig(numPCs=2)
    # Put the removed bins back (filled with 0) before reading the result.
    binned.restoreZeros(value=0)

    eigenvectors = binned.EigDict["bla"]
    return eigenvectors[0]
def byChrEig(filename, genome, chromosomes="all", resolution="auto",
             byArm=True, doSmooth=False):
    """
    Run ``completeEig`` on each chromosome's cis map stored in an h5dict.

    Cis maps are read from keys of the form ``"N N"``, where ``N`` is the
    chromosome index.

    Parameters
    ----------
    filename : str
        Path of the h5dict file.
    genome : str or Genome
        A ``Genome`` object, or a string passed to the ``Genome``
        constructor.
    chromosomes : "all" or sequence of int, optional
        Chromosome indices to process.  With ``"all"``, every chromosome of
        the genome that has a cis map in the file is used.
    resolution : "auto" or number, optional
        Bin size; ``"auto"`` asks ``getResolution(filename)``.
    byArm : bool, optional
        If true, ``completeEig`` is run separately on each side of the bin
        given by ``genome.cntrMids`` and the two results are joined.
    doSmooth : bool, optional
        Forwarded to ``completeEig``.

    Returns
    -------
    list
        One ``completeEig`` result per processed chromosome (a float array
        of length ``len(genome.GCBin[chrom])`` when ``byArm`` is true).

    Raises
    ------
    ValueError
        If no chromosomes remain to be processed.
    """
    from mirnylib.genome import Genome

    if resolution == "auto":
        resolution = getResolution(filename)

    if isinstance(genome, str):
        genome = Genome(genome)
    assert isinstance(genome, Genome)
    genome.setResolution(resolution)

    mydict = mirnylib.h5dict.h5dict(filename)

    # Check the type first so that array-like input is never compared
    # element-wise against the string.
    if isinstance(chromosomes, str) and chromosomes == "all":
        # NOTE(review): the original indentation was lost; the availability
        # filter is assumed to apply only to the "all" case -- confirm.
        chromosomes = [i for i in range(genome.chrmCount)
                       if "{0} {0}".format(i) in mydict]
    if len(chromosomes) == 0:
        raise ValueError("No chromosomes left. Check h5dict file.")

    result = []
    for chrom in chromosomes:
        data = mydict["{0} {0}".format(chrom)]
        GC = genome.GCBin[chrom]

        if not byArm:
            result.append(completeEig(data, GC, doSmooth=doSmooth))
            continue

        # Floor division plus int(): the split point is used as a slice
        # index, which a float from `/` cannot be.
        cent = int(genome.cntrMids[chrom] // resolution)

        eig = np.zeros(len(GC), dtype=float)
        eig[:cent] = completeEig(
            data[:cent, :cent], GC[:cent], doSmooth=doSmooth)
        eig[cent:] = completeEig(
            data[cent:, cent:], GC[cent:], doSmooth=doSmooth)
        result.append(eig)

    return result