Ejemplo n.º 1
0
 def coverage(s1, s2, TR):
     """Scatter-plot two per-bin coverage tracks, colored by the "GC1M" track.

     Per-fragment weights are summed into genomic bins with ``np.bincount``.
     The 2% of bins with the lowest ``s1`` coverage are left out of the plot,
     and the Spearman correlation of the remaining bins is printed.

     Parameters
     ----------
     s1, s2 : array-like
         Per-fragment weights, aligned with ``TR.ufragments``.
     TR : object
         Must expose ``ufragments`` and ``fragIDmult``.

     Notes
     -----
     Uses the module-level ``resolution`` and draws on the current
     matplotlib figure. Returns None.
     """
     genome = Genome()
     genome.createMapping(resolution)
     # Explicit floor division: both results are used as integer indices /
     # bincount labels, so they must stay integral under true division too.
     chromIndex = TR.ufragments // TR.fragIDmult - 1
     label = genome.chromosomeStarts[chromIndex] + (
         TR.ufragments % TR.fragIDmult) // resolution
     counts = np.bincount(label, weights=s1)
     counts2 = np.bincount(label, weights=s2)
     # NOTE: pickle must only be used on trusted files.
     with open("GC1M", 'rb') as gcFile:
         data = cPickle.load(gcFile)
     eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
     inds = np.argsort(counts)
     # Keep everything but the 2% least covered bins.
     mask = inds[int(0.02 * len(inds)):]
     for chrom in range(1, 24):
         start = genome.chromosomeStarts[chrom - 1]
         eigenvector[start:start + len(data[chrom - 1])] = data[chrom - 1]
     # Clip the color track from below so low values do not stretch the scale.
     eigenvector[eigenvector < 35] = 35
     plt.scatter(counts[mask],
                 counts2[mask],
                 c=eigenvector[mask],
                 s=6,
                 linewidth=0)
     print(stats.spearmanr(counts[mask], counts2[mask]))
     plt.xlabel("Coverage from all reads")
     plt.xticks([0, 5000, 10000, 15000])
     plt.ylabel("Coverage from RBs")
     b = plt.colorbar()
     b.ax.set_xlabel("GC content")
Ejemplo n.º 2
0
def doSaddle(filename, eig, gen):
    """Compute one 5x5 "saddle" matrix per chromosome from a cooler file.

    For every chromosome the unbalanced cis matrix is converted to
    observed/expected, bins with a zero column sum are dropped, and the
    remaining bins are grouped into quintiles of ``eig``. Entry ``[i, j]``
    is the mean observed/expected value between quintile ``i`` and
    quintile ``j``.

    Parameters
    ----------
    filename : str
        Path to the cooler file; also passed to ``getResolution``.
    eig : array-like
        Genome-wide track, indexed by continuous bin number.
    gen : str
        Genome folder name appended to "/home/magus/HiC2011/data/".

    Returns
    -------
    list of 5x5 float arrays, one per chromosome. Chromosomes with five or
    fewer usable bins contribute an all-zero matrix.
    """
    c = cooler.Cooler(filename)

    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])

    gen.setResolution(getResolution(filename))
    saddles = []
    for chrom in range(gen.chrmCount):
        saddle = np.zeros((5, 5), dtype=float)
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = c.matrix(balance=False).fetch(gen.idx2label[chrom])
        cur = observedOverExpected(cur)
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            # Quintile edges and masks depend only on GC, so build them once
            # instead of once per (i, j) cell.
            edges = np.percentile(GC, [0, 20, 40, 60, 80, 100])
            # Strict inequalities on both sides, as before: bins sitting
            # exactly on an edge belong to no quintile.
            quintiles = [(GC > edges[k]) * (GC < edges[k + 1])
                         for k in range(5)]
            for i in range(5):
                for j in range(5):
                    saddle[i, j] += cur[np.ix_(quintiles[i],
                                               quintiles[j])].mean()
        saddles.append(saddle)

    return saddles
Ejemplo n.º 3
0
def doSaddleError(filename, eig, gen, correct=False):
    """Compute a genome-wide 5x5 saddle matrix plus 100 bootstrap replicates.

    Parameters
    ----------
    filename : str
        h5dict file holding a "heatmap" entry; also passed to
        ``getResolution``.
    eig : array-like or "GC"
        Genome-wide track indexed by continuous bin number. The string
        "GC" selects the concatenated ``GCBin`` of the genome.
    gen : str
        Genome folder name appended to "/home/magus/HiC2011/data/".
    correct : bool, optional
        If true, the heatmap goes through ``completeIC`` first.

    Returns
    -------
    (saddle, permutted)
        ``saddle`` is the 5x5 matrix summed over chromosomes. ``permutted``
        is a list of 100 matrices of the same shape, each built from means
        of the cells resampled with replacement.
    """
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    # Comparing an ndarray with a string is element-wise (or an error) in
    # NumPy, so only test for the "GC" keyword on non-array input.
    if not isinstance(eig, np.ndarray) and eig == "GC":
        eig = np.concatenate(gen.GCBin)
    saddle = np.zeros((5, 5), dtype=float)
    permutted = [np.zeros((5, 5), dtype=float) for _ in range(100)]

    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = data[st:end, st:end]
        cur = observedOverExpected(cur)
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            # Quintile masks depend only on GC; build them once per
            # chromosome (strict inequalities kept from the original).
            edges = np.percentile(GC, [0, 20, 40, 60, 80, 100])
            quintiles = [(GC > edges[k]) * (GC < edges[k + 1])
                         for k in range(5)]
            for i in range(5):
                for j in range(5):
                    addition = cur[np.ix_(quintiles[i], quintiles[j])]
                    addition = np.reshape(addition, (-1))
                    for k in range(100):
                        resampled = np.random.choice(addition, len(addition), replace=True)
                        permutted[k][i, j] += resampled.mean()
                    saddle[i, j] += addition.mean()
    return saddle, permutted
Ejemplo n.º 4
0
 def coverage(s1, s2, TR):
     """Plot binned coverage from ``s1`` against ``s2``, colored by "GC1M".

     Weights are accumulated per genomic bin with ``np.bincount``. The
     lowest 2% of bins by ``s1`` coverage are excluded, the Spearman
     correlation of the rest is printed, and a scatter plot with a colorbar
     is drawn on the current matplotlib figure.

     Parameters
     ----------
     s1, s2 : array-like
         Per-fragment weights, aligned with ``TR.ufragments``.
     TR : object
         Must expose ``ufragments`` and ``fragIDmult``.

     Uses the module-level ``resolution``. Returns None.
     """
     genome = Genome()
     genome.createMapping(resolution)
     # Floor division on purpose: the quotients are array indices and
     # bincount labels, which must be integers.
     label = genome.chromosomeStarts[TR.ufragments // TR.fragIDmult -
         1] + (TR.ufragments % TR.fragIDmult) // resolution
     counts = np.bincount(label, weights=s1)
     counts2 = np.bincount(label, weights=s2)
     # Close the file deterministically. NOTE: only unpickle trusted files.
     with open("GC1M", 'rb') as handle:
         data = cPickle.load(handle)
     eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
     inds = np.argsort(counts)
     mask = inds[int(0.02 * len(inds)):]  # drop the 2% least covered bins
     for chrom in range(1, 24):
         begin = genome.chromosomeStarts[chrom - 1]
         eigenvector[begin:begin + len(data[chrom - 1])] = data[chrom - 1]
     # Values below 35 are raised to 35 before being used as colors.
     eigenvector[eigenvector < 35] = 35
     plt.scatter(counts[mask], counts2[mask], c=eigenvector[
         mask], s=6, linewidth=0)
     print(stats.spearmanr(counts[mask], counts2[mask]))
     plt.xlabel("Coverage from all reads")
     plt.xticks([0, 5000, 10000, 15000])
     plt.ylabel("Coverage from RBs")
     b = plt.colorbar()
     b.ax.set_xlabel("GC content")
Ejemplo n.º 5
0
def doEigenvector(filename, genome):
    """Return a genome-wide track: GC content, or the first eigenvector.

    When ``filename`` is the literal "GC", the concatenated ``GCBin`` of the
    genome at 1000000 resolution is returned. Otherwise ``filename`` is
    loaded into a ``binnedData`` object, filtered, and the first entry of
    its eigenvector dictionary is returned.

    ``genome`` is a folder name appended to "/home/magus/HiC2011/data/".
    """
    genomePath = "/home/magus/HiC2011/data/" + genome
    chromosomeSet = ["#", "X"]

    if filename == "GC":
        gcGenome = Genome(genomePath, readChrms=chromosomeSet)
        gcGenome.setResolution(1000000)
        return np.concatenate(gcGenome.GCBin)

    key = "bla"
    binned = binnedData.binnedData(getResolution(filename), genomePath,
                                   chromosomeSet)
    binned.simpleLoad(filename, key)

    # Filtering steps; their order is kept exactly as in the pipeline.
    binned.removeDiagonal()
    binned.removeBySequencedCount(0.5)
    binned.removeCis()
    binned.truncTrans(high=0.0005)
    binned.removePoorRegions(cutoff=1)
    binned.fakeCis()
    binned.removeZeros()

    binned.doEig(numPCs=2)
    binned.restoreZeros(value=0)
    firstEigenvector = binned.EigDict[key][0]
    return firstEigenvector
Ejemplo n.º 6
0
def byChrEig(filename,
             genome,
             chromosomes="all",
             resolution="auto",
             byArm=True,
             doSmooth=False):
    """Compute an eigenvector for each chromosome of a by-chromosome h5dict.

    Parameters
    ----------
    filename : str
        h5dict file with one "<i> <i>" entry per chromosome.
    genome : str or Genome
        Genome object, or a folder to build one from.
    chromosomes : "all" or list of int, optional
        Chromosome indices to process. "all" selects every chromosome of
        the genome that has an entry in the file.
    resolution : "auto" or int, optional
        Bin size; "auto" asks ``getResolution(filename)``.
    byArm : bool, optional
        If true, ``completeEig`` is run separately on the bins before and
        after ``cntrMids[chrom] // resolution`` and the two halves are
        joined.
    doSmooth : bool, optional
        Forwarded to ``completeEig``.

    Returns
    -------
    list with one result per requested chromosome, in request order.

    Raises
    ------
    ValueError
        If ``chromosomes == "all"`` and the file has no matching entry.
    """
    from mirnylib.genome import Genome
    if resolution == "auto":
        resolution = getResolution(filename)
    if isinstance(genome, str):
        genome = Genome(genome)
    assert isinstance(genome, Genome)
    genome.setResolution(resolution)
    mydict = mirnylib.h5dict.h5dict(filename)
    if chromosomes == "all":
        chromosomes = [i for i in range(genome.chrmCount)
                       if "{0} {0}".format(i) in mydict]
        if len(chromosomes) == 0:
            raise ValueError("No chromosomes left. Check h5dict file.")

    result = []
    for chrom in chromosomes:
        data = mydict["{0} {0}".format(chrom)]
        GC = genome.GCBin[chrom]
        if not byArm:
            result.append(completeEig(data, GC, doSmooth=doSmooth))
            continue

        # Floor division: `cent` is used as a slice index, which must be an
        # integer (true division would produce a float).
        cent = genome.cntrMids[chrom] // resolution
        eig = np.zeros(len(GC), dtype=float)
        eig[:cent] = completeEig(data[:cent, :cent],
                                 GC[:cent],
                                 doSmooth=doSmooth)
        eig[cent:] = completeEig(data[cent:, cent:],
                                 GC[cent:],
                                 doSmooth=doSmooth)
        result.append(eig)
    return result
Ejemplo n.º 7
0
    def __init__(self, genome, resolution, storageFile="inMemory", mode="w"):
        """
        Initializes the high-resolution Hi-C data storage.

        Parameters
        ----------

        genome : folder or Genome object
            matching Genome object or folder to load it from
        resolution : int
            Resolution (number of bp per bin)
        storageFile : str (optional)
            File to store the h5dict.
            File will be created.
            By default stores in memory
        mode : "w", "w-" "r+" or "a", optional
            Access mode to h5dict (see h5dict manual)
        """

        inMemory = (storageFile == "inMemory")

        self._h5dict = h5dict(storageFile, mode=mode, in_memory=inMemory)

        if isinstance(genome, str):
            genome = Genome(genome, readChrms=["#", "X"])
        assert isinstance(genome, Genome)
        self.genome = genome

        self.resolution = resolution
        self.genome.setResolution(resolution)

        if self.genome.numBins < 7000:
            print("Total number of bins in the genome is just %d" % self.genome.numBins)
            # Trailing space added: the two literals are concatenated and
            # used to read "providesmore".
            warnings.warn(
                "For low-resolution analysis use binnedData, as it provides "
                "more analysis tools")

        M = self.genome.chrmCount
        # (i, i) pairs are cis keys; (i, j) with j > i are trans keys.
        self.cisKeys = [(i, i) for i in range(M)]
        self.transKeys = [(i, j) for i in range(M) for j in range(i + 1, M)]
        self.allKeys = self.cisKeys + self.transKeys

        self.data = {}
        self._initChromosomes()
Ejemplo n.º 8
0
def getGenome():
    """Build the Genome stored in ``../data/caul`` ("%s.fa" chromosome files)."""
    genome = Genome("../data/caul", readChrms=[], chrmFileTemplate="%s.fa")
    return genome
Ejemplo n.º 9
0
from mirnylib.h5dict import h5dict
import matplotlib.pyplot as plt 
import os
import sys
import numpy as np
from mirnylib.systemutils import setExceptionHook
from mirnylib.numutils import coarsegrain
from mirnylib.genome import Genome
setExceptionHook()

workingGenome = "hg19"

# The genome folder comes from the first command-line argument. A missing
# argument is reported with the same message as a non-existent folder,
# instead of surfacing as a bare IndexError.
genomeFolder = sys.argv[1] if len(sys.argv) > 1 else None
if genomeFolder is None or not os.path.exists(genomeFolder):
    raise Exception("Please provide hg19 Genome folder in the code or as a first argument")
mygenome = Genome(genomeFolder, readChrms=["#", "X"])

def source(ID):
    """Return the parsed-file path for ``ID``: "<ID>-<workingGenome>.hdf5"."""
    parsedName = "%s-%s.hdf5" % (ID, workingGenome)
    return os.path.join(parsedName)

# Number of simulated read pairs.
N = 2000000

# First-side chromosome indices, drawn from [0, 22).
# NOTE(review): with readChrms=["#", "X"] an hg19 genome presumably has 23
# chromosomes, so the last index is never drawn - confirm this is intended.
chrms1 = np.random.randint(0,22,N)
chrms2 = chrms1.copy()
# For about half of the pairs the second side gets a freshly drawn
# chromosome; the others stay on the same chromosome as the first side.
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0,22,mask.sum())
# First-side positions fall between 10% and 90% of the chromosome length.
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]), dtype=int)
# Offsets have magnitude exp(3 + u * (ln(1e6) - 3)) for u in [0, 1), i.e.
# between e**3 and 1e6, with a random sign.
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * (2 * (np.random.random(N)>0.5) - 1 )
pos2 = np.array(pos1 + offset1, dtype=int)

# Random boolean strand for the first side.
strands1 = np.random.random(N) > 0.5
Ejemplo n.º 10
0
    def parseInputData(self, dictLike, commandArgs, **kwargs):
        """
        Load mapped read pairs from an h5dict file and filter them.

        Added Parameters
        ----------------
        dictLike : str
            Path to the h5dict file with the mapped read pairs.
        commandArgs : NameSpace
            A NameSpace object defined by argparse. Its ``sameFragments``
            and ``RandomBreaks`` attributes switch the corresponding
            filters on.

        Raises
        ------
        IOError
            If ``dictLike`` does not exist.
        Exception
            If no reads are left after filtering.
        """
        ## Necessary Modules
        import numexpr

        if not os.path.exists(dictLike):
            raise IOError('File not found: %s' % dictLike)

        dictLike = h5dict(dictLike, 'r')
        self.chrms1 = dictLike['chrms1']
        self.chrms2 = dictLike['chrms2']
        self.cuts1 = dictLike['cuts1']
        self.cuts2 = dictLike['cuts2']
        self.strands1 = dictLike['strands1']
        self.strands2 = dictLike['strands2']
        self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
        self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
        self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) / 2
        self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) / 2
        self.fraglens1 = np.abs(
            (dictLike['uprsites1'] - dictLike['downrsites1']))
        self.fraglens2 = np.abs(
            (dictLike['uprsites2'] - dictLike['downrsites2']))
        # Fragment ID = fragment midpoint + chromosome index * fragIDmult.
        self.fragids1 = self.mids1 + np.array(self.chrms1,
                                              dtype='int64') * self.fragIDmult
        self.fragids2 = self.mids2 + np.array(self.chrms2,
                                              dtype='int64') * self.fragIDmult
        distances = np.abs(self.mids1 - self.mids2)
        distances[self.chrms1 != self.chrms2] = -1
        self.distances = distances  # Distances between restriction fragments
        del distances

        # Total Reads
        self.N = len(self.chrms1)

        self.metadata["100_TotalReads"] = self.N

        try:
            dictLike['misc']['genome']['idx2label']
            self.updateGenome(
                self.genome,
                oldGenome=dictLike["misc"]["genome"]["idx2label"],
                putMetadata=True)
        except KeyError:
            assumedGenome = Genome(self.genome.genomePath)
            self.updateGenome(self.genome,
                              oldGenome=assumedGenome,
                              putMetadata=True)

        # Pairs with both sides on a chromosome (index >= 0).
        DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
        self.metadata["200_totalDSReads"] = DSmask.sum()

        self.metadata["201_DS+SS"] = len(DSmask)
        self.metadata["202_SSReadsRemoved"] = len(DSmask) - DSmask.sum()

        mask = DSmask

        ## Information based on restriction fragments
        sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                                     ["fragids1", "fragids2"]) * DSmask
        cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
        s1 = self.strands1[sameFragMask]
        s2 = self.strands2[sameFragMask]
        SSDE = (s1 != s2)
        SS = SSDE * (cutDifs == s2)
        Dangling = SSDE & (~SS)
        SS_N = SS.sum()
        SSDE_N = SSDE.sum()
        sameFrag_N = sameFragMask.sum()

        dist = self.evaluate(
            "a = - cuts1 * (2 * strands1 -1) - "
            "cuts2 * (2 * strands2 - 1)",
            ["cuts1", "cuts2", "strands1", "strands2"])
        Dangling_L = dist[sameFragMask][Dangling]
        # The 95th percentile of `dist` over the `Dangling` pairs is taken
        # as the maximum molecule length.
        library_L = int(np.ceil((np.percentile(Dangling_L, 95))))
        self.maximumMoleculeLength = library_L

        readsMolecules = self.evaluate(
            "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) &  (dist >=0) &"
            " (dist <= maximumMoleculeLength)')",
            internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
            externalVariables={"dist": dist},
            constants={
                "maximumMoleculeLength": self.maximumMoleculeLength,
                "numexpr": numexpr
            })

        if commandArgs.sameFragments:
            # `~` is the boolean NOT; unary minus on a boolean array is
            # rejected by current NumPy.
            mask *= (~sameFragMask)
            noSameFrag = mask.sum()
            self.metadata["210_sameFragmentReadsRemoved"] = sameFrag_N
            self.metadata["212_Self-Circles"] = SS_N
            self.metadata["214_DandlingEnds"] = SSDE_N - SS_N
            self.metadata["216_error"] = sameFrag_N - SSDE_N
            mask *= (readsMolecules == False)
            extraDE = mask.sum()
            self.metadata[
                "220_extraDandlingEndsRemoved"] = -extraDE + noSameFrag

        if commandArgs.RandomBreaks:
            # Count from the current mask. The previous code read `extraDE`,
            # which only exists when the sameFragments filter ran, so this
            # branch failed with a NameError otherwise. When both filters
            # run the value is identical.
            ini_N = mask.sum()
            mask *= ((self.dists1 + self.dists2) <= library_L)
            rb_N = ini_N - mask.sum()
            self.metadata["330_removeRandomBreaks"] = rb_N

        if mask.sum() == 0:
            raise Exception(
                'No reads left after filtering. Please, check the input data')

        del DSmask, sameFragMask
        del dist, readsMolecules

        # NOTE(review): this stores the total read count (same value as
        # "100_TotalReads"), not the number of reads kept by `mask` -
        # confirm whether maskFilter is expected to update it.
        self.metadata["300_ValidPairs"] = self.N

        self.maskFilter(mask)
Ejemplo n.º 11
0
genomeFolder = "../../../data/hg18"
readChrms = [
    "#",  # read all numbered chromosomes
    "X"
]  # add X chromosome

# Fail early, before any heavy loading, if an input is missing.
for inDataset in inDatasets.values():
    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets.
# list(...) is needed because dict views cannot be indexed on Python 3.
sampleDataset = h5dict(list(inDatasets.values())[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)
Ejemplo n.º 12
0
    def parseInputData(self, dictLike, **kwargs):
        """
        Load mapped read pairs from an h5dict file and filter them.

        Pairs on the same restriction fragment are removed, as are pairs
        that look like a single molecule no longer than
        ``self.maximumMoleculeLength``. Counts for each category are stored
        in ``self.metadata`` and the surviving pairs are passed to
        ``self.maskFilter``.

        Parameters
        ----------
        dictLike : str
            Path to the h5dict file with the mapped read pairs.

        Raises
        ------
        IOError
            If ``dictLike`` does not exist.
        Exception
            If no reads are left after filtering.
        """
        import numexpr

        if not os.path.exists(dictLike):
            raise IOError('File not found: %s' % dictLike)

        dictLike = h5dict(dictLike, 'r')

        self.chrms1 = dictLike['chrms1']
        self.chrms2 = dictLike['chrms2']
        self.cuts1 = dictLike['cuts1']
        self.cuts2 = dictLike['cuts2']
        self.strands1 = dictLike['strands1']
        self.strands2 = dictLike['strands2']
        self.dists1 = np.abs(dictLike['rsites1'] - self.cuts1)
        self.dists2 = np.abs(dictLike['rsites2'] - self.cuts2)
        self.mids1 = (dictLike['uprsites1'] + dictLike['downrsites1']) / 2
        self.mids2 = (dictLike['uprsites2'] + dictLike['downrsites2']) / 2
        self.fraglens1 = np.abs(
            (dictLike['uprsites1'] - dictLike['downrsites1']))
        self.fraglens2 = np.abs(
            (dictLike['uprsites2'] - dictLike['downrsites2']))
        # Fragment ID = fragment midpoint + chromosome index * fragIDmult.
        self.fragids1 = self.mids1 + np.array(self.chrms1,
                                              dtype='int64') * self.fragIDmult
        self.fragids2 = self.mids2 + np.array(self.chrms2,
                                              dtype='int64') * self.fragIDmult

        distances = np.abs(self.mids1 - self.mids2)
        distances[self.chrms1 != self.chrms2] = -1
        self.distances = distances  # Distances between restriction fragments
        del distances

        self.N = len(self.chrms1)

        try:
            dictLike['misc']['genome']['idx2label']
            self.updateGenome(self.genome,
                              oldGenome=dictLike["misc"]["genome"]["idx2label"])
        except KeyError:
            assumedGenome = Genome(self.genome.genomePath)
            self.updateGenome(self.genome, oldGenome=assumedGenome)

        # Discard dangling ends and self-circles
        DSmask = (self.chrms1 >= 0) * (self.chrms2 >= 0)
        self.metadata['100_NormalPairs'] = DSmask.sum()

        sameFragMask = self.evaluate("a = (fragids1 == fragids2)",
                     ["fragids1", "fragids2"]) * DSmask

        cutDifs = self.cuts2[sameFragMask] > self.cuts1[sameFragMask]
        s1 = self.strands1[sameFragMask]
        s2 = self.strands2[sameFragMask]
        SSDE = (s1 != s2)
        SS = SSDE * (cutDifs == s2)
        SS_N = SS.sum()
        SSDE_N = SSDE.sum()
        sameFrag_N = sameFragMask.sum()
        self.metadata['120_SameFragmentReads'] = sameFrag_N
        self.metadata['122_SelfLigationReads'] = SS_N
        self.metadata['124_DanglingReads'] = SSDE_N - SS_N
        self.metadata['126_UnknownMechanism'] = sameFrag_N - SSDE_N

        # `~` is the boolean NOT; unary minus on a boolean array is
        # rejected by current NumPy.
        mask = DSmask * (~sameFragMask)

        del DSmask, sameFragMask

        noSameFrag = mask.sum()

        # distance between sites facing each other
        dist = self.evaluate("a = numexpr.evaluate('- cuts1 * (2 * strands1 -1) - "
                             "cuts2 * (2 * strands2 - 1)')",
                             ["cuts1", "cuts2", "strands1", "strands2"],
                             constants={"numexpr":numexpr})

        readsMolecules = self.evaluate(
            "a = numexpr.evaluate('(chrms1 == chrms2) & (strands1 != strands2) &  (dist >=0) &"
            " (dist <= maximumMoleculeLength)')",
            internalVariables=["chrms1", "chrms2", "strands1", "strands2"],
            externalVariables={"dist": dist},
            constants={"maximumMoleculeLength": self.maximumMoleculeLength, "numexpr": numexpr})

        mask *= (readsMolecules == False)
        extraDE = mask.sum()
        self.metadata['210_ExtraDanglingReads'] = -extraDE + noSameFrag
        # `extraDE` already holds mask.sum(); no need to count again.
        if extraDE == 0:
            raise Exception('No reads left after filtering. Please, check the input data')

        del dist, readsMolecules

        self.maskFilter(mask)
Ejemplo n.º 13
0
from mirnylib.genome import Genome
import pandas as pd
from hiclib import hicShared
from mirnylib.h5dict import h5dict
from mirnylib.numutils import coarsegrain, observedOverExpected, completeIC
import joblib
import pickle
import cooler
from mirnylib.numutils import  zoomArray
from hiclib.hicShared import getResolution
from hiclib import binnedData
from mirnylib.systemutils import setExceptionHook


# Genomes used here: both are built from folders relative to the working
# directory, restricted to the "#" and "X" chromosome selectors.
mygenM = Genome("../../data/mm9", readChrms=["#", "X"])
mygenH = Genome("../../data/hg19", readChrms=["#", "X"])
# Lookup from assembly name to the Genome object built above.
genomDict = {"mm9":mygenM, "hg19":mygenH}

# joblib cache rooted in the current directory.
mem = joblib.Memory(".")




def getFrags():
    """
    A method that calculates a set of restriction fragments covered in all hg19, or in all mm9 datasets
     Used for the correct calculation of scalings
    """
    mouse = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/mm9/oocyte_combined_refined.frag"
    human = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/hg19/K562_combined_refined.frag"
Ejemplo n.º 14
0
    and might be in fact pretty fast.
    
    General recommendation: if you have 16+GB of RAM, and .sra (.fastq.gz) files were less than 30GB, then you should be fine with parsing things in memory. 

"""

from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap,setExceptionHook
from mirnylib.genome import Genome 
import numpy as np 
import os
import sys
#from defineGenome import getGenome

genomeName = "galGal4_1MB"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/galGal4/"+genomeName,readChrms=[],chrmFileTemplate="chr%s.fa",gapFile="galGal4.gap.txt")
def getGenome(workingGenome):
    """Return the preloaded ``genome_db``.

    Only the genome named by the module-level ``genomeName`` is available.

    Raises
    ------
    ValueError
        If ``workingGenome`` differs from ``genomeName``. (The previous
        bare ``raise`` had no active exception to re-raise, so it failed
        with an unrelated error.)
    """
    if workingGenome != genomeName:
        raise ValueError("Unknown genome %r: only %r is available"
                         % (workingGenome, genomeName))
    return genome_db

setExceptionHook()
def ensure(f):
    """Make sure the directory that will contain ``f`` exists; return ``f``.

    Parameters
    ----------
    f : str
        File path. Only its directory part is created, never the file.

    Returns
    -------
    str
        ``f`` unchanged, so the call can wrap a path inline.

    Raises
    ------
    ValueError
        If the directory does not exist and cannot be created.
    """
    d = os.path.dirname(f)
    # A bare file name has an empty directory part: nothing to create.
    if not d or os.path.isdir(d):
        return f
    try:
        os.makedirs(d)
    except OSError:
        # Another process may have created the directory between the check
        # and makedirs; that is success, not failure.
        if not os.path.isdir(d):
            raise ValueError("Cannot create directory")
    return f
Ejemplo n.º 15
0
overhead, and therefore having 23*11 chromosome pairs would slow it down a bit.

Note the trick of converting a global heatmap map to a smaller map.

It tests only iterative correction now.
"""

from hiclib.highResBinnedData import HiResHiC
from mirnylib.genome import Genome
from hiclib.binnedData import binnedData
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# Genome folder comes from the first command-line argument; only
# chromosomes "1".."5" are read.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# High-resolution object: load the by-chromosome heatmap, filter it and run
# iterative correction.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# binnedData object on the same genome and resolution.
b = binnedData(1000000, genome)

# The all-by-all heatmap on disk is cropped to the bins of the genome
# defined above (everything before the end of its last chromosome) so it
# can be loaded into `b`.
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]

b.simpleLoad(data, "data")
b.removeDiagonal()
Ejemplo n.º 16
0
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file"""
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla",
                            genome=genomeFolder,
                            enzymeName="HindIII",
                            maximumMoleculeLength=500,
                            inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",
                        enzymeName="HindIII",
                        genome=genomeFolder,
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        TR = HiCdataset("refined",
                        genome=genomeFolder,
                        enzymeName="HindIII",
                        mode="w",
                        inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845

    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="once")

    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000,
                                    countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000,
                                       chromosomes=[14],
                                       countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"

    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    t = ((removeChromIDs[TR.chrms1] == 1) *
         (removeChromIDs[TR.chrms2] == 1)).sum() + (
             (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Ejemplo n.º 17
0
    c2 = hd["chrms2"]
    p1 = hd["cuts1"]
    p2 = hd["cuts2"]
    mask = c1 == c2
    cis = mask.sum()
    more20kb = (np.abs(p1[mask] - p2[mask]) > 20000).sum()
    return more20kb / cis




dfs = []

# Process the mouse and the human assemblies in turn.
for genome  in ["mm9", 'hg19']:

    genomeObject = Genome("/home/magus/HiC2011/data/{0}".format(genome), readChrms = ["#", "X"])
    # Dataset names: every ".1000000.cool" file in the folder named after
    # the assembly whose name contains "DpnII" or "combined", or contains
    # "sperm" or "ad" case-insensitively. The suffix is stripped.
    filenames = [i.replace(".1000000.cool", "") for i in os.listdir(genome) if
                 ".1000000.cool" in i and ("DpnII" in i or "combined" in i or "sperm" in i.lower() or "ad" in i.lower() )]


    # Sample table, indexed by its first column.
    sampleDict = pd.read_csv("samples.csv", index_col=0)
    #nsnDict = sampleDict["Stage"]

    ### Bug fix to read files only contained within sampleDict ###
    # RUN THIS THE FIRST TIME only
    #fnames = [];
    #for f in list(sampleDict.index):
    #    thisFile = [file for file in filenames if str(f) in file]
    #    if len(thisFile)>0:
    #        fnames.append(thisFile[0])
    #filenames = fnames
Ejemplo n.º 18
0
def get_chrom_arms(c, gen_name):
    genome = Genome('/net/levsha/share/lab/genomes/'+gen_name)
Ejemplo n.º 19
0
# Import numpy, falling back to the system site-packages folder when it is
# not importable from the default path. Only ImportError triggers the
# fallback, so unrelated failures (e.g. KeyboardInterrupt) are not hidden.
try:
    import numpy as np
except ImportError:
    sys.path = ["/usr/lib64/python2.7/site-packages"] + sys.path
    import numpy as np

print("Numpy inported!")
from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome

import os
#from defineGenome import getGenome

# Genome: the mm10 folder, read with the "#", "X" and "Y" selectors.
genomeName = "mm10"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])

# Location and name of the refined fragment-level dataset.
data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

print "Loading HiCdataset"
# Open the existing dataset read-only.
TR = HiCdataset(data_folder + fdataset_fname,
                enzymeName="HindIII",
                mode='r',
                genome=genome_db)
# Disabled: heatmaps at 1000 kb and 40 kb resolution.
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print "Saving heatmap"
Ejemplo n.º 20
0
from hiclib.hicShared import byChrEig
from mirnylib.genome import Genome
import matplotlib.pyplot as plt
from mirnylib.systemutils import setExceptionHook
from mirnylib.plotting import nicePlot, maximumContrastList

setExceptionHook()

# Genome with the "#" and "X" chromosome selectors.
gen = Genome('../../../../hg19', readChrms=["#", "X"])
# Zero-based chromosome indices to plot.
mychroms = [0, 2, 5, 13, 20]
# One eigenvector per requested chromosome, in the order of `mychroms`.
eigs = byChrEig("../fragmentHiC/test-1M-byChr.hm", gen, chromosomes=mychroms)

# One scatter series per chromosome: eigenvector against GC content.
for j, chrom in enumerate(mychroms):
    plt.scatter(eigs[j],
                gen.GCBin[chrom],
                color=maximumContrastList[j],
                label="Chr {0}".format(chrom + 1))  # one-based label

plt.xlabel("eigenvector")
plt.ylabel("GC content")
nicePlot()
Ejemplo n.º 21
0
converter.create_agp_dict()							


import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os

sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/mESC")
from  getIntraChrHeatmaps import get_chromosomes,extractResolutionFromFileName

# NOTE(review): this path was already appended a few lines above; the
# second append looks redundant - confirm before removing.
sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/mESC")


# Contig-level genome ("N%s.fa" files).
genome_db_contig = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
				readChrms=[],
				chrmFileTemplate="N%s.fa")
# Chromosome-level genome ("%s.fna" files).
genome_db_chrmLevel = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
				readChrms=[],
				chrmFileTemplate="%s.fna")
# NOTE(review): this rebinding discards the contig-level genome built above,
# so both names refer to the chromosome-level genome from here on.
genome_db_contig = genome_db_chrmLevel

# Previous 100k inputs, kept for reference.
#hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filtered/ChEF-all-HindIII-100k.hm.IC"
#second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filtered/Blood-all-HindIII-100k.hm.IC"

########################WRITE YOUR HEATMAP HERE########################
# Primary heatmap and the two domain annotation files that go with it.
hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.IC"
domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.gzipped_matrix/ChEF-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"

# Second heatmap.
second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.IC"
Ejemplo n.º 22
0
    help=
    'domains file to use for annotation,coud be several files separated with ","'
)
# --colors: comma-separated list, split further below.
parser.add_argument(
    "--colors",
    default="",
    help=
    'colors for domains,should be same number of colors as number of domains')
# --out: output directory for the plots.
parser.add_argument("--out",
                    default="heatmap_plots",
                    help='output dir for plots')

args = parser.parse_args()
# Pick the genome matching the requested level.
# NOTE(review): any other value of args.level leaves `genome_db` undefined.
if args.level == "contig":
    genome_db = Genome(
        "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
        readChrms=[],
        chrmFileTemplate="N%s.fa")
elif args.level == "chr":
    # Disabled: chromosome-level GalGal5 genome; mm9 is used instead.
    #genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    #			readChrms=[],
    #			chrmFileTemplate="%s.fna")
    genome_db = Genome('/mnt/storage/home/vsfishman/HiC/fasta/mm9',
                       readChrms=['#', 'X'])

hm_file = args.hmap
# Plots go to <out>/<heatmap file name>.
figure_path = args.out + "/" + hm_file.split("/")[-1]
# Domain files and their colors are given as comma-separated lists.
if args.domains != "":
    domains = args.domains.split(",")
    colors = args.colors.split(",")
else:
    domains = []