def writeIndexes(self):
        """
        Create the 'leftIndex' and 'rightIndex' output files from the start/end files.

        With binSize = CompBinManager.getIndexBinSize(), the loops below write, per
        index bin b (counted from 0):
          leftIndex  : the index of the element at which the right border first
                       exceeds b * binSize
          rightIndex : the index of the element at which the left border first
                       reaches (b + 1) * binSize
        Bins not reached by the loops are delegated to self._fillRestOfIndexFile.

        Assumes the borders are sorted in ascending order -- TODO confirm with caller.
        """
        indexElementCount = int(math.ceil(1.0 * self._chrSize / CompBinManager.getIndexBinSize()))
        self._leftIndexFile = OutputFile(self._path, 'leftIndex', indexElementCount, allowAppend=False)
        self._rightIndexFile = OutputFile(self._path, 'rightIndex', indexElementCount, allowAppend=False)

        # Left borders: taken from the start file if there is one; otherwise derived
        # from the ends (0 followed by every end except the last).
        if not self._startFile:
            leftBorders = np.r_[0, self._endFile.getContents()[:-1]]
        else:
            leftBorders = self._startFile.getContents()

        # Right borders: taken from the end file if there is one (skipping the first
        # end when there is no start file); otherwise each start + 1.
        if not self._endFile:
            rightBorders = self._startFile.getContents() + 1
        else:
            rightBorders = self._endFile.getContents()
            if not self._startFile:
                rightBorders = rightBorders[1:]

        nextLeftBin = 0
        lastRightPos = 0
        for lastRightPos, rightBorder in enumerate(rightBorders):
            while rightBorder > nextLeftBin * CompBinManager.getIndexBinSize():
                self._leftIndexFile.write(lastRightPos)
                nextLeftBin += 1

        nextRightBin = 0
        lastLeftPos = 0
        for lastLeftPos, leftBorder in enumerate(leftBorders):
            while leftBorder >= (nextRightBin + 1) * CompBinManager.getIndexBinSize():
                self._rightIndexFile.write(lastLeftPos)
                nextRightBin += 1

        # NOTE(review): when a border array is empty the position stays 0, so the
        # fill value passed below is 1 rather than 0 -- confirm this is intended.
        self._fillRestOfIndexFile(nextLeftBin, lastRightPos + 1, self._leftIndexFile)
        self._fillRestOfIndexFile(nextRightBin, lastLeftPos + 1, self._rightIndexFile)
    def testIsCompBin(self):
        """
        Check CompBinManager.isCompBin on chr21 regions of 'TestGenome'.

        The first table lists regions expected to be reported as comp bins, the
        second those expected to be rejected.
        """
        def isCompBin(start, end):
            return CompBinManager.isCompBin(
                GenomeRegion('TestGenome', 'chr21', start, end))

        acceptedSpans = [
            (0, 100),
            (200, 300),
            (46944300, 46944323),
        ]
        for start, end in acceptedSpans:
            self.assertTrue(isCompBin(start, end))

        rejectedSpans = [
            (0, 40),
            (10, 100),
            (10, 200),
            (100, 300),
            (46944300, 46944322),
            (46944300, 46944324),
        ]
        for start, end in rejectedSpans:
            self.assertFalse(isCompBin(start, end))
 def testGetNumOfBins(self):
     """Check CompBinManager.getNumOfBins for a table of hg18/chr1 regions."""
     # (expected number of bins, (region start, region end))
     binCountCases = (
         (0, (0, 0)),
         (1, (0, 100)),
         (2, (200, 400)),
         (4, (67, 314)),
     )
     for expectedCount, (start, end) in binCountCases:
         region = GenomeRegion('hg18', 'chr1', start, end)
         self.assertEqual(expectedCount, CompBinManager.getNumOfBins(region))
 def testIsCompBin(self):
     """
     Check CompBinManager.isCompBin on chr21 regions of 'TestGenome'.

     Each case pairs the expected verdict with the region borders; the cases
     are checked in the listed order.
     """
     # (expected to be a comp bin?, region start, region end)
     verdictCases = [
         (True, 0, 100),
         (True, 200, 300),
         (True, 46944300, 46944323),
         (False, 0, 40),
         (False, 10, 100),
         (False, 10, 200),
         (False, 100, 300),
         (False, 46944300, 46944322),
         (False, 46944300, 46944324),
     ]
     for shouldBeCompBin, start, end in verdictCases:
         verdict = CompBinManager.isCompBin(GenomeRegion('TestGenome', 'chr21', start, end))
         if shouldBeCompBin:
             self.assertTrue(verdict)
         else:
             self.assertFalse(verdict)
 def _assertSplitUserBin(self, compBins, start, end):
     """
     Assert that splitting the hg18/chr1 user bin [start, end) gives `compBins`.

     compBins : iterable of (start, end) pairs, one per expected region
     start, end : borders of the user bin handed to CompBinManager.splitUserBin
     """
     userBin = GenomeRegion('hg18', 'chr1', start, end)

     expectedRegions = []
     for binStart, binEnd in compBins:
         expectedRegions.append(GenomeRegion('hg18', 'chr1', binStart, binEnd))

     actualRegions = CompBinManager.splitUserBin(userBin)
     AssertList(expectedRegions, actualRegions, self.assertEqual)
Exemple #6
0
 def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=[]):
     """
     Build a TrackView covering `region` from the arrays in `trackData`.

     trackData : see TrackSource.getTrackData {'id' : smartmemmap}
     region : see GenomeRegion
     borderHandling, allowOverlaps : passed on unchanged to the TrackView constructor
     trackName : not used by this function; kept for interface compatibility

     Raises IOError if the track representation is not dense and 'leftIndex' or
     'rightIndex' is missing from trackData.
     """
     brShelve = trackData.boundingRegionShelve
     brInfo = brShelve.getBoundingRegionInfo(region) if brShelve is not None else None

     # Names that are not "extra" arrays. Built once, outside the comprehension:
     # the previous per-key `keys() + [...]` was loop-invariant work and fails on
     # Python 3, where keys() returns a view that does not support `+`.
     nonExtraNames = set(RESERVED_PREFIXES.keys())
     nonExtraNames.update(('leftIndex', 'rightIndex'))
     extraArrayNames = [arrayName for arrayName in trackData if arrayName not in nonExtraNames]

     reservedArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo) for arrayName in RESERVED_PREFIXES]
     extraArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo) for arrayName in extraArrayNames]
     trackFormat = TrackFormat( *(reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]) )

     if trackFormat.reprIsDense():
         # Dense representation: slice by position, relative to the bounding
         # region start when bounding region info is available.
         if brInfo is None:
             leftIndex = region.start
             rightIndex = region.end
         else:
             leftIndex = region.start - brInfo.start
             rightIndex = region.end - brInfo.start
     else:
         # Non-dense representation: look the slice borders up in the index
         # arrays, using the bins of the first and the last position of the region.
         leftBin = CompBinManager.getBinNumber(region.start)
         rightBin = CompBinManager.getBinNumber(region.end-1)

         if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
             raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

         leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
         rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

     # Arrays that are absent (None) stay None after slicing.
     slicedReservedArrays = [(array[leftIndex:rightIndex] if array is not None else None) for array in reservedArrays]
     slicedExtraArrays = [(array[leftIndex:rightIndex] if array is not None else None) for array in extraArrays]

     argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
     tv = TrackView( *(argList) )

     if not trackFormat.reprIsDense():
         # The index lookup works on whole bins, so the view is trimmed further
         # here -- presumably down to the exact region borders; confirm in TrackView.
         tv.sliceElementsAccordingToGenomeAnchor()
     return tv
    def writeIndexes(self):
        """
        Write the 'leftIndex' and 'rightIndex' files for the current chromosome.

        Element borders come from self._startFile / self._endFile (whichever
        exist). Each index file gets ceil(chrSize / index bin size) entries:
          - leftIndex receives, for each bin start, the index of the element at
            which the right border first exceeds that bin start;
          - rightIndex receives, for each bin end, the index of the element at
            which the left border first reaches that bin end.
        Whatever bins remain afterwards are passed to self._fillRestOfIndexFile.

        Assumes ascending border order -- TODO confirm with caller.
        """
        binCount = int(
            math.ceil(1.0 * self._chrSize / CompBinManager.getIndexBinSize()))
        self._leftIndexFile = OutputFile(
            self._path, 'leftIndex', binCount, allowAppend=False)
        self._rightIndexFile = OutputFile(
            self._path, 'rightIndex', binCount, allowAppend=False)

        # Without a start file the left borders are 0 plus all ends but the last.
        if not self._startFile:
            starts = np.r_[0, self._endFile.getContents()[:-1]]
        else:
            starts = self._startFile.getContents()

        # Without an end file every element is treated as one position long.
        if not self._endFile:
            ends = self._startFile.getContents() + 1
        else:
            ends = self._endFile.getContents()
            if not self._startFile:
                # The first end has no matching derived start; skip it.
                ends = ends[1:]

        leftBinsWritten = 0
        endIdx = 0
        for endIdx, end in enumerate(ends):
            while end > leftBinsWritten * CompBinManager.getIndexBinSize():
                self._leftIndexFile.write(endIdx)
                leftBinsWritten += 1

        rightBinsWritten = 0
        startIdx = 0
        for startIdx, start in enumerate(starts):
            while start >= (rightBinsWritten + 1) * CompBinManager.getIndexBinSize():
                self._rightIndexFile.write(startIdx)
                rightBinsWritten += 1

        # NOTE(review): for empty border arrays the indices stay 0, so the value
        # handed to the fill step is 1, not 0 -- confirm this is intended.
        self._fillRestOfIndexFile(leftBinsWritten, endIdx + 1, self._leftIndexFile)
        self._fillRestOfIndexFile(rightBinsWritten, startIdx + 1, self._rightIndexFile)
def createDirPath(trackName, genome, chr=None, allowOverlaps=False, basePath=Config.PROCESSED_DATA_PATH):
    """
    Return the directory path of a preprocessed track.

    The path consists of basePath, the index bin size, 'withOverlaps' or
    'noOverlaps', the genome, the track name parts and (if given) the
    chromosome, joined with os.sep.

    A track name starting with 'redirect' overrides the arguments:
    trackName[1] is used as genome, trackName[2] as chromosome, trackName[3]
    is a description (ignored) and trackName[4:] is the actual track name.

    >>> createDirPath(['trackname'],'genome','chr1')
    '/100000/noOverlaps/genome/trackname/chr1'
    """
    from gtrackcore.util.CompBinManager import CompBinManager

    isRedirect = len(trackName) > 0 and trackName[0] == 'redirect'
    if isRedirect:
        genome = trackName[1]
        chr = trackName[2]
        trackName = trackName[4:]

    overlapsDir = 'withOverlaps' if allowOverlaps else 'noOverlaps'
    pathParts = [basePath, str(CompBinManager.getIndexBinSize()), overlapsDir, genome]
    pathParts += list(trackName)
    if chr is not None:
        pathParts.append(chr)

    return os.sep.join(pathParts)
Exemple #9
0
def createDirPath(trackName,
                  genome,
                  chr=None,
                  allowOverlaps=False,
                  basePath=Config.PROCESSED_DATA_PATH):
    """
    Build the os.sep-joined directory path for a preprocessed track.

    Path layout: basePath / index bin size / overlaps mode / genome /
    track name parts [/ chromosome].

    If trackName[0] is 'redirect', genome and chromosome are read from
    trackName[1] and trackName[2], trackName[3] (a description) is skipped,
    and the remaining items form the track name.

    >>> createDirPath(['trackname'],'genome','chr1')
    '/100000/noOverlaps/genome/trackname/chr1'
    """
    from gtrackcore.util.CompBinManager import CompBinManager

    if len(trackName) > 0 and trackName[0] == 'redirect':
        genome = trackName[1]
        chr = trackName[2]
        # index 3 holds the description and is not part of the path
        trackName = trackName[4:]

    if allowOverlaps:
        overlapsPart = 'withOverlaps'
    else:
        overlapsPart = 'noOverlaps'

    leadingParts = [basePath, str(CompBinManager.getIndexBinSize()), overlapsPart, genome]
    chrParts = [] if chr is None else [chr]

    return os.sep.join(leadingParts + list(trackName) + chrParts)
    def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
        """
        Validate the bounding regions and store them in the shelve file self._fn.

        boundingRegionTuples : iterable of objects with a `region` (having chr,
                               start, end) and an `elCount`
        genomeElementChrList : chromosomes (sequences) that contain elements
        sparse : True/False; selects how element indices are assigned (see below)

        Stored per chromosome: a BrInfoHolder with the region starts and the
        matching BoundingRegionInfo objects.
          - not sparse: every region gets its own consecutive element index range;
            the bin indices are 0.
          - sparse: all regions of a chromosome share the element index range of
            that chromosome and a bin index range of the whole chromosome.
            Chromosomes not in genomeElementChrList get all-zero indices.

        Raises InvalidFormatError if regions of a chromosome are not grouped,
        are unsorted, overlap, adjoin, have non-positive length, have an element
        count inconsistent with the representation, or if a chromosome with
        elements has no bounding region.
        """
        assert sparse in [False, True]

        tempContents = OrderedDict()

        genomeElementChrs = set(genomeElementChrList)
        lastRegion = None
        chrStartIdxs = OrderedDict()
        chrEndIdxs = OrderedDict()
        totElCount = 0

        for br in boundingRegionTuples:
            if lastRegion is None or br.region.chr != lastRegion.chr:
                # First region of a chromosome: it must not have been seen before.
                if br.region.chr in tempContents:
                    raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)

                lastRegion = None
                tempContents[br.region.chr] = OrderedDict()
                if sparse:
                    chrStartIdxs[br.region.chr] = totElCount
            else:
                # Same chromosome as the previous region: check the ordering.
                if br.region < lastRegion:
                    raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
                if lastRegion.overlaps(br.region):
                    raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
                if lastRegion.end == br.region.start:
                    raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

            if len(br.region) < 1:
                raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)

            if not sparse and len(br.region) != br.elCount:
                raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

            # Sparse regions get their element indices in the second pass below.
            startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
            totElCount += br.elCount
            if sparse:
                chrEndIdxs[br.region.chr] = totElCount

            tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)

            lastRegion = br.region

        if sparse:
            totBinCount = 0
            for chrom in tempContents:
                chrLen = GenomeInfo.getChrLen(self._genome, chrom)
                numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
                chrHasElements = chrom in genomeElementChrs
                chrContents = tempContents[chrom]

                # The bin range is the same for every region of the chromosome.
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr

                # Iterate over a snapshot of the keys, as values are replaced in the loop.
                for key in list(chrContents.keys()):
                    brInfo = chrContents[key]

                    if chrHasElements:
                        chrContents[key] = BoundingRegionInfo(brInfo.start, brInfo.end,
                                                              chrStartIdxs[chrom], chrEndIdxs[chrom],
                                                              startBinIdx, endBinIdx)
                    else:
                        chrElCount = chrEndIdxs[chrom] - chrStartIdxs[chrom]
                        if chrElCount > 0:
                            raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chrom, start=brInfo.start, end=brInfo.end), chrElCount))
                        chrContents[key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)

                # Only chromosomes with elements take up bins.
                if chrHasElements:
                    totBinCount += numBinsInChr

        chrsWithoutRegions = genomeElementChrs - set(tempContents.keys())
        if len(chrsWithoutRegions) > 0:
            raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(chrsWithoutRegions))

        ensurePathExists(self._fn)

        for chrom in tempContents:
            brInfoDict = tempContents[chrom]
            tempContents[chrom] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))

        brShelve = safeshelve.open(self._fn)
        try:
            brShelve.update(tempContents)
        finally:
            # Close the shelve even if the update fails, so the handle is not leaked.
            brShelve.close()

        # Poll until self.fileExists() reports the shelve file -- presumably to
        # cope with delayed file visibility; confirm against fileExists().
        while not self.fileExists():
            from gtrackcore.application.LogSetup import logMessage
            logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
            import time
            time.sleep(0.2)
 def testGetBinNumber(self):
     """Check CompBinManager.getBinNumber for a table of positions."""
     # (expected bin number, position)
     binNumberCases = [(0, 0), (2, 200), (3, 314)]
     for expectedBin, pos in binNumberCases:
         self.assertEqual(expectedBin, CompBinManager.getBinNumber(pos))
 def testGetOffset(self):
     """Check CompBinManager.getOffset for a table of (position, bin number) pairs."""
     # (expected offset, position, bin number)
     offsetCases = [
         (0, 0, 0),
         (0, 200, 2),
         (14, 314, 3),
         (-86, 314, 4),
     ]
     for expectedOffset, pos, binNumber in offsetCases:
         self.assertEqual(expectedOffset, CompBinManager.getOffset(pos, binNumber))
 def testGetNumOfBins(self):
     """Check the bin count CompBinManager.getNumOfBins reports for hg18/chr1 regions."""
     def numOfBins(start, end):
         return CompBinManager.getNumOfBins(GenomeRegion('hg18', 'chr1', start, end))

     # (expected bin count, region start, region end)
     for expected, start, end in [(0, 0, 0), (1, 0, 100), (2, 200, 400), (4, 67, 314)]:
         self.assertEqual(expected, numOfBins(start, end))
 def testGetPosFromBinNumber(self):
     """Check CompBinManager.getPosFromBinNumber for a table of bin numbers."""
     # (expected position, bin number)
     positionCases = [(0, 0), (200, 2), (300, 3)]
     for expectedPos, binNumber in positionCases:
         self.assertEqual(expectedPos, CompBinManager.getPosFromBinNumber(binNumber))
 def testGetBinNumber(self):
     """Check that CompBinManager.getBinNumber maps each position to the expected bin."""
     positions = (0, 200, 314)
     expectedBins = (0, 2, 3)
     for expected, position in zip(expectedBins, positions):
         self.assertEqual(expected, CompBinManager.getBinNumber(position))
 def _assertSplitUserBin(self, compBins, start, end):
     """
     Compare CompBinManager.splitUserBin for hg18/chr1 [start, end) with `compBins`.

     compBins : iterable of (start, end) pairs describing the expected regions
     start, end : borders of the user bin that is split
     """
     def chr1Region(regStart, regEnd):
         return GenomeRegion('hg18', 'chr1', regStart, regEnd)

     userBin = chr1Region(start, end)
     expected = [chr1Region(binStart, binEnd) for binStart, binEnd in compBins]
     AssertList(expected, CompBinManager.splitUserBin(userBin), self.assertEqual)
 def testGetPosFromBinNumber(self):
     """Check that CompBinManager.getPosFromBinNumber maps each bin number to the expected position."""
     binNumbers = (0, 2, 3)
     expectedPositions = (0, 200, 300)
     for expected, binNumber in zip(expectedPositions, binNumbers):
         self.assertEqual(expected, CompBinManager.getPosFromBinNumber(binNumber))
    def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList,
                             sparse):
        """
        Check the bounding regions for consistency and write them to the shelve
        file self._fn.

        boundingRegionTuples : iterable of objects carrying `region` (with chr,
                               start, end) and `elCount`
        genomeElementChrList : chromosomes (sequences) in which elements occur
        sparse : True/False; decides how element indices are assigned

        For each chromosome a BrInfoHolder (region starts plus the corresponding
        BoundingRegionInfo objects) is stored:
          - not sparse: each region has its own consecutive element index range,
            with bin indices 0;
          - sparse: the regions of a chromosome share the element index range
            of the chromosome and a bin index range spanning the chromosome;
            chromosomes absent from genomeElementChrList get all-zero indices.

        Raises InvalidFormatError when regions of a chromosome are not grouped,
        are unsorted, overlap, adjoin, lack positive length, carry an element
        count that contradicts the representation, or when a chromosome with
        elements has no bounding region at all.
        """
        assert sparse in [False, True]

        tempContents = OrderedDict()

        genomeElementChrs = set(genomeElementChrList)
        lastRegion = None
        chrStartIdxs = OrderedDict()
        chrEndIdxs = OrderedDict()
        totElCount = 0

        for br in boundingRegionTuples:
            if lastRegion is None or br.region.chr != lastRegion.chr:
                # A new chromosome starts here; it must not have occurred earlier.
                if br.region.chr in tempContents:
                    raise InvalidFormatError(
                        "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)."
                        % br.region)

                lastRegion = None
                tempContents[br.region.chr] = OrderedDict()
                if sparse:
                    chrStartIdxs[br.region.chr] = totElCount
            else:
                # Still in the same chromosome: verify order relative to the previous region.
                if br.region < lastRegion:
                    raise InvalidFormatError(
                        "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s."
                        % (lastRegion, br.region))
                if lastRegion.overlaps(br.region):
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' overlap." %
                        (lastRegion, br.region))
                if lastRegion.end == br.region.start:
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
                        % (lastRegion, br.region))

            if len(br.region) < 1:
                raise InvalidFormatError(
                    "Error: bounding region '%s' does not have positive length."
                    % br.region)

            if not sparse and len(br.region) != br.elCount:
                raise InvalidFormatError(
                    "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s"
                    % (br.region, len(br.region), br.elCount))

            # For sparse tracks the element indices are filled in by the second pass.
            if sparse:
                startIdx, endIdx = None, None
            else:
                startIdx, endIdx = totElCount, totElCount + br.elCount
            totElCount += br.elCount
            if sparse:
                chrEndIdxs[br.region.chr] = totElCount

            tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
                br.region.start, br.region.end, startIdx, endIdx, 0, 0)

            lastRegion = br.region

        if sparse:
            totBinCount = 0
            for chrom in tempContents:
                chrLen = GenomeInfo.getChrLen(self._genome, chrom)
                numBinsInChr = CompBinManager.getNumOfBins(
                    GenomeRegion(start=0, end=chrLen))
                chrHasElements = chrom in genomeElementChrs
                chrContents = tempContents[chrom]

                # All regions of the chromosome share the same bin range.
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr

                # Values are replaced while looping, so iterate over a key snapshot.
                for key in list(chrContents.keys()):
                    brInfo = chrContents[key]

                    if chrHasElements:
                        chrContents[key] = BoundingRegionInfo(
                            brInfo.start, brInfo.end,
                            chrStartIdxs[chrom], chrEndIdxs[chrom],
                            startBinIdx, endBinIdx)
                    else:
                        chrElCount = chrEndIdxs[chrom] - chrStartIdxs[chrom]
                        if chrElCount > 0:
                            raise InvalidFormatError(
                                "Error: bounding region '%s' has incorrect element count: %s > 0"
                                % (GenomeRegion(chr=chrom,
                                                start=brInfo.start,
                                                end=brInfo.end),
                                   chrElCount))
                        chrContents[key] = BoundingRegionInfo(
                            brInfo.start, brInfo.end, 0, 0, 0, 0)

                # Bins are only counted for chromosomes that contain elements.
                if chrHasElements:
                    totBinCount += numBinsInChr

        chrsWithoutRegions = genomeElementChrs - set(tempContents.keys())
        if len(chrsWithoutRegions) > 0:
            raise InvalidFormatError(
                'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s'
                % ', '.join(chrsWithoutRegions))

        ensurePathExists(self._fn)

        for chrom in tempContents:
            brInfoDict = tempContents[chrom]
            tempContents[chrom] = BrInfoHolder(tuple(brInfoDict.keys()),
                                               tuple(brInfoDict.values()))

        brShelve = safeshelve.open(self._fn)
        try:
            brShelve.update(tempContents)
        finally:
            # Always release the shelve handle, also when update() raises.
            brShelve.close()

        # Wait until self.fileExists() confirms the shelve file -- presumably a
        # guard against delayed file visibility; confirm against fileExists().
        while not self.fileExists():
            from gtrackcore.application.LogSetup import logMessage
            logMessage(
                "Bounding region shelve file '%s' has yet to be created" %
                self._fn)
            import time
            time.sleep(0.2)
 def testGetOffset(self):
     """Check the offsets CompBinManager.getOffset reports for (position, bin number) pairs."""
     # maps (position, bin number) -> expected offset; checked in the listed order
     expectedOffsets = (
         ((0, 0), 0),
         ((200, 2), 0),
         ((314, 3), 14),
         ((314, 4), -86),
     )
     for (position, binNumber), expected in expectedOffsets:
         self.assertEqual(expected, CompBinManager.getOffset(position, binNumber))