def writeIndexes(self):
    """
    Create the 'leftIndex' and 'rightIndex' files for this chromosome.

    Both files get one slot per index bin, i.e. ceil(chrSize / index bin size)
    slots. Left borders are read from the start file, or derived from the end
    file when there is no start file; right borders are read from the end file,
    or taken as start + 1 when there is no end file.

    For each bin, the left index receives the position of the first element
    whose right border lies after the bin start, and the right index receives
    the position of the first element whose left border lies at or after the
    bin end (this presumes the borders are sorted ascending -- TODO confirm).
    Bins not reached by the loops are handed to self._fillRestOfIndexFile.
    """
    # The bin size does not change during indexing, so look it up once instead
    # of once per comparison in the loops below.
    binSize = CompBinManager.getIndexBinSize()
    numIndexElements = int(math.ceil(1.0 * self._chrSize / binSize))

    self._leftIndexFile = OutputFile(self._path, 'leftIndex', numIndexElements, allowAppend=False)
    self._rightIndexFile = OutputFile(self._path, 'rightIndex', numIndexElements, allowAppend=False)

    if self._startFile:
        lefts = self._startFile.getContents()
    else:
        # No start file: each element starts where the previous one ended.
        lefts = np.r_[0, self._endFile.getContents()[:-1]]

    if self._endFile:
        rights = self._endFile.getContents()
        if not self._startFile:
            # The first end value is skipped when starts are derived from ends.
            rights = rights[1:]
    else:
        rights = self._startFile.getContents() + 1

    # NOTE(review): if `rights` (or `lefts`) is empty, i (or j) stays 0 and the
    # fill value below becomes 1 rather than 0 -- confirm this is intended.
    bin_i = 0
    i = 0
    for i, right in enumerate(rights):
        while right > bin_i * binSize:
            self._leftIndexFile.write(i)
            bin_i += 1

    bin_j = 0
    j = 0
    for j, left in enumerate(lefts):
        while left >= (bin_j + 1) * binSize:
            self._rightIndexFile.write(j)
            bin_j += 1

    self._fillRestOfIndexFile(bin_i, i + 1, self._leftIndexFile)
    self._fillRestOfIndexFile(bin_j, j + 1, self._rightIndexFile)
def testIsCompBin(self):
    """
    Check which chr21 regions of 'TestGenome' CompBinManager.isCompBin accepts.

    The expected values presumably rely on a bin size of 100 and on chr21
    ending at position 46944323 -- TODO confirm against the test setup.
    """
    def isCompBin(start, end):
        return CompBinManager.isCompBin(GenomeRegion('TestGenome', 'chr21', start, end))

    acceptedBounds = [(0, 100), (200, 300), (46944300, 46944323)]
    for start, end in acceptedBounds:
        self.assertTrue(isCompBin(start, end))

    rejectedBounds = [(0, 40), (10, 100), (10, 200), (100, 300),
                      (46944300, 46944322), (46944300, 46944324)]
    for start, end in rejectedBounds:
        self.assertFalse(isCompBin(start, end))
def testGetNumOfBins(self):
    """
    Check the bin count CompBinManager.getNumOfBins reports for hg18 chr1 regions.

    Each case is (expected number of bins, region start, region end).
    """
    cases = [
        (0, 0, 0),
        (1, 0, 100),
        (2, 200, 400),
        (4, 67, 314),
    ]
    for expectedCount, start, end in cases:
        region = GenomeRegion('hg18', 'chr1', start, end)
        self.assertEqual(expectedCount, CompBinManager.getNumOfBins(region))
def testIsCompBin(self):
    """
    Verify CompBinManager.isCompBin on a fixed table of chr21 regions.

    Each row is (expected result, start, end). The rows presumably assume a
    bin size of 100 and a chr21 length of 46944323 in 'TestGenome' -- TODO
    confirm against the test setup.
    """
    table = (
        (True, 0, 100),
        (True, 200, 300),
        (True, 46944300, 46944323),
        (False, 0, 40),
        (False, 10, 100),
        (False, 10, 200),
        (False, 100, 300),
        (False, 46944300, 46944322),
        (False, 46944300, 46944324),
    )
    for expected, start, end in table:
        outcome = CompBinManager.isCompBin(GenomeRegion('TestGenome', 'chr21', start, end))
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)
def _assertSplitUserBin(self, compBins, start, end):
    """
    Assert that splitting the hg18 chr1 region [start, end) gives `compBins`.

    compBins : iterable of (start, end) pairs, one per expected region, in
               the order CompBinManager.splitUserBin should produce them.
    """
    userBin = GenomeRegion('hg18', 'chr1', start, end)

    expectedRegions = []
    for binStart, binEnd in compBins:
        expectedRegions.append(GenomeRegion('hg18', 'chr1', binStart, binEnd))

    actualRegions = CompBinManager.splitUserBin(userBin)
    AssertList(expectedRegions, actualRegions, self.assertEqual)
def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=None):
    """
    Build a TrackView for `region` from the arrays of a preprocessed track.

    trackData      : see TrackSource.getTrackData {'id' : smartmemmap}
    region         : see GenomeRegion
    borderHandling : passed on unchanged to the TrackView constructor
    allowOverlaps  : passed on unchanged to the TrackView constructor
    trackName      : not used by this function; kept for compatibility with
                     existing callers

    Returns the TrackView, with its arrays sliced to the part covering `region`.

    Raises IOError if the track is not dense and `trackData` has no
    'leftIndex' or 'rightIndex' entry.
    """
    brShelve = trackData.boundingRegionShelve
    brInfo = brShelve.getBoundingRegionInfo(region) if brShelve is not None else None

    # Names that are not "extra" arrays. Built once, and without relying on
    # dict.keys() returning a list (it does not on Python 3).
    nonExtraNames = set(RESERVED_PREFIXES.keys())
    nonExtraNames.update(('leftIndex', 'rightIndex'))
    extraArrayNames = [arrayName for arrayName in trackData if arrayName not in nonExtraNames]

    reservedArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                      for arrayName in RESERVED_PREFIXES]
    extraArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                   for arrayName in extraArrayNames]
    trackFormat = TrackFormat(*(reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]))

    if trackFormat.reprIsDense():
        # Dense tracks are sliced by position, relative to the bounding
        # region start when bounding region info is available.
        if brInfo is None:
            leftIndex = region.start
            rightIndex = region.end
        else:
            leftIndex = region.start - brInfo.start
            rightIndex = region.end - brInfo.start
    else:
        # Other tracks take their slice borders from the index arrays, looked
        # up for the bins holding the first and the last position of the region.
        leftBin = CompBinManager.getBinNumber(region.start)
        rightBin = CompBinManager.getBinNumber(region.end - 1)

        if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
            raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

        leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
        rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

    slicedReservedArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                            for array in reservedArrays]
    slicedExtraArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                         for array in extraArrays]

    argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + \
              [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
    tv = TrackView(*argList)

    if not trackFormat.reprIsDense():
        tv.sliceElementsAccordingToGenomeAnchor()
    return tv
def writeIndexes(self):
    """
    Write the 'leftIndex' and 'rightIndex' files for this chromosome.

    Each file holds ceil(chrSize / index bin size) slots, one per index bin.
    Left borders come from the start file, or are derived from the end file
    if there is no start file. Right borders come from the end file, or are
    start + 1 if there is no end file.

    The left index of a bin is the position of the first element whose right
    border is larger than the bin start; the right index of a bin is the
    position of the first element whose left border is at or beyond the bin
    end (this presumes ascending borders -- TODO confirm). Remaining bins are
    completed by self._fillRestOfIndexFile.
    """
    # Loop-invariant: fetched once rather than on every loop comparison.
    binSize = CompBinManager.getIndexBinSize()
    numIndexElements = int(math.ceil(1.0 * self._chrSize / binSize))

    self._leftIndexFile = OutputFile(self._path, 'leftIndex', numIndexElements,
                                     allowAppend=False)
    self._rightIndexFile = OutputFile(self._path, 'rightIndex', numIndexElements,
                                      allowAppend=False)

    if self._startFile:
        lefts = self._startFile.getContents()
    else:
        # Without a start file, an element begins at the previous end value.
        lefts = np.r_[0, self._endFile.getContents()[:-1]]

    if self._endFile:
        rights = self._endFile.getContents()
        if not self._startFile:
            # Drop the first end value when starts are derived from ends.
            rights = rights[1:]
    else:
        rights = self._startFile.getContents() + 1

    # NOTE(review): with empty `rights`/`lefts` the counters stay 0, so the
    # fill value passed below is 1 rather than 0 -- confirm this is intended.
    bin_i = 0
    i = 0
    for i, right in enumerate(rights):
        while right > bin_i * binSize:
            self._leftIndexFile.write(i)
            bin_i += 1

    bin_j = 0
    j = 0
    for j, left in enumerate(lefts):
        while left >= (bin_j + 1) * binSize:
            self._rightIndexFile.write(j)
            bin_j += 1

    self._fillRestOfIndexFile(bin_i, i + 1, self._leftIndexFile)
    self._fillRestOfIndexFile(bin_j, j + 1, self._rightIndexFile)
def createDirPath(trackName, genome, chr=None, allowOverlaps=False, basePath=Config.PROCESSED_DATA_PATH):
    """
    Assemble the directory path of a preprocessed track.

    The path consists of basePath, the index bin size, 'withOverlaps' or
    'noOverlaps', the genome, every part of trackName and, if given, chr,
    joined by os.sep.

    A trackName starting with 'redirect' overrides the other arguments: its
    second and third items replace genome and chr, and the items from the
    fifth onwards are used as the track name.

    >>> createDirPath(['trackname'],'genome','chr1')
    '/100000/noOverlaps/genome/trackname/chr1'
    """
    from gtrackcore.util.CompBinManager import CompBinManager

    if len(trackName) > 0 and trackName[0] == 'redirect':
        genome = trackName[1]
        chr = trackName[2]
        # trackName[3] is a description and is not part of the path
        trackName = trackName[4:]

    binSizeDir = str(CompBinManager.getIndexBinSize())
    overlapDir = 'withOverlaps' if allowOverlaps else 'noOverlaps'

    pathParts = [basePath, binSizeDir, overlapDir, genome]
    pathParts.extend(trackName)
    if chr is not None:
        pathParts.append(chr)

    return os.sep.join(pathParts)
def createDirPath(trackName, genome, chr=None, allowOverlaps=False, basePath=Config.PROCESSED_DATA_PATH):
    """
    Return the directory in which a preprocessed track is stored.

    Layout: basePath / index bin size / 'withOverlaps' or 'noOverlaps' /
    genome / track name parts / chr (left out when chr is None), joined
    with os.sep.

    If the first item of trackName is 'redirect', genome and chr are taken
    from trackName[1] and trackName[2], and trackName[4:] is used as the
    track name.

    >>> createDirPath(['trackname'],'genome','chr1')
    '/100000/noOverlaps/genome/trackname/chr1'
    """
    from gtrackcore.util.CompBinManager import CompBinManager

    redirected = len(trackName) > 0 and trackName[0] == 'redirect'
    if redirected:
        genome = trackName[1]
        chr = trackName[2]
        # trackName[3] is description
        trackName = trackName[4:]

    leading = [
        basePath,
        str(CompBinManager.getIndexBinSize()),
        'withOverlaps' if allowOverlaps else 'noOverlaps',
        genome,
    ]
    trailing = [] if chr is None else [chr]

    return os.sep.join(leading + list(trackName) + trailing)
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """
    Validate the bounding regions of a track and store them in the shelve
    file self._fn.

    boundingRegionTuples : iterable of objects with the attributes `region`
                           (having chr, start and end) and `elCount`
    genomeElementChrList : chromosomes (sequences) that contain track elements
    sparse               : True if the track type representation is sparse,
                           False if it is dense

    For each chromosome, a BrInfoHolder with the bounding region start
    positions and the matching BoundingRegionInfo objects is stored. After
    writing, the method waits until self.fileExists() reports the file.

    Raises InvalidFormatError if the bounding regions are not grouped by
    chromosome, are unsorted, overlapping or adjoining within a chromosome,
    have no positive length, have a length different from the element count
    (dense only), have elements on a chromosome not in genomeElementChrList
    (sparse only), or if a chromosome in genomeElementChrList has no
    bounding region.
    """
    assert sparse in [False, True]

    tempContents = OrderedDict()
    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0

    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First bounding region of a chromosome.
            if br.region.chr in tempContents:
                raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)

            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            if br.region < lastRegion:
                raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

        # Dense tracks get their element index range right away; sparse
        # tracks get per-chromosome ranges in the second pass below.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)
        lastRegion = br.region

    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))

            # Snapshot the keys, since the values are replaced while looping.
            for key in list(tempContents[chr].keys()):
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end,
                                                                chrStartIdxs[chr], chrEndIdxs[chr],
                                                                startBinIdx, endBinIdx)
                else:
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Only chromosomes with elements take up space in the bin range.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    missingChrs = genomeElementChrs - set(tempContents.keys())
    if len(missingChrs) > 0:
        raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(missingChrs))

    ensurePathExists(self._fn)

    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    try:
        brShelve.update(tempContents)
    finally:
        # Close the shelve even if the update fails, so it is not left open.
        brShelve.close()

    while not self.fileExists():
        # Imported here, as in the original code, so the modules are only
        # loaded when waiting is actually needed.
        from gtrackcore.application.LogSetup import logMessage
        logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def testGetBinNumber(self):
    """
    Check the bin number CompBinManager.getBinNumber returns for a position.

    Each case is (expected bin number, position).
    """
    cases = ((0, 0), (2, 200), (3, 314))
    for expectedBin, pos in cases:
        self.assertEqual(expectedBin, CompBinManager.getBinNumber(pos))
def testGetOffset(self):
    """
    Check CompBinManager.getOffset for several (position, bin number) pairs.

    Each case is (expected offset, position, bin number). The last case
    expects a negative offset, for a position tested against a later bin.
    """
    cases = [
        (0, 0, 0),
        (0, 200, 2),
        (14, 314, 3),
        (-86, 314, 4),
    ]
    for expectedOffset, pos, binNumber in cases:
        self.assertEqual(expectedOffset, CompBinManager.getOffset(pos, binNumber))
def testGetNumOfBins(self):
    """
    Verify CompBinManager.getNumOfBins for a set of hg18 chr1 regions.

    The dictionary-free table below maps region bounds to the expected
    number of bins, and is checked in the listed order.
    """
    boundsAndCounts = (
        ((0, 0), 0),
        ((0, 100), 1),
        ((200, 400), 2),
        ((67, 314), 4),
    )
    for (start, end), expectedCount in boundsAndCounts:
        numBins = CompBinManager.getNumOfBins(GenomeRegion('hg18', 'chr1', start, end))
        self.assertEqual(expectedCount, numBins)
def testGetPosFromBinNumber(self):
    """
    Check the position CompBinManager.getPosFromBinNumber returns for a bin.

    Each case is (expected position, bin number).
    """
    cases = ((0, 0), (200, 2), (300, 3))
    for expectedPos, binNumber in cases:
        self.assertEqual(expectedPos, CompBinManager.getPosFromBinNumber(binNumber))
def _assertSplitUserBin(self, compBins, start, end):
    """
    Helper: compare CompBinManager.splitUserBin with the expected regions.

    compBins   : (start, end) pairs of the expected regions, in order
    start, end : bounds of the hg18 chr1 region that is split
    """
    def chr1Region(regStart, regEnd):
        return GenomeRegion('hg18', 'chr1', regStart, regEnd)

    region = chr1Region(start, end)
    compBinRegions = [chr1Region(*bounds) for bounds in compBins]
    AssertList(compBinRegions, CompBinManager.splitUserBin(region), self.assertEqual)
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """
    Check the bounding regions of a track and write them to the shelve file
    self._fn.

    boundingRegionTuples : iterable of objects with the attributes `region`
                           (having chr, start and end) and `elCount`
    genomeElementChrList : chromosomes (sequences) that contain track elements
    sparse               : True if the track type representation is sparse,
                           False if it is dense

    The shelve gets one BrInfoHolder per chromosome, built from the bounding
    region start positions and the matching BoundingRegionInfo objects. The
    method then waits until self.fileExists() reports the file.

    Raises InvalidFormatError if the bounding regions are not grouped by
    chromosome, are unsorted, overlapping or adjoining within a chromosome,
    have no positive length, have a length different from the element count
    (dense only), have elements on a chromosome not in genomeElementChrList
    (sparse only), or if a chromosome in genomeElementChrList has no
    bounding region.
    """
    assert sparse in [False, True]

    tempContents = OrderedDict()
    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0

    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # A new chromosome starts here; it must not have been seen before.
            if br.region.chr in tempContents:
                raise InvalidFormatError(
                    "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)."
                    % br.region)

            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            if br.region < lastRegion:
                raise InvalidFormatError(
                    "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s."
                    % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' overlap."
                    % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
                    % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError(
                "Error: bounding region '%s' does not have positive length."
                % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError(
                "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s"
                % (br.region, len(br.region), br.elCount))

        # Element index ranges are set here for dense tracks only; sparse
        # tracks receive per-chromosome ranges in the pass further down.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
            br.region.start, br.region.end, startIdx, endIdx, 0, 0)
        lastRegion = br.region

    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(
                GenomeRegion(start=0, end=chrLen))

            # Iterate over a copy of the keys: the values are replaced below.
            for key in list(tempContents[chr].keys()):
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(
                        brInfo.start, brInfo.end,
                        chrStartIdxs[chr], chrEndIdxs[chr],
                        startBinIdx, endBinIdx)
                else:
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError(
                            "Error: bounding region '%s' has incorrect element count: %s > 0"
                            % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end),
                               chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(
                        brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Chromosomes without elements do not advance the bin counter.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    chrsWithoutRegions = genomeElementChrs - set(tempContents.keys())
    if len(chrsWithoutRegions) > 0:
        raise InvalidFormatError(
            'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s'
            % ', '.join(chrsWithoutRegions))

    ensurePathExists(self._fn)

    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()),
                                         tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    try:
        brShelve.update(tempContents)
    finally:
        # Always release the shelve, also when the update raises.
        brShelve.close()

    while not self.fileExists():
        # Kept as loop-local imports, as in the original code, so they are
        # only executed when the file is not yet visible.
        from gtrackcore.application.LogSetup import logMessage
        logMessage(
            "Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def testGetOffset(self):
    """
    Verify the offsets CompBinManager.getOffset returns.

    The table pairs the (position, bin number) arguments with the expected
    offset; position 314 is checked against both bin 3 and bin 4.
    """
    argsAndOffsets = (
        ((0, 0), 0),
        ((200, 2), 0),
        ((314, 3), 14),
        ((314, 4), -86),
    )
    for args, expectedOffset in argsAndOffsets:
        self.assertEqual(expectedOffset, CompBinManager.getOffset(*args))