def __iter__(self):
    """
    Yield bounding regions for the two tracks, one chromosome at a time.

    For each chromosome in the extended chromosome list of self.genome:
      * if track 1 has no bounding region shelve, the whole chromosome
        is yielded;
      * if track 2 has no shelve, or only track 2 consists solely of
        whole-chromosome regions, the regions of track 1 are yielded;
      * if only track 1 consists solely of whole-chromosome regions, the
        regions of track 2 are yielded;
      * otherwise the result of self.getAllIntersectingRegions() is yielded.
    """
    shelveA = self._getBoundingRegionShelve(self._trackName1)
    shelveB = self._getBoundingRegionShelve(self._trackName2)

    wholeChrsA = False
    if shelveA is not None:
        wholeChrsA = self._commonAllBoundingRegionsAreWholeChr(shelveA)

    wholeChrsB = False
    if shelveB is not None:
        wholeChrsB = self._commonAllBoundingRegionsAreWholeChr(shelveB)

    for chrom in GenomeInfo.getExtendedChrList(self.genome):
        if shelveA is None:
            # No bounding regions stored for track 1: fall back to the full chromosome.
            yield GenomeRegion(self.genome, chrom, 0, GenomeInfo.getChrLen(self.genome, chrom))
            continue

        regionsA = shelveA.getAllBoundingRegionsForChr(chrom)

        if shelveB is None or (wholeChrsB and not wholeChrsA):
            selected = regionsA
        else:
            regionsB = shelveB.getAllBoundingRegionsForChr(chrom)
            if wholeChrsA and not wholeChrsB:
                selected = regionsB
            else:
                selected = self.getAllIntersectingRegions(self.genome, chrom, regionsA, regionsB)

        for reg in selected:
            yield reg
def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
    """
    Clear self._boundingRegionTuples when they carry no extra information.

    Nothing is done if the fixed gap size is non-zero or the representation
    is dense. Otherwise, if every bounding region tuple spans a complete
    chromosome of self._genome (known chromosome, start 0, end equal to the
    chromosome length), the list is replaced by an empty list.
    """
    if self.getFixedGapSize() != 0 or self._reprIsDense:
        return

    def _spansWholeChr(region):
        return region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
            region.start == 0 and \
            region.end == GenomeInfo.getChrLen(self._genome, region.chr)

    if all(_spansWholeChr(brt.region) for brt in self._boundingRegionTuples):
        self._boundingRegionTuples = []
def getTotalBpSpan(self):
    """
    Return the number of base pairs covered by this region.

    When self.chr is None, the lengths of all chromosomes in the extended
    chromosome list of self.genome are summed. Otherwise len(self) is
    returned.
    """
    if self.chr is not None:
        return len(self)

    totalLen = 0
    for chrom in GenomeInfo.getExtendedChrList(self.genome):
        totalLen += GenomeInfo.getChrLen(self.genome, chrom)
    return totalLen
def __new__(cls, genome):
    """
    Build a minimal list of regions for the given genome.

    Returns a one-element list holding the region [0, 1) on the first
    chromosome reported by GenomeInfo.getChrList(genome), or None when the
    genome has no chromosomes. Note that the returned object is a plain
    list, not an instance of cls.
    """
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    chrList = GenomeInfo.getChrList(genome)
    if len(chrList) == 0:
        # Kept explicit: callers get None for a genome without chromosomes.
        return None

    # Reuse the list fetched above instead of querying GenomeInfo again.
    return [GenomeRegion(genome, chrList[0], 0, 1)]
def _checkValidStart(self, chr, start):
    """
    Validate a start position and return it unchanged.

    Raises InvalidFormatError if start is negative, or if self.genome is
    set, chr is a valid chromosome of that genome, and start is larger
    than the chromosome length.
    """
    if start < 0:
        raise InvalidFormatError('Error: start position is negative: %s' % start)

    if self.genome and GenomeInfo.isValidChr(self.genome, chr):
        chrLen = GenomeInfo.getChrLen(self.genome, chr)
        if start > chrLen:
            raise InvalidFormatError(
                'Error: start position is larger than the size of chromosome "%s" (%s > %s)' %
                (chr, start, chrLen))

    return start
def _getBoundingRegionTupleList(self, case, sortedAssertElList):
    """
    Return the BoundingRegionTuple list expected for a test case.

    If the case declares bounding regions with a chromosome set, these are
    returned (sorted), with a missing start replaced by 0 and a missing end
    replaced by the chromosome length. Otherwise one whole-chromosome
    bounding region is generated per distinct chromosome found in
    sortedAssertElList (in sorted chromosome order), with the element count
    set to the number of elements on that chromosome.
    """
    explicitBrs = []
    for br in sorted(case.boundingRegionsAssertList):
        if br.region.chr is not None:
            explicitBrs.append(br)

    if len(explicitBrs) == 0:
        allChrs = [ge.chr for ge in sortedAssertElList]
        chrCounts = [(chrom, allChrs.count(chrom)) for chrom in sorted(set(allChrs))]
        return [BoundingRegionTuple(GenomeRegion(self.GENOME, chr=chrom, start=0,
                                                 end=GenomeInfo.getChrLen(self.GENOME, chrom)),
                                    elCount)
                for chrom, elCount in chrCounts]

    brTuples = []
    for br in explicitBrs:
        reg = br.region
        start = reg.start if reg.start is not None else 0
        end = reg.end if reg.end is not None else GenomeInfo.getChrLen(self.GENOME, reg.chr)
        brTuples.append(
            BoundingRegionTuple(GenomeRegion(self.GENOME, chr=reg.chr, start=start, end=end),
                                br.elCount))
    return brTuples
def _checkValidEnd(self, chr, end, start=None):
    """
    Validate an (end-exclusive) end position and return it unchanged.

    Raises InvalidFormatError if:
      * end is negative;
      * self.genome is set, chr is a valid chromosome of that genome, and
        end - 1 is larger than the chromosome length;
      * start is given and end <= start, except for the special case
        start == end == 1, which is accepted.
    """
    if end < 0:
        raise InvalidFormatError('Error: end position is negative: %s' % end)

    if self.genome and GenomeInfo.isValidChr(self.genome, chr):
        chrLen = GenomeInfo.getChrLen(self.genome, chr)
        lastPos = end - 1
        if lastPos > chrLen:
            raise InvalidFormatError(
                'Error: end position is larger than the size of chromosome "%s" (%s > %s)' %
                (chr, lastPos, chrLen))

    endNotAfterStart = start is not None and end <= start
    if endNotAfterStart and not (start == end == 1):
        raise InvalidFormatError(
            'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' %
            (end, start))

    return end
def isCompBin(region):
    """
    Tell whether region coincides with a single comp bin.

    Iterables are never comp bins. Otherwise the region must start at
    offset 0 of its bin and its length must equal the smaller of the comp
    bin size and the number of bases from region.start to the end of the
    chromosome.
    """
    if isIter(region):
        return False

    binNumber = CompBinManager.getBinNumber(region.start)
    startsOnBinBoundary = (CompBinManager.getOffset(region.start, binNumber) == 0)

    binSize = CompBinManager.getCompBinSize()
    basesLeftInChr = GenomeInfo.getChrLen(region.genome, region.chr) - region.start
    hasBinLength = (len(region) == min(binSize, basesLeftInChr))

    return startsOnBinBoundary and hasBinLength
def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
    """
    Assert that each chromosome holds the expected number of track elements.

    For every chromosome in chrElCountDict, a track view is fetched for the
    region given in customBins (if the chromosome is a key there) or else
    for the whole chromosome, and the number of elements iterated from the
    view is compared to the expected count.

    Uses assertEqual rather than the deprecated alias assertEquals, which
    has been removed from unittest.TestCase in recent Python versions.
    """
    for chr, expectedCount in chrElCountDict.items():
        if chr in customBins:
            region = customBins[chr]
        else:
            region = GenomeRegion(self.GENOME, chr, 0, GenomeInfo.getChrLen(self.GENOME, chr))

        tv = self._getTrackView(trackName, region, allowOverlaps)
        # Count by iteration without building a throwaway list.
        self.assertEqual(expectedCount, sum(1 for _ in tv))
def isValidTrack(genome, trackName, fullAccess=False):
    """
    Tell whether trackName is a valid, preprocessed track of genome.

    The track must be valid according to its TrackInfo (for the given
    access level), and its directory must contain at least one entry that
    is either named as a valid chromosome or is a bounding region file.
    """
    if not TrackInfo(genome, trackName).isValid(fullAccess):
        return False

    return any(GenomeInfo.isValidChr(genome, fn) or isBoundingRegionFileName(fn)
               for fn in ProcTrackOptions._getDirContents(genome, trackName))
def _checkValidEnd(self, chr, end, start=None):
    """
    Check an end-exclusive end position; return it if it is acceptable.

    InvalidFormatError is raised when end is negative, when end - 1 exceeds
    the length of chr (only checked if self.genome is set and chr is valid
    for it), or when start is supplied and end <= start. The combination
    start == end == 1 is explicitly tolerated.
    """
    if end < 0:
        raise InvalidFormatError('Error: end position is negative: %s' % end)

    genomeKnowsChr = self.genome and GenomeInfo.isValidChr(self.genome, chr)
    if genomeKnowsChr:
        chrLen = GenomeInfo.getChrLen(self.genome, chr)
        if end - 1 > chrLen:
            raise InvalidFormatError(
                'Error: end position is larger than the size of chromosome "%s" (%s > %s)' %
                (chr, end - 1, chrLen))

    if start is None or end > start:
        return end

    if not (start == end == 1):
        raise InvalidFormatError(
            'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' %
            (end, start))

    return end
def getAllBoundingRegions(self):
    """
    Generate all bounding regions of the track, chromosome by chromosome.

    Chromosomes are visited in the order of the extended chromosome list
    of self._genome. As this is a generator, the
    BoundingRegionsNotAvailableError for a missing shelve file is raised
    when iteration starts, not when the method is called.
    """
    if not self.fileExists():
        from gtrackcore.util.CommonFunctions import prettyPrintTrackName
        message = 'Bounding regions not available for track: ' + \
            prettyPrintTrackName(self._trackName)
        raise BoundingRegionsNotAvailableError(message)

    for chrom in GenomeInfo.getExtendedChrList(self._genome):
        for reg in self.getAllBoundingRegionsForChr(chrom):
            yield reg
def isCompBin(region):
    """
    Return True if region is exactly one comp bin, else False.

    An iterable is never a comp bin. A single region qualifies when its
    start has offset 0 within its bin and its length equals
    min(comp bin size, chromosome length - region.start).
    """
    if isIter(region):
        return False

    startOffset = CompBinManager.getOffset(region.start,
                                           CompBinManager.getBinNumber(region.start))
    offsetOK = (startOffset == 0)

    maxBinLen = min(CompBinManager.getCompBinSize(),
                    GenomeInfo.getChrLen(region.genome, region.chr) - region.start)
    lengthOK = (len(region) == maxBinLen)

    return offsetOK and lengthOK
def __init__(self, genome, trackName, allowOverlaps=False, *args, **kwArgs):
    """
    Set up the source with the bounding regions of the given track.

    The bounding regions are read from the track's BoundingRegionShelve if
    its file exists; otherwise the standard chromosome region list of the
    genome is used. Extra positional and keyword arguments are accepted
    but not used here.
    """
    from gtrackcore.track.memmap.BoundingRegionShelve import BoundingRegionShelve

    brStore = BoundingRegionShelve(genome, trackName, allowOverlaps)
    if not brStore.fileExists():
        regions = GenomeInfo.getStdChrRegionList(genome)
    else:
        regions = list(brStore.getAllBoundingRegions())

    TrackGenomeElementSource.__init__(self,
                                      genome=genome,
                                      trackName=trackName,
                                      boundingRegions=regions,
                                      globalCoords=True,
                                      allowOverlaps=allowOverlaps,
                                      printWarnings=True)
def extend(self, extensionSize, ensureValidity=True):
    """
    Grow (or shrink) the region in place and return self.

    A non-negative extensionSize is added to self.end; a negative one is
    added to self.start (moving the start to the left). With
    ensureValidity set, the start is then clamped to 0 and the end to the
    chromosome length.
    """
    movesEnd = extensionSize >= 0
    if movesEnd:
        self.end += extensionSize
    else:
        self.start += extensionSize

    if not ensureValidity:
        return self

    self.start = max(0, self.start)
    self.end = min(self.end, GenomeInfo.getChrLen(self.genome, self.chr))
    return self
def getSubtypes(genome, trackName, fullAccess=False):
    """
    Return the subtype names of a track, sorted case-insensitively.

    A directory entry counts as a subtype unless it starts with '.' or
    '_', is a regular file, is named as a valid chromosome of the genome,
    or is one of the reserved names 'external' and 'ucsc'. The fullAccess
    argument is currently not used.
    """
    dirPath = createDirPath(trackName, genome)
    hiddenPrefixes = ['.', '_']
    #fixme, just temporarily:, these dirs should start with _
    reservedNames = ['external', 'ucsc']

    subtypes = []
    for fn in ProcTrackOptions._getDirContents(genome, trackName):
        if fn[0] in hiddenPrefixes:
            continue
        if os.path.isfile(dirPath + os.sep + fn):
            continue
        if GenomeInfo.isValidChr(genome, fn):
            continue
        if fn in reservedNames:
            continue
        subtypes.append(fn)

    return sorted(subtypes, key=str.lower)
def getSubtypes(genome, trackName, fullAccess=False):
    """
    List the subtypes found in the directory of a track.

    Entries starting with '.' or '_', regular files, entries named as a
    valid chromosome, and the reserved names 'external' and 'ucsc' are
    left out. The result is sorted ignoring case. fullAccess is accepted
    but not consulted.
    """
    dirPath = createDirPath(trackName, genome)

    def _isSubtypeDir(fn):
        if fn[0] in ['.', '_']:
            return False
        if os.path.isfile(dirPath + os.sep + fn):
            return False
        return not GenomeInfo.isValidChr(genome, fn)

    candidates = [fn for fn in ProcTrackOptions._getDirContents(genome, trackName)
                  if _isSubtypeDir(fn)]

    #fixme, just temporarily:, these dirs should start with _
    subtypes = [fn for fn in candidates if fn not in ['external', 'ucsc']]

    return sorted(subtypes, key=str.lower)
def _createOutputDirectory(self, genome, chr, trackName, allowOverlaps, geSourceManager):
    """
    Create the OutputDirectory object for one chromosome of a track.

    The directory path is derived from the track name, genome, chromosome
    and overlap setting; the remaining constructor arguments (prefixes,
    element count, chromosome length, value/edge-weight types and
    dimensions, max edge count, max string lengths, sortedness) are taken
    from geSourceManager and GenomeInfo.
    """
    dirPath = createDirPath(trackName, genome, chr, allowOverlaps)

    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    ctorArgs = (
        dirPath,
        geSourceManager.getPrefixList(),
        geSourceManager.getNumElementsForChr(chr),
        GenomeInfo.getChrLen(genome, chr),
        geSourceManager.getValDataType(),
        geSourceManager.getValDim(),
        geSourceManager.getEdgeWeightDataType(),
        geSourceManager.getEdgeWeightDim(),
        geSourceManager.getMaxNumEdgesForChr(chr),
        geSourceManager.getMaxStrLensForChr(chr),
        geSourceManager.isSorted(),
    )
    return OutputDirectory(*ctorArgs)
def nextBin(self):
    """
    Generate bins from the regions of self._userBinSource.

    For each region, a missing start is taken as 0 and the end is the
    smaller of region.end and the chromosome length (whichever of them is
    available). If self._binLen is None the region is yielded as a single
    bin; otherwise it is split into consecutive bins of at most
    self._binLen bases.
    """
    for region in self._userBinSource:
        start = 0 if region.start is None else region.start

        if region.genome is None:
            chrLen = None
        else:
            chrLen = GenomeInfo.getChrLen(region.genome, region.chr)

        regEnd = min([limit for limit in [region.end, chrLen] if limit is not None])

        if self._binLen is None:
            yield GenomeRegion(region.genome, region.chr, start, regEnd)
            continue

        while start < regEnd:
            binEnd = min(start + self._binLen, regEnd)
            yield GenomeRegion(region.genome, region.chr, start, binEnd)
            start += self._binLen
def nextBin(self):
    """
    Yield the bins covering each region of self._userBinSource.

    The start defaults to 0 when the region has none; the end is the
    minimum of the region end and the chromosome length, ignoring either
    one if it is None. With self._binLen set to None each region yields a
    single bin, otherwise successive bins of up to self._binLen bases.
    """
    for region in self._userBinSource:
        binStart = region.start if region.start is not None else 0

        chrLen = None
        if region.genome is not None:
            chrLen = GenomeInfo.getChrLen(region.genome, region.chr)

        candidateEnds = [x for x in [region.end, chrLen] if x is not None]
        regEnd = min(candidateEnds)

        if self._binLen is not None:
            while binStart < regEnd:
                binEnd = min(binStart + self._binLen, regEnd)
                yield GenomeRegion(region.genome, region.chr, binStart, binEnd)
                binStart += self._binLen
        else:
            yield GenomeRegion(region.genome, region.chr, binStart, regEnd)
def getBoundingRegionTuples(self):
    """
    Return the bounding region tuples of the source.

    Tuples whose region has no chromosome are ignored. If any tuple
    remains, the remaining list is returned and
    self._boundingRegionsAndGEsCorresponds is set to True. Otherwise one
    whole-chromosome tuple is created per chromosome returned by
    self.getAllChrs(), and the flag is set to False.
    """
    explicitTuples = [brt for brt in self._getBoundingRegionTuples()
                      if brt.region.chr is not None]

    if len(explicitTuples) > 0:
        self._boundingRegionsAndGEsCorresponds = True
        return explicitTuples

    from gtrackcore.input.core.GenomeElementSource import BoundingRegionTuple
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    wholeChrTuples = []
    for chr in self.getAllChrs():
        wholeChr = GenomeRegion(chr=chr, start=0,
                                end=GenomeInfo.getChrLen(self._geSource.genome, chr))
        wholeChrTuples.append(BoundingRegionTuple(wholeChr, self.getNumElementsForChr(chr)))

    self._boundingRegionsAndGEsCorresponds = False
    return wholeChrTuples
def getBoundingRegionTuples(self):
    """
    Return the bounding region tuples of the source.

    Tuples whose region has no chromosome are ignored. If any tuple
    remains, the remaining list is returned and
    self._boundingRegionsAndGEsCorrespond is set to True. Otherwise one
    whole-chromosome tuple is created per chromosome returned by
    self.getAllChrs(), and the flag is set to False.
    """
    explicitTuples = [brt for brt in self._getBoundingRegionTuples()
                      if brt.region.chr is not None]

    if len(explicitTuples) > 0:
        self._boundingRegionsAndGEsCorrespond = True
        return explicitTuples

    from gtrackcore.input.core.GenomeElementSource import BoundingRegionTuple
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    wholeChrTuples = []
    for chr in self.getAllChrs():
        wholeChr = GenomeRegion(chr=chr, start=0,
                                end=GenomeInfo.getChrLen(self._geSource.genome, chr))
        wholeChrTuples.append(BoundingRegionTuple(wholeChr, self.getNumElementsForChr(chr)))

    self._boundingRegionsAndGEsCorrespond = False
    return wholeChrTuples
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """
    Validate the bounding regions and store them in the shelve file self._fn.

    boundingRegionTuples -- iterable of objects with a 'region' and an
        'elCount' attribute; regions of the same chromosome must be
        grouped together, sorted, non-overlapping and non-adjoining.
    genomeElementChrList -- chromosomes that contain genome elements.
    sparse -- True for sparse track types. For dense types (False) the
        length of each bounding region must equal its element count.

    Raises InvalidFormatError if any of the checks below fails.

    After writing, the method polls self.fileExists() (sleeping 0.2 s
    between attempts) until the shelve file is reported to exist.
    """
    assert sparse in [False, True]

    tempContents = OrderedDict()

    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0

    # Pass 1: validate ordering and record element index ranges.
    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First bounding region of a chromosome; the chromosome must not have been seen before.
            if br.region.chr in tempContents:
                raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)

            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            if br.region < lastRegion:
                raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

        # Dense: per-region element indexes. Sparse: filled in per chromosome in pass 2.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)

        lastRegion = br.region

    # Pass 2 (sparse only): assign per-chromosome element and bin index ranges.
    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # A chromosome without genome elements must not claim any elements.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Only chromosomes with elements consume bin indexes.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(genomeElementChrs - set(tempContents.keys())))

    ensurePathExists(self._fn)

    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    try:
        brShelve.update(tempContents)
    finally:
        # Always release the shelve, also when update() fails.
        brShelve.close()

    while not self.fileExists():
        from gtrackcore.application.LogSetup import logMessage
        logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def __new__(cls, genome):
    """
    Build a minimal list of regions for the given genome.

    Returns a one-element list holding the region [0, 1) on the first
    chromosome reported by GenomeInfo.getChrList(genome), or None when the
    genome has no chromosomes. Note that the returned object is a plain
    list, not an instance of cls.
    """
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    chrList = GenomeInfo.getChrList(genome)
    if len(chrList) == 0:
        # Kept explicit: callers get None for a genome without chromosomes.
        return None

    # Reuse the list fetched above instead of querying GenomeInfo again.
    return [GenomeRegion(genome, chrList[0], 0, 1)]
def _checkValidChr(self, chr):
    """
    Return chr unchanged if it is acceptable.

    When self.genome is set and chr is not a valid chromosome of that
    genome, InvalidFormatWarning is raised. Without a genome no check is
    performed.
    """
    if not self.genome or GenomeInfo.isValidChr(self.genome, chr):
        return chr

    raise InvalidFormatWarning('Chromosome incorrectly specified: ' + chr)
def strWithCentromerInfo(self):
    """
    Return str(self), with ' (intersects centromere)' appended when
    GenomeInfo.regIntersectsCentromer(self) is true.
    """
    description = str(self)
    if GenomeInfo.regIntersectsCentromer(self):
        return description + ' (intersects centromere)'
    return description + ''
def isWholeChr(self):
    """
    Tell whether the region spans its entire chromosome.

    Returns False if either genome or chr is None. Otherwise the region
    must start at 0 and end at the chromosome length.
    """
    if self.genome is None or self.chr is None:
        return False

    ownSpan = (self.start, self.end)
    wholeSpan = (0, GenomeInfo.getChrLen(self.genome, self.chr))
    return ownSpan == wholeSpan
def parseRegSpec(regSpec, genome=None, includeExtraChrs=False):
    """
    Parse a comma-separated region specification into GenomeRegion objects.

    Each specification has the form '[genome:]chr[:start-end]' or
    '[genome:]*'. Positions are 1-indexed and end-inclusive in the
    specification and are converted to 0-indexed, end-exclusive regions.
    In positions, 'k' and 'm' are shorthand for thousands and millions
    (for a start position they expand to ...001 / ...000001). A missing
    end means the end of the chromosome. '*' selects all chromosomes of
    the genome (the extended list if includeExtraChrs is set) and must be
    the only specification.

    If genome is None, the first part of the first specification is taken
    as the genome name.

    Raises AssertionError for malformed specifications and
    InvalidFormatError if the chromosome length cannot be looked up.

    NOTE(review): in the code visible here the collected 'regions' list is
    never returned and SimpleUserBinSource is never used - presumably the
    end of the function (wrapping and returning the regions) is missing
    from this excerpt; confirm against the full file.
    """
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    class SimpleUserBinSource(list):
        pass

    regions = []
    allRegSpecs = regSpec.strip().split(',')
    for curRegSpec in allRegSpecs:
        regParts = curRegSpec.strip().split(':')
        if genome is None:
            genome = regParts[0]

        if not (regParts[0] == '*' or regParts[0] in GenomeInfo.getExtendedChrList(genome)):
            # The first part is neither '*' nor a chromosome, so it must be the genome name.
            assert regParts[0] == genome, \
                "Region specification does not start with one of '*' or correct chromosome or genome name. Region specification: %s. Genome: %s" % (curRegSpec, genome)
            regParts = regParts[1:]

        if regParts[0] == '*':
            assert len(regParts) == 1, \
                "Region specification starts with '*' but continues with ':'. Region specification: %s" % curRegSpec

            assert len(allRegSpecs) == 1, \
                "Region specification is '*', but is in a list with other region specifications: %s" % regSpec

            chrList = GenomeInfo.getExtendedChrList(genome) if includeExtraChrs else GenomeInfo.getChrList(genome)
            for chr in chrList:
                regions.append(GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr)))
        else:
            assert regParts[0] in GenomeInfo.getExtendedChrList(genome), \
                "Region specification does not start with chromosome specification. Region specification: %s " % curRegSpec
            chr = regParts[0]
            try:
                chrLen = GenomeInfo.getChrLen(genome, chr)
            except Exception:
                # 'except Exception, e' is Python-2-only syntax and 'e' was unused.
                raise InvalidFormatError("Chromosome '%s' does not exist for genome '%s'" % (chr, genome))

            if len(regParts) > 1:
                posParts = regParts[1]
                assert '-' in posParts, \
                    "Position specification does not include character '-'. Region specification: %s " % curRegSpec
                rawStart, rawEnd = posParts.split('-')

                # 'k'/'m' suffixes: a start of '1k' means position 1001, an end of '2k' means 2000.
                start = int(rawStart.replace('k', '001').replace('m', '000001'))
                end = int(rawEnd.replace('k', '000').replace('m', '000000')) if rawEnd != '' else chrLen
                assert start >= 1, \
                    "Start position is not positive. Region specification: %s " % curRegSpec
                assert end >= start, \
                    "End position is not larger than start position. Region specification: %s " % curRegSpec
                assert end <= chrLen, \
                    "End position is larger than chromosome size. Genome: %s. Chromosome size: %s. Region specification: %s" % (genome, chrLen, curRegSpec)
                #-1 for conversion from 1-indexing to 0-indexing end-exclusive
                start -= 1
            else:
                start, end = 0, chrLen
            regions.append(GenomeRegion(genome, chr, start, end))
def _commonAllBoundingRegionsAreWholeChr(self, brShelve):
    """
    Return True if every bounding region in brShelve, over all chromosomes
    in the extended chromosome list of self.genome, is a whole chromosome.
    Evaluation stops at the first region that is not.
    """
    return all(reg.isWholeChr()
               for chrom in GenomeInfo.getExtendedChrList(self.genome)
               for reg in brShelve.getAllBoundingRegionsForChr(chrom))
def getTotalElementCount(self):
    """
    Return the element count summed over all chromosomes in the extended
    chromosome list of self._genome.
    """
    total = 0
    for chrom in GenomeInfo.getExtendedChrList(self._genome):
        total += self.getTotalElementCountForChr(chrom)
    return total
def _getTrackView(self, trackName, region, allowOverlaps):
    """
    Return the track view of trackName for region, after adding a format
    requirement with the given allowOverlaps setting to the track.
    """
    track = Track(trackName)
    track.addFormatReq(TrackFormatReq(allowOverlaps=allowOverlaps))
    return track.getTrackView(region)

def _getBoundingRegionTupleList(self, case, sortedAssertElList):
    """
    Return the BoundingRegionTuple list expected for a test case.

    Bounding regions declared by the case (with a chromosome set) are used
    if there are any; a missing start defaults to 0 and a missing end to
    the chromosome length. Otherwise one whole-chromosome bounding region
    is created per distinct chromosome in sortedAssertElList, with the
    number of elements on that chromosome as element count.
    """
    boundingRegions = [ br for br in sorted(case.boundingRegionsAssertList) if br.region.chr is not None ]
    if len(boundingRegions) > 0:
        return [BoundingRegionTuple(GenomeRegion(self.GENOME, chr=br.region.chr, \
                                                 start=br.region.start if br.region.start is not None else 0, \
                                                 end=br.region.end if br.region.end is not None else \
                                                     GenomeInfo.getChrLen(self.GENOME, br.region.chr)), br.elCount) for br in boundingRegions]
    else:
        # No explicit bounding regions: derive them from the chromosomes of the elements.
        totChrList = [ge.chr for ge in sortedAssertElList]
        chrBrList = OrderedDict([(i, totChrList.count(i)) for i in sorted(set(totChrList))])
        return [BoundingRegionTuple(GenomeRegion(self.GENOME, chr=chr, start=0, \
                                                 end=GenomeInfo.getChrLen(self.GENOME, chr)), elCount) \
                for chr, elCount in chrBrList.iteritems()]

def _getCaseTrackView(self, case, br, allowOverlaps):
    """
    Return the track view of the case's track (TRACK_NAME_PREFIX plus
    case.trackName) for the region described by br.
    """
    return self._getTrackView(self.TRACK_NAME_PREFIX + case.trackName, \
                              GenomeRegion(genome=self.GENOME, chr=br.chr, start=br.start, end=br.end), \
                              allowOverlaps=allowOverlaps)

def _assertGenomeAndTrackElementEqual(self, ge, te, teChr, teOffset):
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """
    Validate the bounding regions and store them in the shelve file self._fn.

    boundingRegionTuples -- iterable of objects with a 'region' and an
        'elCount' attribute; regions of the same chromosome must be
        grouped together, sorted, non-overlapping and non-adjoining.
    genomeElementChrList -- chromosomes that contain genome elements.
    sparse -- True for sparse track types. For dense types (False) the
        length of each bounding region must equal its element count.

    Raises InvalidFormatError if any of the checks below fails.

    After writing, the method polls self.fileExists() (sleeping 0.2 s
    between attempts) until the shelve file is reported to exist.
    """
    assert sparse in [False, True]

    tempContents = OrderedDict()

    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0

    # Pass 1: validate ordering and record element index ranges.
    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First bounding region of a chromosome; the chromosome must not have been seen before.
            if br.region.chr in tempContents:
                raise InvalidFormatError(
                    "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)

            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            if br.region < lastRegion:
                raise InvalidFormatError(
                    "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError(
                "Error: bounding region '%s' does not have positive length." % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError(
                "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

        # Dense: per-region element indexes. Sparse: filled in per chromosome in pass 2.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
            br.region.start, br.region.end, startIdx, endIdx, 0, 0)

        lastRegion = br.region

    # Pass 2 (sparse only): assign per-chromosome element and bin index ranges.
    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(
                GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # A chromosome without genome elements must not claim any elements.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError(
                            "Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(
                        brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Only chromosomes with elements consume bin indexes.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError(
            'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(genomeElementChrs - set(tempContents.keys())))

    ensurePathExists(self._fn)

    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()),
                                         tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    try:
        brShelve.update(tempContents)
    finally:
        # Always release the shelve, also when update() fails.
        brShelve.close()

    while not self.fileExists():
        from gtrackcore.application.LogSetup import logMessage
        logMessage(
            "Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def parseRegSpec(regSpec, genome = None, includeExtraChrs = False):
    """
    Parse a comma-separated region specification into GenomeRegion objects.

    Each specification has the form '[genome:]chr[:start-end]' or
    '[genome:]*'. Positions are 1-indexed and end-inclusive in the
    specification and are converted to 0-indexed, end-exclusive regions.
    In positions, 'k' and 'm' are shorthand for thousands and millions
    (for a start position they expand to ...001 / ...000001). A missing
    end means the end of the chromosome. '*' selects all chromosomes of
    the genome (the extended list if includeExtraChrs is set) and must be
    the only specification.

    If genome is None, the first part of the first specification is taken
    as the genome name.

    Raises AssertionError for malformed specifications and
    InvalidFormatError if the chromosome length cannot be looked up.

    NOTE(review): in the code visible here the collected 'regions' list is
    never returned and SimpleUserBinSource is never used - presumably the
    end of the function (wrapping and returning the regions) is missing
    from this excerpt; confirm against the full file.
    """
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    class SimpleUserBinSource(list):
        pass

    regions = []
    allRegSpecs = regSpec.strip().split(',')
    for curRegSpec in allRegSpecs:
        regParts = curRegSpec.strip().split(':')
        if genome is None:
            genome = regParts[0]

        if not (regParts[0] == '*' or regParts[0] in GenomeInfo.getExtendedChrList(genome)):
            # The first part is neither '*' nor a chromosome, so it must be the genome name.
            assert regParts[0] == genome, \
                "Region specification does not start with one of '*' or correct chromosome or genome name. Region specification: %s. Genome: %s" % (curRegSpec, genome)
            regParts = regParts[1:]

        if regParts[0] == '*':
            assert len(regParts) == 1, \
                "Region specification starts with '*' but continues with ':'. Region specification: %s" % curRegSpec

            assert len(allRegSpecs) == 1, \
                "Region specification is '*', but is in a list with other region specifications: %s" % regSpec

            chrList = GenomeInfo.getExtendedChrList(genome) if includeExtraChrs else GenomeInfo.getChrList(genome)
            for chr in chrList:
                regions.append(GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr)))
        else:
            assert regParts[0] in GenomeInfo.getExtendedChrList(genome), \
                "Region specification does not start with chromosome specification. Region specification: %s " % curRegSpec
            chr = regParts[0]
            try:
                chrLen = GenomeInfo.getChrLen(genome, chr)
            except Exception:
                # 'except Exception, e' is Python-2-only syntax and 'e' was unused.
                raise InvalidFormatError("Chromosome '%s' does not exist for genome '%s'" % (chr, genome))

            if len(regParts) > 1:
                posParts = regParts[1]
                assert '-' in posParts, \
                    "Position specification does not include character '-'. Region specification: %s " % curRegSpec
                rawStart, rawEnd = posParts.split('-')

                # 'k'/'m' suffixes: a start of '1k' means position 1001, an end of '2k' means 2000.
                start = int(rawStart.replace('k', '001').replace('m', '000001'))
                end = int(rawEnd.replace('k', '000').replace('m', '000000')) if rawEnd != '' else chrLen
                assert start >= 1, \
                    "Start position is not positive. Region specification: %s " % curRegSpec
                assert end >= start, \
                    "End position is not larger than start position. Region specification: %s " % curRegSpec
                assert end <= chrLen, \
                    "End position is larger than chromosome size. Genome: %s. Chromosome size: %s. Region specification: %s" % (genome, chrLen, curRegSpec)
                #-1 for conversion from 1-indexing to 0-indexing end-exclusive
                start -= 1
            else:
                start, end = 0, chrLen
            regions.append(GenomeRegion(genome, chr, start, end))
def isWholeChr(self):
    """
    Return True if the region covers its chromosome from position 0 to
    the chromosome length. A region without genome or chromosome is never
    a whole chromosome.
    """
    lacksLocation = self.genome is None or self.chr is None
    if lacksLocation:
        return False

    span = (self.start, self.end)
    chrLen = GenomeInfo.getChrLen(self.genome, self.chr)
    return span == (0, chrLen)
def _isOldTypeChromDirectory(dirPath, genome):
    """
    Tell whether dirPath is a chromosome directory without subdirectories.

    A trailing path separator is stripped first. The directory qualifies
    when its base name is in the extended chromosome list of the genome
    and none of its entries is itself a directory.
    """
    if dirPath[-1] == os.sep:
        dirPath = os.path.dirname(dirPath)

    if os.path.basename(dirPath) not in set(GenomeInfo.getExtendedChrList(genome)):
        return False

    for subFn in os.listdir(dirPath):
        if os.path.isdir(os.path.join(dirPath, subFn)):
            return False
    return True
def getTotalElementCount(self):
    """
    Sum self.getTotalElementCountForChr() over every chromosome in the
    extended chromosome list of self._genome and return the total.
    """
    chromosomes = GenomeInfo.getExtendedChrList(self._genome)
    return sum(map(self.getTotalElementCountForChr, chromosomes))