def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """
    Verify that every id referenced in the 'edges' arrays of a linked track
    also occurs in the 'id' arrays of that track.

    Returns immediately when the track format is not linked. The ids and
    edge ids of all preprocessed chromosomes are accumulated as unique
    values, and empty strings are ignored on both sides.

    Raises InvalidFormatError naming (in sorted order) the edge ids that have
    no matching element id.
    """
    metaData = PreProcMetaDataCollector(genome, trackName)
    if not metaData.getTrackFormat().isLinked():
        return

    knownIds = numpy.array([], dtype='S')
    referencedIds = numpy.array([], dtype='S')

    for chrName in metaData.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrName, allowOverlaps)

        # De-duplicate after every chromosome so the accumulators stay small.
        idsSoFar = numpy.concatenate((knownIds, chrData['id'][:]))
        knownIds = numpy.unique(idsSoFar)

        edgeIdsSoFar = numpy.concatenate((referencedIds, chrData['edges'][:].flatten()))
        referencedIds = numpy.unique(edgeIdsSoFar)

    knownIds = knownIds[knownIds != '']
    referencedIds = referencedIds[referencedIds != '']

    danglingIds = set(referencedIds) - set(knownIds)
    if danglingIds:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(danglingIds)))
def checkUndirectedEdges(genome, trackName, allowOverlaps):
    """
    Check that the edges of an undirected, linked track come in matching
    pairs.

    Nothing is checked unless the track format is linked and the metadata
    collector reports undirected edges. For every element, its edge ids
    (with '' entries dropped) and the corresponding weights, when a 'weights'
    array exists, are passed to
    PreProcessUtils._adjustComplementaryEdgeWeightDict. Entries still present
    in the dictionary after all chromosomes have been processed are reported
    as unmatched.

    Raises InvalidFormatError listing the unmatched edges, one per line.
    """
    metaData = PreProcMetaDataCollector(genome, trackName)
    if not metaData.getTrackFormat().isLinked() or not metaData.hasUndirectedEdges():
        return

    leftoverEdges = {}

    for chrName in metaData.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrName, allowOverlaps)
        elementIds = chrData['id']
        edgeRows = chrData['edges']
        weightRows = chrData.get('weights')

        for row, elementId in enumerate(elementIds):
            # The same mask selects the non-empty edge ids and their weights.
            inUse = edgeRows[row] != ''
            usedEdges = edgeRows[row][inUse]
            usedWeights = None if weightRows is None else weightRows[row][inUse]
            PreProcessUtils._adjustComplementaryEdgeWeightDict(
                leftoverEdges, elementId, usedEdges, usedWeights)

    if not leftoverEdges:
        return

    # The dictionary is keyed by target id, then by source id.
    unmatched = [(fromId, toId, leftoverEdges[toId][fromId])
                 for toId in leftoverEdges
                 for fromId in leftoverEdges[toId]]

    edgeTexts = []
    for fromId, toId, weight in unmatched:
        text = "from '%s' to '%s'" % (fromId, toId)
        if weight != '':
            text += " with weight '%s'" % weight
        edgeTexts.append(text)

    header = "Error: All edges are not undirected. The following edges specifications " \
             "are not matched by an opposite edge with equal weight:"
    raise InvalidFormatError(header + os.linesep + os.linesep.join(edgeTexts))
def checkUndirectedEdges(genome, trackName, allowOverlaps):
    """
    Check that the edges of an undirected, linked track come in matching
    pairs.

    Nothing is checked unless the track format is linked and the
    TrackInfoDataCollector reports undirected edges. For every element, its
    edge ids (with '' entries dropped) and the corresponding weights, when a
    'weights' array exists, are passed to
    PreProcessUtils._adjustComplementaryEdgeWeightDict. Entries still present
    in the dictionary after all chromosomes have been processed are reported
    as unmatched.

    Raises InvalidFormatError listing the unmatched edges, one per line.
    """
    infoCollector = TrackInfoDataCollector(genome, trackName)
    if not infoCollector.getTrackFormat().isLinked() or not infoCollector.hasUndirectedEdges():
        return

    pendingEdges = {}

    for chrName in infoCollector.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrName, allowOverlaps)
        elementIds = chrData['id']
        edgeRows = chrData['edges']
        weightRows = chrData.get('weights')

        for row, elementId in enumerate(elementIds):
            # The same mask selects the non-empty edge ids and their weights.
            nonEmpty = edgeRows[row] != ''
            rowEdges = edgeRows[row][nonEmpty]
            rowWeights = None if weightRows is None else weightRows[row][nonEmpty]
            PreProcessUtils._adjustComplementaryEdgeWeightDict(
                pendingEdges, elementId, rowEdges, rowWeights)

    if not pendingEdges:
        return

    # The dictionary is keyed by target id, then by source id.
    unmatched = [(fromId, toId, pendingEdges[toId][fromId])
                 for toId in pendingEdges
                 for fromId in pendingEdges[toId]]

    edgeTexts = []
    for fromId, toId, weight in unmatched:
        text = "from '%s' to '%s'" % (fromId, toId)
        if weight != '':
            text += " with weight '%s'" % weight
        edgeTexts.append(text)

    header = "Error: All edges are not undirected. The following edges specifications " \
             "are not matched by an opposite edge with equal weight:"
    raise InvalidFormatError(header + os.linesep + os.linesep.join(edgeTexts))
def merge(genome, trackName, allowOverlaps):
    """
    Merge the per-chromosome memmap arrays of a track into one memmap file
    per array name, written to the track's directory.

    The set of array names is taken from the first existing chromosome. For
    each name the arrays of all existing chromosomes are combined in
    chromosome-list order with ChrMemmapFolderMerger.mergeArrays, and the
    output file name carries the largest element dimension and dtype
    dimension parsed from the input file names.

    Raises EmptyGESourceError when no chromosome has any data.
    """
    targetDir = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)
    preProcessedChrs = TrackInfoDataCollector(genome, trackName).getPreProcessedChrs(allowOverlaps)
    chrsWithData = list(ChrMemmapFolderMerger._existingChrIter(targetDir, preProcessedChrs))

    if not chrsWithData:
        raise EmptyGESourceError('No data lines has been read from source file (probably because it is empty).')

    firstChrData = TrackSource().getTrackData(trackName, genome, chrsWithData[0],
                                              allowOverlaps, forceChrFolders=True)
    remainingChrs = chrsWithData[1:]

    for arrayName in firstChrData.keys():
        combined = firstChrData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(firstChrData[arrayName].filename)[1:3]
        # Drop the entry as soon as it has been copied, before loading further chromosomes.
        del firstChrData[arrayName]

        for chrName in remainingChrs:
            chrData = TrackSource().getTrackData(trackName, genome, chrName,
                                                 allowOverlaps, forceChrFolders=True)
            combined = ChrMemmapFolderMerger.mergeArrays(combined, np.array(chrData[arrayName][:]))

            chrElementDim, chrDtypeDim = parseMemmapFileFn(chrData[arrayName].filename)[1:3]
            elementDim = max(elementDim, chrElementDim)
            dtypeDim = max(dtypeDim, chrDtypeDim)
            del chrData[arrayName]

        outFn = createMemmapFileFn(targetDir, arrayName, elementDim, dtypeDim, str(combined.dtype))
        outMemmap = np.memmap(outFn, dtype=combined.dtype, mode='w+', shape=combined.shape)
        outMemmap[:] = combined
        outMemmap.flush()
        del outMemmap
        del combined
def __init__(self, trackName, trackTitle=None):
    """
    Store the track name and optional title and create fresh collaborators:
    a TrackSource, a TrackViewLoader and a neutral track format requirement.

    No format converter is selected and no track id is known initially.
    """
    # Identity of the track.
    self.trackName = trackName
    self.trackTitle = trackTitle

    # Collaborators used when loading track views.
    self._trackSource = TrackSource()
    self._trackViewLoader = TrackViewLoader()
    self._trackFormatReq = NeutralTrackFormatReq()

    # State that is filled in later.
    self.formatConverters = None
    self._trackId = None
def merge(genome, trackName, allowOverlaps):
    """
    Merge the per-chromosome memmap arrays of a track into one memmap file
    per array name, written to the track's directory.

    Chromosomes are processed in sorted order unless the track format has a
    dense representation, in which case the collector's order is kept. The
    set of array names is taken from the first existing chromosome. For each
    name the arrays of all existing chromosomes are combined with
    ChrMemmapFolderMerger.mergeArrays, and the output file name carries the
    largest element dimension and dtype dimension parsed from the input file
    names.

    Raises EmptyGESourceError when no chromosome has any data.
    """
    targetDir = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    metaData = PreProcMetaDataCollector(genome, trackName)
    preProcessedChrs = metaData.getPreProcessedChrs(allowOverlaps)
    if not metaData.getTrackFormat().reprIsDense():
        preProcessedChrs = sorted(preProcessedChrs)

    chrsWithData = list(ChrMemmapFolderMerger._existingChrIter(targetDir, preProcessedChrs))
    if not chrsWithData:
        raise EmptyGESourceError(
            'No data lines has been read from source file (probably because it is empty).')

    firstChrData = TrackSource().getTrackData(trackName, genome, chrsWithData[0],
                                              allowOverlaps, forceChrFolders=True)
    remainingChrs = chrsWithData[1:]

    for arrayName in firstChrData.keys():
        combined = firstChrData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(firstChrData[arrayName].filename)[1:3]
        # Drop the entry as soon as it has been copied, before loading further chromosomes.
        del firstChrData[arrayName]

        for chrName in remainingChrs:
            chrData = TrackSource().getTrackData(trackName, genome, chrName,
                                                 allowOverlaps, forceChrFolders=True)
            combined = ChrMemmapFolderMerger.mergeArrays(combined, np.array(chrData[arrayName][:]))

            chrElementDim, chrDtypeDim = parseMemmapFileFn(chrData[arrayName].filename)[1:3]
            elementDim = max(elementDim, chrElementDim)
            dtypeDim = max(dtypeDim, chrDtypeDim)
            del chrData[arrayName]

        outFn = createMemmapFileFn(targetDir, arrayName, elementDim, dtypeDim, str(combined.dtype))
        outMemmap = np.memmap(outFn, dtype=combined.dtype, mode='w+', shape=combined.shape)
        outMemmap[:] = combined
        outMemmap.flush()
        del outMemmap
        del combined
def getMergedPrefixInfoDict(cls, genome, trackName, allowOverlaps):
    """
    Return a dict mapping each array prefix of the track data (loaded with
    chr=None) to a PrefixInfo built from the parsed memmap file name.

    The 'leftIndex' and 'rightIndex' prefixes are left out. Asserts that the
    merged preprocessed files exist.
    """
    assert cls.mergedPreProcFilesExist(genome, trackName, allowOverlaps)

    mergedData = TrackSource().getTrackData(trackName, genome, None, allowOverlaps)

    infoByPrefix = {}
    for prefix, smartMemmap in mergedData.items():
        if prefix in ['leftIndex', 'rightIndex']:
            continue
        parsedFn = parseMemmapFileFn(smartMemmap.getFilename())
        infoByPrefix[prefix] = PrefixInfo(*parsedFn)
    return infoByPrefix
def __init__(self, trackName):
    """
    Store the track name and create fresh collaborators: a TrackSource, a
    TrackViewLoader and a neutral track format requirement.

    No format converter is selected and no track id is known initially.
    """
    self.trackName = trackName

    # Collaborators used when loading track views.
    self._trackSource = TrackSource()
    self._trackViewLoader = TrackViewLoader()
    self._trackFormatReq = NeutralTrackFormatReq()

    # State that is filled in later.
    self.formatConverters = None
    self._trackId = None
def _getNumTrackElements(self, trackName, chr, allowOverlaps):
    """
    Return the number of track elements on the given chromosome.

    With a bounding region shelve available, the element counts of the track
    views of all bounding regions on the chromosome are summed and asserted
    to equal the shelve's total for that chromosome. Without one, the count
    is read from the chromosome-specific memmaps.
    """
    chrData = TrackSource().getTrackData(trackName, self.GENOME, chr, allowOverlaps)
    shelve = chrData.boundingRegionShelve

    if not shelve:
        # chr-specific folders have not been merged
        return self._getNumTrackElementsFromMemmap(trackName, chr, allowOverlaps)

    elementCount = 0
    for boundingRegion in shelve.getAllBoundingRegionsForChr(chr):
        elementCount += self._getTrackView(trackName, boundingRegion, allowOverlaps).getNumElements()

    assert elementCount == shelve.getTotalElementCountForChr(chr)
    return elementCount
def _getNumTrackElementsFromMemmap(self, trackName, chr, allowOverlaps):
    """
    Return the first-axis length of the chromosome-specific memmaps
    (loaded with forceChrFolders=True), or 0 when there are none.

    'leftIndex' and 'rightIndex' are ignored. All remaining arrays are
    asserted to have the same length.
    """
    chrData = TrackSource().getTrackData(trackName, self.GENOME, chr, allowOverlaps,
                                         forceChrFolders=True)

    arrayLengths = []
    for name, memmap in chrData.iteritems():
        if name in ['leftIndex', 'rightIndex']:
            continue
        arrayLengths.append(memmap.getShape()[0])

    if not arrayLengths:
        return 0

    expectedLength = arrayLengths[0]
    assert all(length == expectedLength for length in arrayLengths)
    return expectedLength
def testCreateIntensityTrack(self):
    # Creates a function track over three regions (the job is run twice), then
    # checks the bounding region lookups and the resulting 'val' array.
    def makeRegion(start, end):
        return GenomeRegion(self._genome, self._chr, start, end)

    regions = [makeRegion(1000, 5000),
               makeRegion(6000, 7000),
               makeRegion(10000, 16000)]

    analysisDef = '[dataStat=SimpleBpIntensityStat] [outTrackName=' + '^'.join(self._trackName) + \
                  '] [numDiscreteVals=10] -> CreateFunctionTrackStat'
    job = AnalysisDefJob(analysisDef, ['nums'], ['points'], regions, genome=self._genome)

    # NOTE(review): loop extent reconstructed from flattened source; only
    # job.run() is taken to be repeated - confirm.
    for _ in range(2):
        job.run()

    brShelve = BoundingRegionShelve(self._genome, self._trackName, allowOverlaps=False)

    # (probe start, probe end) -> expected BoundingRegionInfo arguments, or
    # None when the probe lies outside every bounding region and the lookup
    # must raise. Commented-out asserts in an earlier form of this test
    # expected BoundingRegionInfo(start, end, 0, 0, 0, 0) for such probes.
    lookups = [
        ((0, 1), None),
        ((2000, 2001), (1000, 5000, 0, 4000, 0, 0)),
        ((5500, 5501), None),
        ((6500, 6501), (6000, 7000, 4000, 5000, 0, 0)),
        ((8000, 8001), None),
        ((11000, 11001), (10000, 16000, 5000, 11000, 0, 0)),
        ((16500, 16501), None),
    ]

    for (start, end), expectedArgs in lookups:
        if expectedArgs is None:
            self.assertRaises(OutsideBoundingRegionError,
                              brShelve.getBoundingRegionInfo, makeRegion(start, end))
        else:
            self.assertEquals(BoundingRegionInfo(*expectedArgs),
                              brShelve.getBoundingRegionInfo(makeRegion(start, end)))

    mergedData = TrackSource().getTrackData(self._trackName, self._genome, None, False)
    self.assertListsOrDicts(['val'], mergedData.keys())
    self.assertListsOrDicts((11000, ), mergedData['val'].shape)
def _getTrackData(self):
    """
    Return the track data for the chromosome of the first bounding region,
    loaded with this object's track name, genome and allowOverlaps setting.
    """
    # Only the first bounding region is inspected; for/break is used so that
    # any iterable of regions is accepted.
    for firstRegion in self._boundingRegions:
        firstChr = firstRegion.chr
        break

    source = TrackSource()
    return source.getTrackData(self._trackName, self._genome, firstChr,
                               allowOverlaps=self._allowOverlaps)
def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """
    Verify that every id referenced in the 'edges' arrays of a linked track
    also occurs in the 'id' arrays of that track.

    Returns immediately when the track format is not linked. The ids and
    edge ids of all preprocessed chromosomes are accumulated as unique
    values, and empty strings are ignored on both sides.

    Raises InvalidFormatError naming (in sorted order) the edge ids that have
    no matching element id.
    """
    infoCollector = TrackInfoDataCollector(genome, trackName)
    if not infoCollector.getTrackFormat().isLinked():
        return

    elementIds = numpy.array([], dtype='S')
    edgeTargetIds = numpy.array([], dtype='S')

    for chrName in infoCollector.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrName, allowOverlaps)

        # De-duplicate after every chromosome so the accumulators stay small.
        withChrIds = numpy.concatenate((elementIds, chrData['id'][:]))
        elementIds = numpy.unique(withChrIds)

        withChrEdges = numpy.concatenate((edgeTargetIds, chrData['edges'][:].flatten()))
        edgeTargetIds = numpy.unique(withChrEdges)

    elementIds = elementIds[elementIds != '']
    edgeTargetIds = edgeTargetIds[edgeTargetIds != '']

    missingIds = set(edgeTargetIds) - set(elementIds)
    if missingIds:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(missingIds)))
class Track(object):
    """
    Handle for a named track.

    Fetches raw track data through a TrackSource, wraps it into a track view
    with a TrackViewLoader, and converts that view according to the format
    requirement accumulated through addFormatReq().
    """
    IS_MEMOIZABLE = True

    def __new__(cls, trackName):
        """
        Return None for an empty or missing track name, a VirtualMinimalTrack
        instance for virtual track names, and a plain instance of cls
        otherwise.
        """
        if trackName == [] or trackName is None:
            return None
        if ExternalTrackManager.isVirtualTrack(trackName):
            return VirtualMinimalTrack.__new__(VirtualMinimalTrack)
        return object.__new__(cls)

    def __init__(self, trackName):
        """Store the track name and create fresh collaborators and empty state."""
        self.trackName = trackName

        self._trackSource = TrackSource()
        self._trackViewLoader = TrackViewLoader()
        self._trackFormatReq = NeutralTrackFormatReq()

        self.formatConverters = None
        self._trackId = None

    def _getRawTrackView(self, region, borderHandling, allowOverlaps):
        """Load the unconverted track view of the region from the track source."""
        rawData = self._trackSource.getTrackData(self.trackName, region.genome,
                                                 region.chr, allowOverlaps)
        return self._trackViewLoader.loadTrackView(rawData, region, borderHandling,
                                                   allowOverlaps, self.trackName)

    def getTrackView(self, region):
        """
        Return the track view of the region, converted by the first format
        converter.

        When no converters have been set, they are looked up from the raw
        view's track format and the current format requirement. Raises
        IncompatibleTracksError when no converter exists or when the first
        converter cannot handle the conversion.
        """
        allowOverlaps = self._trackFormatReq.allowOverlaps()
        borderHandling = self._trackFormatReq.borderHandling()
        assert allowOverlaps is not None
        assert borderHandling is not None

        rawView = self._getRawTrackView(region, borderHandling, allowOverlaps)
        rawFormat = rawView.trackFormat

        if self.formatConverters is None:
            self.formatConverters = getFormatConverters(rawFormat, self._trackFormatReq)

        if self.formatConverters == []:
            valSuffix = '(' + rawFormat._val + ')' if rawFormat._val else ''
            raise IncompatibleTracksError(
                prettyPrintTrackName(self.trackName) + ' with format: '
                + str(rawFormat) + valSuffix
                + ' does not satisfy ' + str(self._trackFormatReq))

        converter = self.formatConverters[0]
        if not converter.canHandle(rawFormat, self._trackFormatReq):
            raise IncompatibleTracksError(
                getClassName(converter)
                + ' does not support conversion from ' + str(rawFormat)
                + ' to ' + str(self._trackFormatReq))

        return converter.convert(rawView)

    def addFormatReq(self, requestedTrackFormat):
        """
        Merge an additional format requirement into the current one.

        Raises IncompatibleTracksError when the merge yields None; the stored
        requirement is None in that case.
        """
        previousReq = self._trackFormatReq
        mergedReq = TrackFormatReq.merge(previousReq, requestedTrackFormat)
        self._trackFormatReq = mergedReq

        if mergedReq is None:
            raise IncompatibleTracksError(
                str(previousReq) + ' is incompatible with additional ' + str(requestedTrackFormat))

    def setFormatConverter(self, converterClassName):
        """
        Select the format converter by class name; a None name leaves the
        converters unset. May only be called while no converters are set.
        """
        assert self.formatConverters is None
        if converterClassName is None:
            return
        self.formatConverters = [getFormatConverterByName(converterClassName)]

    def getUniqueKey(self, genome):
        """
        Return a hash over the track name, track id, converter class name and
        VERSION, and the allowOverlaps/borderHandling requirements.

        Requires exactly one format converter and fully specified
        requirements. The track id is looked up via TrackInfo on first use.
        """
        assert self.formatConverters is not None and len(self.formatConverters) == 1, \
            'FC: ' + str(self.formatConverters)
        assert None not in [self._trackFormatReq.allowOverlaps(),
                            self._trackFormatReq.borderHandling()]

        if not self._trackId:
            self._trackId = TrackInfo(genome, self.trackName).id

        converter = self.formatConverters[0]
        keyParts = (tuple(self.trackName),
                    self._trackId,
                    getClassName(converter),
                    converter.VERSION,
                    self._trackFormatReq.allowOverlaps(),
                    self._trackFormatReq.borderHandling())
        return hash(keyParts)
def resetTrackSource(self):
    """Replace the current track source with a newly created TrackSource."""
    freshSource = TrackSource()
    self._trackSource = freshSource
class Track(object):
    """
    Handle for a named track with an optional title.

    Fetches raw track data through a TrackSource, wraps it into a track view
    with a TrackViewLoader, and converts that view according to the format
    requirement accumulated through addFormatReq().
    """
    IS_MEMOIZABLE = True

    def __new__(cls, trackName, trackTitle=None):
        """
        Return None for an empty or missing track name, a VirtualMinimalTrack
        instance for virtual track names, and a plain instance of cls
        otherwise.
        """
        if trackName == [] or trackName is None:
            return None
        if ExternalTrackManager.isVirtualTrack(trackName):
            return VirtualMinimalTrack.__new__(VirtualMinimalTrack)
        return object.__new__(cls)

    def __init__(self, trackName, trackTitle=None):
        """Store name and title and create fresh collaborators and empty state."""
        self.trackName = trackName
        self.trackTitle = trackTitle

        self._trackSource = TrackSource()
        self._trackViewLoader = TrackViewLoader()
        self._trackFormatReq = NeutralTrackFormatReq()

        self.formatConverters = None
        self._trackId = None
        self._randIndex = None

    def _getRawTrackView(self, region, borderHandling, allowOverlaps):
        """Load the unconverted track view of the region from the track source."""
        rawData = self._trackSource.getTrackData(self.trackName, region.genome,
                                                 region.chr, allowOverlaps)
        return self._trackViewLoader.loadTrackView(rawData, region, borderHandling,
                                                   allowOverlaps, self.trackName)

    def getTrackView(self, region):
        """
        Return the track view of the region, converted by the first format
        converter.

        When no converters have been set, they are looked up from the raw
        view's track format and the current format requirement. Raises
        IncompatibleTracksError when no converter exists or when the first
        converter cannot handle the conversion.
        """
        allowOverlaps = self._trackFormatReq.allowOverlaps()
        borderHandling = self._trackFormatReq.borderHandling()
        assert allowOverlaps is not None
        assert borderHandling is not None

        rawView = self._getRawTrackView(region, borderHandling, allowOverlaps)
        rawFormat = rawView.trackFormat

        if self.formatConverters is None:
            self.formatConverters = getFormatConverters(rawFormat, self._trackFormatReq)

        if self.formatConverters == []:
            valSuffix = '(' + rawFormat._val + ')' if rawFormat._val else ''
            raise IncompatibleTracksError(
                prettyPrintTrackName(self.trackName) + ' with format: '
                + str(rawFormat) + valSuffix
                + ' does not satisfy ' + str(self._trackFormatReq))

        converter = self.formatConverters[0]
        if not converter.canHandle(rawFormat, self._trackFormatReq):
            raise IncompatibleTracksError(
                getClassName(converter)
                + ' does not support conversion from ' + str(rawFormat)
                + ' to ' + str(self._trackFormatReq))

        return converter.convert(rawView)

    def addFormatReq(self, requestedTrackFormat):
        """
        Merge an additional format requirement into the current one.

        Raises IncompatibleTracksError when the merge yields None; the stored
        requirement is None in that case.
        """
        previousReq = self._trackFormatReq
        mergedReq = TrackFormatReq.merge(previousReq, requestedTrackFormat)
        self._trackFormatReq = mergedReq

        if mergedReq is None:
            raise IncompatibleTracksError(
                str(previousReq) + ' is incompatible with additional ' + str(requestedTrackFormat))

    def setFormatConverter(self, converterClassName):
        """
        Select the format converter by class name; a None name leaves the
        converters unset. May only be called while no converters are set.
        """
        assert self.formatConverters is None
        if converterClassName is None:
            return
        self.formatConverters = [getFormatConverterByName(converterClassName)]

    def getUniqueKey(self, genome):
        """
        Return a hash over the track name, track id, converter class name and
        VERSION, and the allowOverlaps/borderHandling requirements.

        Any component that is missing or falsy contributes '' instead. The
        track id is looked up via TrackInfo on first use.
        """
        if not self._trackId:
            self._trackId = TrackInfo(genome, self.trackName).id

        nameKey = tuple(self.trackName)
        idKey = self._trackId or ''

        if self.formatConverters:
            converterName = getClassName(self.formatConverters[0])
            converterVersion = self.formatConverters[0].VERSION
        else:
            converterName = ''
            converterVersion = ''

        overlapsKey = self._trackFormatReq.allowOverlaps() or ''
        borderKey = self._trackFormatReq.borderHandling() or ''

        return hash((nameKey, idKey, converterName, converterVersion, overlapsKey, borderKey))

    def resetTrackSource(self):
        """Replace the current track source with a newly created TrackSource."""
        self._trackSource = TrackSource()

    def setRandIndex(self, randIndex):
        """Do nothing; used only by TsBasedRandomTrack."""
        pass
class Track(object):
    """
    Handle for a named track with an optional title.

    Fetches raw track data through a TrackSource, wraps it into a track view
    with a TrackViewLoader, and converts that view according to the format
    requirement accumulated through addFormatReq(). Also records whether a
    track view has been requested.
    """
    IS_MEMOIZABLE = True

    def __new__(cls, trackName, trackTitle=None):
        """
        Return None for an empty or missing track name, a VirtualMinimalTrack
        instance for virtual track names, and a plain instance of cls
        otherwise.
        """
        if trackName == [] or trackName is None:
            return None
        if ExternalTrackManager.isVirtualTrack(trackName):
            return VirtualMinimalTrack.__new__(VirtualMinimalTrack)
        return object.__new__(cls)

    def __init__(self, trackName, trackTitle=None):
        """Store name and title and create fresh collaborators and empty state."""
        self.trackName = trackName
        self.trackTitle = trackTitle

        self._trackSource = TrackSource()
        self._trackViewLoader = TrackViewLoader()
        self._trackFormatReq = NeutralTrackFormatReq()

        self.formatConverters = None
        self._trackId = None
        self._hasBeenRead = False

    def _getRawTrackView(self, region, borderHandling, allowOverlaps):
        """Load the unconverted track view of the region from the track source."""
        rawData = self._trackSource.getTrackData(self.trackName, region.genome,
                                                 region.chr, allowOverlaps)
        return self._trackViewLoader.loadTrackView(rawData, region, borderHandling,
                                                   allowOverlaps, self.trackName)

    def getTrackView(self, region):
        """
        Return the track view of the region, converted by the first format
        converter, and flag the track as read.

        When no converters have been set, they are looked up from the raw
        view's track format and the current format requirement. Raises
        IncompatibleTracksError when no converter exists or when the first
        converter cannot handle the conversion.
        """
        allowOverlaps = self._trackFormatReq.allowOverlaps()
        borderHandling = self._trackFormatReq.borderHandling()
        assert allowOverlaps is not None
        assert borderHandling is not None

        rawView = self._getRawTrackView(region, borderHandling, allowOverlaps)
        rawFormat = rawView.trackFormat

        if self.formatConverters is None:
            self.formatConverters = getFormatConverters(rawFormat, self._trackFormatReq)

        # NOTE(review): placement reconstructed from flattened source; the flag
        # is taken to be set on every call, not only when converters are
        # looked up - confirm.
        self._hasBeenRead = True

        if self.formatConverters == []:
            valSuffix = '(' + rawFormat._val + ')' if rawFormat._val else ''
            raise IncompatibleTracksError(
                prettyPrintTrackName(self.trackName) + ' with format: '
                + str(rawFormat) + valSuffix
                + ' does not satisfy ' + str(self._trackFormatReq))

        converter = self.formatConverters[0]
        if not converter.canHandle(rawFormat, self._trackFormatReq):
            raise IncompatibleTracksError(
                getClassName(converter)
                + ' does not support conversion from ' + str(rawFormat)
                + ' to ' + str(self._trackFormatReq))

        return converter.convert(rawView)

    def addFormatReq(self, requestedTrackFormat):
        """
        Merge an additional format requirement into the current one.

        Raises IncompatibleTracksError when the merge yields None; the stored
        requirement is None in that case.
        """
        previousReq = self._trackFormatReq
        mergedReq = TrackFormatReq.merge(previousReq, requestedTrackFormat)
        self._trackFormatReq = mergedReq

        if mergedReq is None:
            raise IncompatibleTracksError(
                str(previousReq) + ' is incompatible with additional ' + str(requestedTrackFormat))

    # TODO: track.formatConverters needs a complete overhaul. It is currently used:
    # 1) to link tracks with possible choices for track conversion
    # 2) to store the choice of format converter made by the user
    # 3) to manage the default choice of format converters [=always the first item in the list]
    # 4) to hold the currently selected format converter class until needed by getTrackView

    def setFormatConverter(self, converterClassName):
        """
        Select the format converter by class name; a None name leaves the
        converters unset. May only be called while no converters are set.
        """
        assert self.formatConverters is None
        if converterClassName is None:
            return
        self.formatConverters = [getFormatConverterByName(converterClassName)]

    def hasBeenFlaggedAsRead(self):
        """Return whether the read flag has been set by getTrackView()."""
        return self._hasBeenRead

    def getUniqueKey(self, genome):
        """
        Return a hash over the track name, the track id, the format
        requirement's allowOverlaps (when not None) and borderHandling values,
        and the first converter's class name and VERSION ('' when no
        converter is set).

        When DebugConfig.VERBOSE is set, the hashed items are logged.
        """
        keyItems = [tuple(self.trackName), self._getTrackId(genome)]

        formatReq = self._trackFormatReq
        if formatReq is not None:
            allowOverlaps = formatReq.allowOverlaps()
            if allowOverlaps is not None:
                keyItems.append(allowOverlaps)
            # NOTE(review): nesting reconstructed from flattened source;
            # borderHandling is taken to be added whenever a format
            # requirement exists - confirm.
            keyItems.append(formatReq.borderHandling())

        if self.formatConverters:
            keyItems.append(getClassName(self.formatConverters[0]))
            # TODO: Move away from fixed VERSION, as these in practice are never updated. Also for statistics.
            keyItems.append(self.formatConverters[0].VERSION)
        else:
            keyItems.extend(['', ''])

        from config.DebugConfig import DebugConfig
        if DebugConfig.VERBOSE:
            from gold.application.LogSetup import logMessage
            itemTexts = ', '.join(str(item) for item in keyItems)
            logMessage('Unique key items for track "{}": '.format(self.trackName) + itemTexts)

        return hash(tuple(keyItems))

    def _getTrackId(self, genome):
        """
        Return the track id, looked up via TrackInfo on first use, or '' when
        the id is falsy.
        """
        if not self._trackId:
            self._trackId = TrackInfo(genome, self.trackName).id
        return self._trackId or ''

    def resetTrackSource(self):
        """Replace the current track source with a newly created TrackSource."""
        self._trackSource = TrackSource()
def _getTrackData(self, track, curBin, allowOverlaps):
    """
    Return the track data of the given track for the genome and chromosome
    of curBin, loaded through a newly created TrackSource.
    """
    source = TrackSource()
    return source.getTrackData(track.trackName, curBin.genome, curBin.chr, allowOverlaps)