def _removeAllTrackData(self, trackName, removeOrigData=True):
    """Remove the stored directories of a track.

    The directory returned by createDirPath is removed for both
    allowOverlaps=False and allowOverlaps=True, in that order. When
    removeOrigData is true, the directory returned by createOrigPath is
    removed as well. All removals are delegated to self._removeDir.
    """
    for overlapsAllowed in (False, True):
        procDir = createDirPath(trackName, self.GENOME, allowOverlaps=overlapsAllowed)
        self._removeDir(procDir, trackName)

    if not removeOrigData:
        return

    origDir = createOrigPath(self.GENOME, trackName)
    self._removeDir(origDir, trackName)
def _preProcessTrackToTrack(self, fromTrackName, toTrackName, allowOverlaps):
    """Preprocess one track into another under profiling.

    Both track names are prefixed with self.TRACK_NAME_PREFIX. The target
    track's directories (with and without overlaps) are removed first, then
    a PreProcessTrackGESourceJob is run through self._runWithProfiling,
    fed by a FullTrackGenomeElementSource over the source track.
    """
    fromTrackName = self.TRACK_NAME_PREFIX + fromTrackName
    toTrackName = self.TRACK_NAME_PREFIX + toTrackName

    # Resolve both target directories before removing either of them.
    targetDirs = [createDirPath(toTrackName, self.GENOME, allowOverlaps=overlapsAllowed)
                  for overlapsAllowed in (False, True)]
    for targetDir in targetDirs:
        self._removeDir(targetDir, toTrackName)

    # NOTE: this local must keep the name 'trackGESource'; the command string
    # below refers to it by name and receives it through locals().
    trackGESource = FullTrackGenomeElementSource(self.GENOME, fromTrackName,
                                                 allowOverlaps=allowOverlaps)

    command = ''.join([
        'PreProcessTrackGESourceJob(',
        repr(self.GENOME),
        ',',
        repr(toTrackName),
        ', trackGESource=trackGESource, username="******").process()',
    ])
    self._runWithProfiling(command, globals(), locals())
def renameProcTrack(genome, oldTn, newTn):
    """Move a track's createDirPath directories from oldTn to newTn.

    Done once without and once with overlaps. A missing source directory
    only produces a warning. When ONLY_SIMULATION is set, the move is
    printed instead of performed.
    """
    for allowOverlaps in (False, True):
        # The leading blank in ' without overlaps' is part of the printed text.
        overlapsText = 'with overlaps' if allowOverlaps else ' without overlaps'
        oldPath = createDirPath(oldTn, genome, allowOverlaps=allowOverlaps)

        if not os.path.exists(oldPath):
            print('Warning: TN did not exist as preproc ' + overlapsText)
            continue

        print('(renaming TN in preproc ' + overlapsText + ')')
        newPath = createDirPath(newTn, genome, allowOverlaps=allowOverlaps)

        if ONLY_SIMULATION:
            print('Would move %s to %s' % (oldPath, newPath))
            continue

        assert not os.path.exists(newPath), \
            'ERROR: Target path already exists: ' + newPath
        ensurePathExists(newPath)
        shutil.move(oldPath, newPath)
def getTrackData(self, trackName, genome, chr, allowOverlaps, forceChrFolders=False):
    """Collect the files of a track directory into a TrackData object.

    If forceChrFolders is false and the track's BoundingRegionShelve file
    exists, chr is replaced by None before the directory is looked up.
    Hidden entries and sub-directories are skipped. A bounding-region file
    is exposed as trackData.boundingRegionShelve (cached in self._fileDict);
    every other file is parsed with parseMemmapFileFn and loaded through
    self._getFile under its prefix.
    """
    trackData = TrackData()
    brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)

    if not forceChrFolders and brShelve.fileExists():
        chr = None

    trackDir = createDirPath(trackName, genome, chr, allowOverlaps)
    for baseName in os.listdir(trackDir):
        fullFn = trackDir + os.sep + baseName

        if baseName[0] == '.' or os.path.isdir(fullFn):
            continue

        if isBoundingRegionFileName(baseName):
            # Register the shelve on first sight, then hand out the cached one.
            if fullFn not in self._fileDict:
                self._fileDict[fullFn] = brShelve
            trackData.boundingRegionShelve = self._fileDict[fullFn]
        else:
            prefix, elementDim, dtypeDim, dtype = parseMemmapFileFn(baseName)
            assert prefix not in trackData
            trackData[prefix] = self._getFile(chr, trackDir, fullFn, elementDim, dtype, dtypeDim)

    return trackData
def getTrackData(self, trackName, genome, chr, allowOverlaps, forceChrFolders=False):
    """Collect the files of a track directory into a TrackData object.

    If forceChrFolders is false and the track's BoundingRegionShelve file
    exists, chr is replaced by None before the directory is looked up.
    Hidden entries and sub-directories are skipped. A bounding-region file
    is exposed as trackData.boundingRegionShelve (cached in self._fileDict);
    every other file is parsed with parseMemmapFileFn and loaded through
    self._getFile under its prefix.
    """
    trackData = TrackData()
    brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)

    if not forceChrFolders and brShelve.fileExists():
        chr = None

    trackDir = createDirPath(trackName, genome, chr, allowOverlaps)
    for baseName in os.listdir(trackDir):
        fullFn = trackDir + os.sep + baseName

        if baseName[0] == '.' or os.path.isdir(fullFn):
            continue

        if isBoundingRegionFileName(baseName):
            # Register the shelve on first sight, then hand out the cached one.
            if fullFn not in self._fileDict:
                self._fileDict[fullFn] = brShelve
            trackData.boundingRegionShelve = self._fileDict[fullFn]
        else:
            prefix, elementDim, dtypeDim, dtype = parseMemmapFileFn(baseName)
            assert prefix not in trackData
            trackData[prefix] = self._getFile(chr, trackDir, fullFn, elementDim, dtype, dtypeDim)

    return trackData
def _renameTrackNameIfIllegal(self, trackName):
    """Return trackName with each element passed through
    replaceIllegalElementsInTrackNames.

    If that changes the name and the directory for the original name
    exists, the track is renamed via renameTrack first.
    """
    legalTrackName = []
    for namePart in trackName:
        legalTrackName.append(replaceIllegalElementsInTrackNames(namePart))

    if legalTrackName != trackName:
        if os.path.exists(createDirPath(trackName, self._genome)):
            renameTrack(self._genome, trackName, legalTrackName)

    return legalTrackName
def removeChrMemmapFolders(genome, trackName, allowOverlaps):
    """Delete the directory of every chromosome reported as preprocessed.

    The chromosome list comes from
    PreProcMetaDataCollector.getPreProcessedChrs. Each directory must exist
    and be a directory (checked by assertion) before it is removed
    recursively.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    for chrName in collector.getPreProcessedChrs(allowOverlaps):
        chrDir = createDirPath(trackName, genome, chrName, allowOverlaps)
        assert os.path.exists(chrDir), 'Path does not exist: ' + chrDir
        assert os.path.isdir(chrDir), 'Path is not a directory: ' + chrDir
        shutil.rmtree(chrDir)
def removeOutdatedPreProcessedFiles(genome, trackName, allowOverlaps, mode):
    """Remove outdated preprocessed files of a track, or report what would go.

    Acts only if preprocessed files exist and the collector has not already
    flagged them as removed. With mode 'Real', plain files in the track
    directory are unlinked and sub-directories accepted by
    PreProcessUtils._isOldTypeChromDirectory are removed recursively; with
    any other mode only a message is printed. The removed-files flag is set
    afterwards in both cases. With mode 'Real', the track's preprocessing
    time is reset via TrackInfo.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    isRealRun = (mode == 'Real')

    if PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps) \
            and not collector.hasRemovedPreProcFiles(allowOverlaps):
        dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

        # Guard against deleting anything outside the processed-data tree.
        assert dirPath.startswith(Config.PROCESSED_DATA_PATH), \
            "Processed data path '%s' does not start with '%s'" % \
            (dirPath, Config.PROCESSED_DATA_PATH)

        if isRealRun:
            # '%s %s' reproduces the two-item print statement (items joined by a blank).
            print('%s %s' % ('Removing outdated preprocessed data: ', dirPath))
            for entry in os.listdir(dirPath):
                entryPath = os.path.join(dirPath, entry)
                if os.path.isfile(entryPath):
                    os.unlink(entryPath)
                if os.path.isdir(entryPath) and \
                        PreProcessUtils._isOldTypeChromDirectory(entryPath, genome):
                    shutil.rmtree(entryPath)
        else:
            print('%s %s' % ('Would now have removed outdated preprocessed data if real run: ', dirPath))

        collector.updateRemovedPreProcFilesFlag(allowOverlaps, True)

    # NOTE(review): the timestamp reset is placed at function level, i.e. it
    # runs for every 'Real' call whether or not files were removed - confirm
    # this nesting against the original layout.
    if isRealRun:
        trackInfo = TrackInfo(genome, trackName)
        trackInfo.resetTimeOfPreProcessing()
def renameProcTrack(genome, oldTn, newTn):
    """Move a track's createDirPath directories from oldTn to newTn.

    Done once without and once with overlaps. A missing source directory
    only produces a warning. When ONLY_SIMULATION is set, the move is
    printed instead of performed.
    """
    for allowOverlaps in (False, True):
        # The leading blank in ' without overlaps' is part of the printed text.
        overlapsText = 'with overlaps' if allowOverlaps else ' without overlaps'
        oldPath = createDirPath(oldTn, genome, allowOverlaps=allowOverlaps)

        if not os.path.exists(oldPath):
            print('Warning: TN did not exist as preproc ' + overlapsText)
            continue

        print('(renaming TN in preproc ' + overlapsText + ')')
        newPath = createDirPath(newTn, genome, allowOverlaps=allowOverlaps)

        if ONLY_SIMULATION:
            print('Would move %s to %s' % (oldPath, newPath))
            continue

        assert not os.path.exists(newPath), \
            'ERROR: Target path already exists: ' + newPath
        ensurePathExists(newPath)
        shutil.move(oldPath, newPath)
def merge(genome, trackName, allowOverlaps):
    """Merge the per-chromosome arrays of a track into one memmap per array.

    The chromosomes reported as preprocessed are sorted unless the track
    format is dense, then filtered through
    ChrMemmapFolderMerger._existingChrIter. For every array of the first
    chromosome, the arrays of the remaining chromosomes are combined with
    ChrMemmapFolderMerger.mergeArrays and the result is written to the file
    named by createMemmapFileFn.

    Raises EmptyGESourceError when no chromosome is left after filtering.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    existingChrs = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not existingChrs:
        raise EmptyGESourceError(
            'No data lines has been read from source file (probably because it is empty).')

    remainingChrs = existingChrs[1:]
    firstChrTrackData = TrackSource().getTrackData(trackName, genome, existingChrs[0],
                                                   allowOverlaps, forceChrFolders=True)

    # Snapshot the keys: entries are deleted from the mapping inside the loop.
    for arrayName in list(firstChrTrackData.keys()):
        mergedArray = firstChrTrackData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(firstChrTrackData[arrayName].filename)[1:3]
        del firstChrTrackData[arrayName]

        for curChr in remainingChrs:
            chrTrackData = TrackSource().getTrackData(trackName, genome, curChr,
                                                      allowOverlaps, forceChrFolders=True)

            mergedArray = ChrMemmapFolderMerger.mergeArrays(
                mergedArray, np.array(chrTrackData[arrayName][:]))

            # The merged file name carries the largest dimensions seen so far.
            curElementDim, curDtypeDim = parseMemmapFileFn(chrTrackData[arrayName].filename)[1:3]
            elementDim = max(elementDim, curElementDim)
            dtypeDim = max(dtypeDim, curDtypeDim)

            del chrTrackData[arrayName]

        mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim,
                                      str(mergedArray.dtype))

        outFile = np.memmap(mergedFn, dtype=mergedArray.dtype, mode='w+',
                            shape=mergedArray.shape)
        outFile[:] = mergedArray
        outFile.flush()

        # Drop both references before the next array is processed.
        del outFile
        del mergedArray
def _preProcessTrackToTrack(self, fromTrackName, toTrackName, allowOverlaps):
    """Preprocess one track into another under profiling.

    Both track names are prefixed with self.TRACK_NAME_PREFIX. The target
    track's directories (with and without overlaps) are removed first, then
    a PreProcessTrackGESourceJob is run through self._runWithProfiling,
    fed by a FullTrackGenomeElementSource over the source track.
    """
    fromTrackName = self.TRACK_NAME_PREFIX + fromTrackName
    toTrackName = self.TRACK_NAME_PREFIX + toTrackName

    # Resolve both target directories before removing either of them.
    targetDirs = [createDirPath(toTrackName, self.GENOME, allowOverlaps=overlapsAllowed)
                  for overlapsAllowed in (False, True)]
    for targetDir in targetDirs:
        self._removeDir(targetDir, toTrackName)

    # NOTE: this local must keep the name 'trackGESource'; the command string
    # below refers to it by name and receives it through locals().
    trackGESource = FullTrackGenomeElementSource(self.GENOME, fromTrackName,
                                                 allowOverlaps=allowOverlaps)

    command = ''.join([
        'PreProcessTrackGESourceJob(',
        repr(self.GENOME),
        ',',
        repr(toTrackName),
        ', trackGESource=trackGESource, username="******").process()',
    ])
    self._runWithProfiling(command, globals(), locals())
def _renameTrackNameIfIllegal(self, trackName):
    """Return trackName with each element passed through
    replaceIllegalElementsInTrackNames.

    If that changes the name and the directory for the original name
    exists, the track is renamed via renameTrack first.
    """
    legalTrackName = []
    for namePart in trackName:
        legalTrackName.append(replaceIllegalElementsInTrackNames(namePart))

    if legalTrackName != trackName:
        if os.path.exists(createDirPath(trackName, self._genome)):
            renameTrack(self._genome, trackName, legalTrackName)

    return legalTrackName
def renameTrack(genome, oldTn, newTn):
    """Rename a track: its track info first, then its standard track data.

    Asserts that newTn is neither equal to oldTn nor a leading part of it,
    and that the directory for oldTn exists. An exception raised by
    renameStdTrack is printed, not propagated.
    """
    # NOTE(review): the message speaks of moving a track "into itself", but a
    # newTn nested below oldTn passes this check - confirm the intended test.
    assert newTn != oldTn[:len(newTn)], \
        'ERROR: it is not allowed to move a track into itself (%s -> %s)' % \
        (':'.join(oldTn), ':'.join(newTn))

    # First check to filter out misspellings.
    oldPath = createDirPath(oldTn, genome)
    assert os.path.exists(oldPath), \
        'ERROR: TN did not exist in processed tracks: ' + oldPath

    # Renaming the track info first, in case of problems such as incomplete records.
    renameTrackInfo(genome, oldTn, newTn)

    try:
        renameStdTrack(genome, oldTn, newTn)
    except Exception as e:
        print(e)
def getSubtypes(genome, trackName, fullAccess=False):
    """Return the sub-entries of a track directory, sorted case-insensitively.

    Skipped are entries starting with '.' or '_', plain files, entries that
    GenomeInfo.isValidChr accepts, and the names 'external' and 'ucsc'.
    fullAccess is currently unused (the filter using it is disabled below).
    """
    dirPath = createDirPath(trackName, genome)

    subtypes = []
    for entry in ProcTrackOptions._getDirContents(genome, trackName):
        if entry[0] in ['.', '_']:
            continue
        if os.path.isfile(dirPath + os.sep + entry):
            continue
        if GenomeInfo.isValidChr(genome, entry):
            continue
        subtypes.append(entry)

    #fixme, just temporarily:, these dirs should start with _
    subtypes = [name for name in subtypes if name not in ['external', 'ucsc']]

    #if not fullAccess and not ProcTrackOptions._isLiteratureTrack(genome, trackName):
    #    subtypes = [x for x in subtypes if not TrackInfo(genome, trackName+[x]).private]

    return sorted(subtypes, key=str.lower)
def getSubtypes(genome, trackName, fullAccess=False):
    """Return the sub-entries of a track directory, sorted case-insensitively.

    Skipped are entries starting with '.' or '_', plain files, entries that
    GenomeInfo.isValidChr accepts, and the names 'external' and 'ucsc'.
    fullAccess is currently unused (the filter using it is disabled below).
    """
    dirPath = createDirPath(trackName, genome)

    subtypes = []
    for entry in ProcTrackOptions._getDirContents(genome, trackName):
        if entry[0] in ['.', '_']:
            continue
        if os.path.isfile(dirPath + os.sep + entry):
            continue
        if GenomeInfo.isValidChr(genome, entry):
            continue
        subtypes.append(entry)

    #fixme, just temporarily:, these dirs should start with _
    subtypes = [name for name in subtypes if name not in ['external', 'ucsc']]

    #if not fullAccess and not ProcTrackOptions._isLiteratureTrack(genome, trackName):
    #    subtypes = [x for x in subtypes if not TrackInfo(genome, trackName+[x]).private]

    return sorted(subtypes, key=str.lower)
def __init__(self, genome, trackName, allowOverlaps):
    """Set up the shelve file name and empty in-memory state for a track.

    The file name is BR_SHELVE_FILE_NAME inside the track's createDirPath
    directory. self._minimalRegion is the first element of
    MinimalBinSource(genome), or None when that source is None.
    """
    assert allowOverlaps in [False, True]

    self._genome = genome
    self._trackName = trackName

    trackDir = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)
    self._fn = trackDir + os.sep + BR_SHELVE_FILE_NAME

    self._contents = {}
    self._updatedChrs = set()

    # Imported here rather than at module level, as in the original code.
    from gtrackcore.input.userbins.UserBinSource import MinimalBinSource
    minimalBinList = MinimalBinSource(genome)
    if minimalBinList is not None:
        self._minimalRegion = minimalBinList[0]
    else:
        self._minimalRegion = None
def _preProcess(self, trackName, noOverlapsFileCount=None, withOverlapsFileCount=None,
                noOverlapsChrElCount=None, withOverlapsChrElCount=None, customBins=None):
    """Preprocess a test track from scratch and optionally verify the result.

    The track name is prefixed with self.TRACK_NAME_PREFIX, both of its
    createDirPath directories are removed, and PreProcessAllTracksJob is run
    through self._runWithProfiling. Each expectation that is not None is
    then checked:

    noOverlapsFileCount / withOverlapsFileCount
        expected number of non-hidden entries in the respective directory.
    noOverlapsChrElCount / withOverlapsChrElCount
        forwarded to self.assertChrElCounts together with customBins.
    customBins
        forwarded to self.assertChrElCounts; defaults to a fresh empty dict.
    """
    # A fresh dict per call: a '{}' default would be one object shared by all calls.
    if customBins is None:
        customBins = {}

    trackName = self.TRACK_NAME_PREFIX + trackName
    noOverlapsPath = createDirPath(trackName, self.GENOME, allowOverlaps=False)
    withOverlapsPath = createDirPath(trackName, self.GENOME, allowOverlaps=True)
    self._removeDir(noOverlapsPath, trackName)
    self._removeDir(withOverlapsPath, trackName)

    # The job is passed as source text so that _runWithProfiling can evaluate
    # it in this scope.
    self._runWithProfiling('PreProcessAllTracksJob(' + repr(self.GENOME) + ',' +
                           repr(trackName) + ', username="******").process()',
                           globals(), locals())

    if noOverlapsFileCount is not None:
        visibleEntries = [x for x in os.listdir(noOverlapsPath) if not x.startswith('.')]
        self.assertEqual(noOverlapsFileCount, len(visibleEntries))

    if withOverlapsFileCount is not None:
        visibleEntries = [x for x in os.listdir(withOverlapsPath) if not x.startswith('.')]
        self.assertEqual(withOverlapsFileCount, len(visibleEntries))

    if noOverlapsChrElCount is not None:
        self.assertChrElCounts(trackName, noOverlapsChrElCount, False, customBins)

    if withOverlapsChrElCount is not None:
        self.assertChrElCounts(trackName, withOverlapsChrElCount, True, customBins)
def _createOutputDirectory(self, genome, chr, trackName, allowOverlaps, geSourceManager):
    """Build the OutputDirectory for one chromosome of a track.

    The directory comes from createDirPath; all other constructor arguments
    are read from geSourceManager, except the chromosome length, which comes
    from GenomeInfo.getChrLen.
    """
    dirPath = createDirPath(trackName, genome, chr, allowOverlaps)

    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    # Positional arguments, in the order OutputDirectory is called with.
    outputDirArgs = [
        dirPath,
        geSourceManager.getPrefixList(),
        geSourceManager.getNumElementsForChr(chr),
        GenomeInfo.getChrLen(genome, chr),
        geSourceManager.getValDataType(),
        geSourceManager.getValDim(),
        geSourceManager.getEdgeWeightDataType(),
        geSourceManager.getEdgeWeightDim(),
        geSourceManager.getMaxNumEdgesForChr(chr),
        geSourceManager.getMaxStrLensForChr(chr),
        geSourceManager.isSorted(),
    ]
    return OutputDirectory(*outputDirArgs)
def _preProcess(self, trackName, noOverlapsFileCount=None, withOverlapsFileCount=None,
                noOverlapsChrElCount=None, withOverlapsChrElCount=None, customBins=None):
    """Preprocess a test track from scratch and optionally verify the result.

    The track name is prefixed with self.TRACK_NAME_PREFIX, both of its
    createDirPath directories are removed, and PreProcessAllTracksJob is run
    through self._runWithProfiling. Each expectation that is not None is
    then checked:

    noOverlapsFileCount / withOverlapsFileCount
        expected number of non-hidden entries in the respective directory.
    noOverlapsChrElCount / withOverlapsChrElCount
        forwarded to self.assertChrElCounts together with customBins.
    customBins
        forwarded to self.assertChrElCounts; defaults to a fresh empty dict.
    """
    # A fresh dict per call: a '{}' default would be one object shared by all calls.
    if customBins is None:
        customBins = {}

    trackName = self.TRACK_NAME_PREFIX + trackName
    noOverlapsPath = createDirPath(trackName, self.GENOME, allowOverlaps=False)
    withOverlapsPath = createDirPath(trackName, self.GENOME, allowOverlaps=True)
    self._removeDir(noOverlapsPath, trackName)
    self._removeDir(withOverlapsPath, trackName)

    # The job is passed as source text so that _runWithProfiling can evaluate
    # it in this scope.
    self._runWithProfiling('PreProcessAllTracksJob(' + repr(self.GENOME) + ',' +
                           repr(trackName) + ', username="******").process()',
                           globals(), locals())

    if noOverlapsFileCount is not None:
        visibleEntries = [x for x in os.listdir(noOverlapsPath) if not x.startswith('.')]
        self.assertEqual(noOverlapsFileCount, len(visibleEntries))

    if withOverlapsFileCount is not None:
        visibleEntries = [x for x in os.listdir(withOverlapsPath) if not x.startswith('.')]
        self.assertEqual(withOverlapsFileCount, len(visibleEntries))

    if noOverlapsChrElCount is not None:
        self.assertChrElCounts(trackName, noOverlapsChrElCount, False, customBins)

    if withOverlapsChrElCount is not None:
        self.assertChrElCounts(trackName, withOverlapsChrElCount, True, customBins)
def _createOutputDirectory(self, genome, chr, trackName, allowOverlaps, geSourceManager):
    """Build the OutputDirectory for one chromosome of a track.

    The directory comes from createDirPath; all other constructor arguments
    are read from geSourceManager, except the chromosome length, which comes
    from GenomeInfo.getChrLen.
    """
    dirPath = createDirPath(trackName, genome, chr, allowOverlaps)

    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    # Positional arguments, in the order OutputDirectory is called with.
    outputDirArgs = [
        dirPath,
        geSourceManager.getPrefixList(),
        geSourceManager.getNumElementsForChr(chr),
        GenomeInfo.getChrLen(genome, chr),
        geSourceManager.getValDataType(),
        geSourceManager.getValDim(),
        geSourceManager.getEdgeWeightDataType(),
        geSourceManager.getEdgeWeightDim(),
        geSourceManager.getMaxNumEdgesForChr(chr),
        geSourceManager.getMaxStrLensForChr(chr),
        geSourceManager.isSorted(),
    ]
    return OutputDirectory(*outputDirArgs)
def __init__(self, genome, trackName, allowOverlaps):
    """Set up the shelve file name and empty in-memory state for a track.

    The file name is BR_SHELVE_FILE_NAME inside the track's createDirPath
    directory. self._minimalRegion is the first element of
    MinimalBinSource(genome), or None when that source is None.
    """
    assert allowOverlaps in [False, True]

    self._genome = genome
    self._trackName = trackName

    trackDir = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)
    self._fn = trackDir + os.sep + BR_SHELVE_FILE_NAME

    self._contents = {}
    self._updatedChrs = set()

    # Imported here rather than at module level, as in the original code.
    from gtrackcore.input.userbins.UserBinSource import MinimalBinSource
    minimalBinList = MinimalBinSource(genome)
    if minimalBinList is not None:
        self._minimalRegion = minimalBinList[0]
    else:
        self._minimalRegion = None
def preProcFilesExist(genome, trackName, allowOverlaps):
    """Tell whether preprocessed files exist for a track.

    The value stored in PreProcMetaDataCollector is returned when it is not
    None. Otherwise the answer is True if the BoundingRegionShelve file
    exists, else the result of PreProcessUtils._hasOldTypeChromSubDirs when
    the track directory exists, else False. A freshly computed answer is
    stored back in the collector before it is returned.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    filesExist = collector.preProcFilesExist(allowOverlaps)
    if filesExist is not None:
        return filesExist

    dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)
    if BoundingRegionShelve(genome, trackName, allowOverlaps).fileExists():
        filesExist = True
    elif os.path.exists(dirPath):
        filesExist = PreProcessUtils._hasOldTypeChromSubDirs(dirPath, genome)
    else:
        filesExist = False

    collector.updatePreProcFilesExistFlag(allowOverlaps, filesExist)
    return filesExist
def renameTrack(genome, oldTn, newTn):
    """Rename a track: its track info first, then its standard track data.

    Asserts that newTn is neither equal to oldTn nor a leading part of it,
    and that the directory for oldTn exists. An exception raised by
    renameStdTrack is printed, not propagated.
    """
    # NOTE(review): the message speaks of moving a track "into itself", but a
    # newTn nested below oldTn passes this check - confirm the intended test.
    assert newTn != oldTn[:len(newTn)], \
        'ERROR: it is not allowed to move a track into itself (%s -> %s)' % \
        (':'.join(oldTn), ':'.join(newTn))

    # First check to filter out misspellings.
    oldPath = createDirPath(oldTn, genome)
    assert os.path.exists(oldPath), \
        'ERROR: TN did not exist in processed tracks: ' + oldPath

    # Renaming the track info first, in case of problems such as incomplete records.
    renameTrackInfo(genome, oldTn, newTn)

    try:
        renameStdTrack(genome, oldTn, newTn)
    except Exception as e:
        print(e)
def extract(cls, trackName, regionList, fn, fileFormatName=DEFAULT_FILE_FORMAT_NAME, globalCoords=True,
            addSuffix=False, asOriginal=False, allowOverlaps=False, ignoreEmpty=False):
    """Compose the track data of the given regions into the file fn.

    The genome is taken from the first region of regionList, which must not
    be empty. With asOriginal, the composer class is looked up from the
    track's stored file type; if that is empty or the lookup fails, the
    composer for fileFormatName is used. With addSuffix, the extension of
    fn is replaced by the composer's default suffix.

    Returns the file name written to if composeToFile reports success,
    otherwise None.
    """
    from gtrackcore.input.adapters.TrackGenomeElementSource import TrackGenomeElementSource
    from gtrackcore.extract.fileformats.FileFormatComposer import getComposerClsFromFileFormatName, \
        getComposerClsFromFileSuffix

    assert len(regionList) > 0
    for region in regionList:
        genome = region.genome
        break

    # To silently extract correctly if track type is dense: only keep
    # allowOverlaps when the with-overlaps directory exists.
    if allowOverlaps:
        allowOverlaps = os.path.exists(createDirPath(trackName, genome, allowOverlaps=True))

    trackGESource = TrackGenomeElementSource(genome, trackName, regionList, globalCoords=globalCoords,
                                             allowOverlaps=allowOverlaps, printWarnings=False)

    composerCls = None
    if asOriginal:
        ti = TrackInfo(genome, trackName)
        if ti.fileType != '':
            try:
                composerCls = getComposerClsFromFileSuffix(ti.fileType)
            except Exception:
                # Best effort: fall back to fileFormatName below. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                pass

    if composerCls is None:
        composerCls = getComposerClsFromFileFormatName(fileFormatName)

    if addSuffix:
        fn = os.path.splitext(fn)[0] + '.' + composerCls.getDefaultFileNameSuffix()

    composer = composerCls(trackGESource)
    ok = composer.composeToFile(fn, ignoreEmpty=ignoreEmpty)

    if ok:
        return fn
    return None
def extract(cls, trackName, regionList, fn, fileFormatName=DEFAULT_FILE_FORMAT_NAME, globalCoords=True,
            addSuffix=False, asOriginal=False, allowOverlaps=False, ignoreEmpty=False):
    """Compose the track data of the given regions into the file fn.

    The genome is taken from the first region of regionList, which must not
    be empty. With asOriginal, the composer class is looked up from the
    track's stored file type; if that is empty or the lookup fails, the
    composer for fileFormatName is used. With addSuffix, the extension of
    fn is replaced by the composer's default suffix.

    Returns the file name written to if composeToFile reports success,
    otherwise None.
    """
    from gtrackcore.input.adapters.TrackGenomeElementSource import TrackGenomeElementSource
    from gtrackcore.extract.fileformats.FileFormatComposer import getComposerClsFromFileFormatName, \
        getComposerClsFromFileSuffix

    assert len(regionList) > 0
    for region in regionList:
        genome = region.genome
        break

    # To silently extract correctly if track type is dense: only keep
    # allowOverlaps when the with-overlaps directory exists.
    if allowOverlaps:
        allowOverlaps = os.path.exists(createDirPath(trackName, genome, allowOverlaps=True))

    trackGESource = TrackGenomeElementSource(genome, trackName, regionList, globalCoords=globalCoords,
                                             allowOverlaps=allowOverlaps, printWarnings=False)

    composerCls = None
    if asOriginal:
        ti = TrackInfo(genome, trackName)
        if ti.fileType != '':
            try:
                composerCls = getComposerClsFromFileSuffix(ti.fileType)
            except Exception:
                # Best effort: fall back to fileFormatName below. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                pass

    if composerCls is None:
        composerCls = getComposerClsFromFileFormatName(fileFormatName)

    if addSuffix:
        fn = os.path.splitext(fn)[0] + '.' + composerCls.getDefaultFileNameSuffix()

    composer = composerCls(trackGESource)
    ok = composer.composeToFile(fn, ignoreEmpty=ignoreEmpty)

    if ok:
        return fn
    return None
def merge(genome, trackName, allowOverlaps):
    """Merge the per-chromosome arrays of a track into one memmap per array.

    The chromosomes reported as preprocessed are sorted unless the track
    format is dense, then filtered through
    ChrMemmapFolderMerger._existingChrIter. For every array of the first
    chromosome, the arrays of the remaining chromosomes are combined with
    ChrMemmapFolderMerger.mergeArrays and the result is written to the file
    named by createMemmapFileFn.

    Raises EmptyGESourceError when no chromosome is left after filtering.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    existingChrs = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not existingChrs:
        raise EmptyGESourceError(
            'No data lines has been read from source file (probably because it is empty).')

    remainingChrs = existingChrs[1:]
    firstChrTrackData = TrackSource().getTrackData(trackName, genome, existingChrs[0],
                                                   allowOverlaps, forceChrFolders=True)

    # Snapshot the keys: entries are deleted from the mapping inside the loop.
    for arrayName in list(firstChrTrackData.keys()):
        mergedArray = firstChrTrackData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(firstChrTrackData[arrayName].filename)[1:3]
        del firstChrTrackData[arrayName]

        for curChr in remainingChrs:
            chrTrackData = TrackSource().getTrackData(trackName, genome, curChr,
                                                      allowOverlaps, forceChrFolders=True)

            mergedArray = ChrMemmapFolderMerger.mergeArrays(
                mergedArray, np.array(chrTrackData[arrayName][:]))

            # The merged file name carries the largest dimensions seen so far.
            curElementDim, curDtypeDim = parseMemmapFileFn(chrTrackData[arrayName].filename)[1:3]
            elementDim = max(elementDim, curElementDim)
            dtypeDim = max(dtypeDim, curDtypeDim)

            del chrTrackData[arrayName]

        mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim,
                                      str(mergedArray.dtype))

        outFile = np.memmap(mergedFn, dtype=mergedArray.dtype, mode='w+',
                            shape=mergedArray.shape)
        outFile[:] = mergedArray
        outFile.flush()

        # Drop both references before the next array is processed.
        del outFile
        del mergedArray
def _getDirPath(genome=''):
    """Return createDirPath([], ''), after calling ensurePathExists on it.

    The genome argument is not used: the path is always built for an empty
    track name and an empty genome.
    """
    from gtrackcore.util.CommonFunctions import createDirPath, ensurePathExists

    rootPath = createDirPath([], '')
    ensurePathExists(rootPath)
    return rootPath
def _getDirContents(genome, trackName):
    """List the entries of the track's directory; [] if it does not exist."""
    trackDir = createDirPath(trackName, genome)
    if not os.path.exists(trackDir):
        return []
    return os.listdir(trackDir)
def _getDirPath(genome=''):
    """Return createDirPath([], ''), after calling ensurePathExists on it.

    The genome argument is not used: the path is always built for an empty
    track name and an empty genome.
    """
    from gtrackcore.util.CommonFunctions import createDirPath, ensurePathExists

    rootPath = createDirPath([], '')
    ensurePathExists(rootPath)
    return rootPath
def _getDirContents(genome, trackName):
    """List the entries of the track's directory; [] if it does not exist."""
    trackDir = createDirPath(trackName, genome)
    if not os.path.exists(trackDir):
        return []
    return os.listdir(trackDir)
def setUp(self):
    """Point the fixture at the test track's directory and shelve file."""
    testTrackName = ['testBoundingRegionShelve']
    self._path = createDirPath(testTrackName, 'TestGenome', allowOverlaps=False)
    self._fn = os.sep.join([self._path, 'boundingRegions.shelve'])
def setUp(self):
    """Point the fixture at the test track's directory and shelve file."""
    testTrackName = ['testBoundingRegionShelve']
    self._path = createDirPath(testTrackName, 'TestGenome', allowOverlaps=False)
    self._fn = os.sep.join([self._path, 'boundingRegions.shelve'])
def _removeAllTrackData(self, trackName, removeOrigData=True):
    """Remove the stored directories of a track.

    The directory returned by createDirPath is removed for both
    allowOverlaps=False and allowOverlaps=True, in that order. When
    removeOrigData is true, the directory returned by createOrigPath is
    removed as well. All removals are delegated to self._removeDir.
    """
    for overlapsAllowed in (False, True):
        procDir = createDirPath(trackName, self.GENOME, allowOverlaps=overlapsAllowed)
        self._removeDir(procDir, trackName)

    if not removeOrigData:
        return

    origDir = createOrigPath(self.GENOME, trackName)
    self._removeDir(origDir, trackName)
def _preProcess(self, trackName):
    """Remove both createDirPath directories of the track, then run
    PreProcessAllTracksJob for it."""
    for overlapsAllowed in (False, True):
        procDir = createDirPath(trackName, self.GENOME, allowOverlaps=overlapsAllowed)
        self._removeDir(procDir, trackName)

    job = PreProcessAllTracksJob(self.GENOME, trackName, username="******")
    job.process()