def __init__(self, genomeAnchor, startList, endList, valList, strandList, idList, edgesList,
             weightsList, borderHandling, allowOverlaps, extraLists=OrderedDict()):
    """
    Store the element lists for the region given by genomeAnchor.

    At least one of startList, endList, valList and edgesList must be given,
    and borderHandling must be 'crop'. After the lists are stored and
    adjusted by _handlePointsAndPartitions(), every list that is still None
    gets its TrackElement accessor replaced by noneFunc, and all remaining
    lists are asserted to have the same number of elements.
    """
    assert (startList is not None) or (endList is not None) \
        or (valList is not None) or (edgesList is not None)
    assert borderHandling in ['crop']

    self.genomeAnchor = genomeAnchor.getCopy()
    self.trackFormat = TrackFormat(startList, endList, valList, strandList,
                                   idList, edgesList, weightsList, extraLists)
    self.borderHandling = borderHandling
    self.allowOverlaps = allowOverlaps
    self._trackElement = TrackElement(self)

    self._startList = startList
    self._endList = endList
    self._valList = valList
    self._strandList = strandList
    self._idList = idList
    self._edgesList = edgesList
    self._weightsList = weightsList
    # Shallow copy, so that key reassignments do not touch the caller's dict
    self._extraLists = copy(extraLists)

    self._handlePointsAndPartitions()

    # (private list attribute, TrackElement accessor) pairs, in the order
    # they were originally checked
    accessorPairs = [('_startList', 'start'),
                     ('_endList', 'end'),
                     ('_valList', 'val'),
                     ('_strandList', 'strand'),
                     ('_idList', 'id'),
                     ('_edgesList', 'edges'),
                     ('_weightsList', 'weights')]
    for listAttr, accessorName in accessorPairs:
        if getattr(self, listAttr) is None:
            setattr(self._trackElement, accessorName, noneFunc)

    self._updateNumListElements()

    allLists = [self._startList, self._endList, self._valList, self._strandList,
                self._idList, self._edgesList, self._weightsList]
    allLists = allLists + [extraList for extraList in self._extraLists.values()]
    for index, curList in enumerate(allLists):
        assert curList is None or len(curList) == self._numListElements, \
            'List (%s): ' % index + str(curList) + \
            ' (expected %s elements, found %s)' % (self._numListElements, len(curList))
def __init__(self, geSource):
    """
    Wrap geSource (via _decorateGESource) and reset all statistics containers.

    The two "categorical" flags are derived from the value type name and the
    edge weight type name reported by the TrackFormat of geSource.
    """
    self._geSource = self._decorateGESource(geSource)
    self._boundingRegionsAndGEsCorrespond = None

    valTypeName = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName()
    self._areValsCategorical = (valTypeName == 'Category')

    weightTypeName = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName()
    self._areEdgeWeightsCategorical = (weightTypeName == 'Category')

    self._valCategories = set()
    self._edgeWeightCategories = set()

    self._numElements = OrderedDefaultDict(int)
    maxStrLensFactory = partial(self._initMaxStrLens, self._getMaxStrLensKeys())
    self._maxStrLens = OrderedDefaultDict(maxStrLensFactory)
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def _assertIsCompatibleWith(self, tfReq, reqList):
    # Builds a TrackFormat for every combination of present ([]) / absent
    # (None) columns and asserts that tfReq.isCompatibleWith() agrees with a
    # direct comparison of reqList against the properties of that format.
    # An entry of None in reqList matches any property value.

    def orFalse(value, emptyValue):
        # Properties that are "empty" are compared as False
        return value if value != emptyValue else False

    for start in [None, []]:
        for end in [None, []]:
            for val in [None, []]:
                for strand in [None, []]:
                    for idList, edges, weights in [(None, None, None),
                                                   ([], None, None),
                                                   ([], [], None),
                                                   ([], [], [])]:
                        for extra in [None, {'a': [], 'b': []}]:
                            # Only combinations with at least one of
                            # start, end and val are tested
                            if [] not in [start, end, val]:
                                continue

                            tf = TrackFormat(start, end, val, strand, idList, edges, weights, extra)
                            propList = [tf.isDense(),
                                        tf.isValued(),
                                        tf.isInterval(),
                                        tf.isLinked(),
                                        tf.hasStrand(),
                                        tf.hasId(),
                                        tf.isWeighted(),
                                        tf.hasExtra(),
                                        orFalse(tf.getValTypeName(), ''),
                                        orFalse(tf.getWeightTypeName(), ''),
                                        orFalse(tf.getExtraNames(), [])]

                            isCompatible = all([(req == None or req == prop)
                                                for req, prop in zip(reqList, propList)])
                            self.assertEqual(isCompatible, tfReq.isCompatibleWith(tf))
def validateAndReturnErrors(cls, choices):
    '''
    Validate the selected input parameters.

    Returns an error text explaining the problem if the parameters are not
    valid. The GUI then shows this text to the user (if not empty) and greys
    out the execute button (even if the text is empty). Returns None if all
    parameters are valid, which enables the execute button.
    '''
    trackError = cls._checkTrack(choices, trackChoiceIndex='track', genomeChoiceIndex='genome')
    if trackError:
        return trackError

    if choices.track and not choices.attr:
        unsupportedAttrs = ', '.join(cls.UNSUPPORTED_ATTRS)
        return ('You have chosen a track with no attributes (columns) supported for splitting. '
                'Attributes that do not support splitting are: ') + unsupportedAttrs

    geSource = etm.getGESourceFromGalaxyOrVirtualTN(choices.track, choices.genome)
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

    # Dense track formats are rejected
    if trackFormat.isDense():
        formatMsg = 'The track format of the selected track file is: %s' % trackFormat.getFormatName()
        return formatMsg + ' This tool only supports track types Points, Segments, or variations of these.'
def _composeContents(self, out, hbColumns, columns, geSource, onlyNonDefault=True, singleDataLine=False):
    """
    Write header lines, the column specification line and then, for each
    bounding region tuple of geSource, an optional bounding region line
    followed by one data line per genome element.

    If singleDataLine is set, writing stops after the first data line (or
    after the first bounding region if it yields no data line).
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)

    out.write(self._composeHeaderLines(onlyNonDefault))
    out.write(self._composeColSpecLine(columns))

    brTuples = iterateOverBRTuplesWithContainedGEs(geSource, onlyYieldTwoGEs=singleDataLine)
    for boundingRegion, elements in brTuples:
        if boundingRegion is not None:
            out.write(self._composeBoundingRegionLine(boundingRegion))

        dataElements = self._removeStartElementIfApplicable(trackFormat, elements)
        for index, element in enumerate(dataElements):
            lineNum = index + 1
            # The "last element" flag compares against the unfiltered list
            isLast = (lineNum == len(elements))
            out.write(self._composeDataLine(element, hbColumns, lineNum, isLast))

            if singleDataLine:
                return

        if singleDataLine:
            return
def updateTrackInfoToVersion15(genome, trackName): "genome trackName" if isinstance(trackName, str): trackName = qcf.convertTNstrToTNListFormat(trackName) ti = TrackInfo(genome, trackName) ti.preProcOverlapRules = [] trackFormatList = [] for allowOverlaps in [True, False]: from gold.origdata.PreProcessUtils import PreProcessUtils if PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps): ti.preProcOverlapRules.append(allowOverlaps) if PreProcessUtils.mergedPreProcFilesExist(genome, trackName, allowOverlaps): mergedPrefixInfoDict = PreProcessUtils.getMergedPrefixInfoDict(genome, trackName, allowOverlaps) ti.mergedPrefixInfoDictPerOverlapRule[allowOverlaps] = mergedPrefixInfoDict trackFormatList.append(TrackFormat.createInstanceFromPrefixInfoDict(mergedPrefixInfoDict)) if trackFormatList: ti.trackFormatHash = hash(tuple(trackFormatList)) if not ti.geSourceVersion: ti.geSourceVersion = ti.preProcVersion ti.preProcVersion = '1.5' origPath = qcf.createOrigPath(genome, trackName) if os.path.exists(origPath): ti.id = ti.constructIdFromPath(genome, qcf.createOrigPath(genome, trackName), ti.geSourceVersion, ti.preProcVersion) print ti.id, ti.preProcOverlapRules, ti.mergedPrefixInfoDictPerOverlapRule, ti.trackFormatHash, ti.geSourceVersion, ti.preProcVersion ti.store() print Track(trackName).getUniqueKey(genome)
def _getBasicTrackFormat(choices, tnChoiceIndex=1, genomeChoiceIndex=0):
    """
    Return (genome, trackName, basicTrackFormat) for the selected track.

    For Galaxy tracks the format name is read from the GE source; if that
    raises a Warning, the returned format is the empty string. For other
    tracks the format name is 'Points' for nmer tracks and otherwise the
    trackFormatName stored in TrackInfo.
    """
    genome = GeneralGuiTool._getGenomeChoice(choices, genomeChoiceIndex)[0]
    tn = GeneralGuiTool._getTrackChoice(choices, tnChoiceIndex)[0]

    from quick.application.GalaxyInterface import GalaxyInterface
    from gold.description.TrackInfo import TrackInfo
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.track.TrackFormat import TrackFormat

    if not ExternalTrackManager.isGalaxyTrack(tn):
        if GalaxyInterface.isNmerTrackName(genome, tn):
            formatName = 'Points'
        else:
            formatName = TrackInfo(genome, tn).trackFormatName
        return genome, tn, GeneralGuiTool._convertToBasicTrackFormat(formatName)

    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(tn, genome)
    try:
        formatName = TrackFormat.createInstanceFromGeSource(geSource).getFormatName()
        basicFormat = GeneralGuiTool._convertToBasicTrackFormat(formatName)
    except Warning:
        return genome, tn, ''

    return genome, tn, basicFormat
def getOptionsBoxOutputFormat(cls, prevChoices):
    """
    Return the names of the file formats that all selected tracks can be
    converted to, or None if format conversion has not been chosen.

    This is a best-effort lookup: any ordinary error while reading the GSuite
    or the tracks results in an empty list. KeyboardInterrupt and SystemExit
    are not swallowed.
    """
    if prevChoices.changeFormat == cls.OUTPUT_FORMAT_CONVERT:
        try:
            # reduce is only a builtin on Python 2
            from functools import reduce
            from gold.origdata.GenomeElementSource import GenomeElementSource
            from gold.origdata.FileFormatComposer import findMatchingFileFormatComposers
            from gold.track.TrackFormat import TrackFormat

            gSuite = getGSuiteFromGalaxyTN(prevChoices.gsuite)
            selectedTracks = cls._getSelectedTracks(prevChoices, gSuite)

            allGeSources = [GenomeElementSource(track.path,
                                                genome=track.genome,
                                                printWarnings=False,
                                                suffix=track.suffix)
                            for track in selectedTracks]

            composerSetPerTrack = [
                set(findMatchingFileFormatComposers(
                    TrackFormat.createInstanceFromGeSource(geSource)))
                for geSource in allGeSources]

            # reduce() without an initial value fails on an empty sequence
            if not composerSetPerTrack:
                return []

            commonComposers = reduce(set.intersection, composerSetPerTrack)
            return [composer.fileFormatName for composer in commonComposers]
        except Exception:
            # Best effort: an option box without choices instead of a crash
            return []
def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=[]):
    """
    Build a TrackView for region from the arrays in trackData.

    trackData : see TrackSource.getTrackData {'id' : smartmemmap}
    region : see GenomeRegion

    For formats with a dense representation, the arrays are sliced directly
    by position (relative to the bounding region start, if any). Otherwise
    the slice indexes are looked up in the 'leftIndex' and 'rightIndex'
    arrays; an IOError is raised if either of them is missing.
    """
    brShelve = trackData.boundingRegionShelve
    if brShelve is not None:
        brInfo = brShelve.getBoundingRegionInfo(region)
    else:
        brInfo = None

    nonExtraNames = RESERVED_PREFIXES.keys() + ['leftIndex', 'rightIndex']
    extraArrayNames = [name for name in trackData if name not in nonExtraNames]

    reservedArrays = [TrackViewLoader._getArray(trackData, name, brInfo)
                      for name in RESERVED_PREFIXES]
    extraArrays = [TrackViewLoader._getArray(trackData, name, brInfo)
                   for name in extraArrayNames]

    formatArgs = reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]
    trackFormat = TrackFormat(*formatArgs)

    isDense = trackFormat.reprIsDense()
    if isDense:
        if brInfo is None:
            leftIndex = region.start
            rightIndex = region.end
        else:
            leftIndex = region.start - brInfo.start
            rightIndex = region.end - brInfo.start
    else:
        leftBin = CompBinManager.getBinNumber(region.start)
        rightBin = CompBinManager.getBinNumber(region.end - 1)

        if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
            raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

        leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
        rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

    def sliceAll(arrays):
        # Arrays that are not present (None) stay None
        return [(array[leftIndex:rightIndex] if array is not None else None)
                for array in arrays]

    slicedReservedArrays = sliceAll(reservedArrays)
    slicedExtraArrays = sliceAll(extraArrays)

    argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + \
              [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
    tv = TrackView(*argList)

    if not isDense:
        tv.sliceElementsAccordingToGenomeAnchor()
    return tv
def extractToFile(self, fn, outTrackName): append = False for region in GlobalBinSource(self._genome): print 'Creating segmentation for chr: ',region.chr trackView = PlainTrack(self._inTrackName).getTrackView(region) teSource = FunctionCategorizerWrapper(trackView, self._categorizerMethod, minSegLen=self._minSegLen) teSource.trackFormat = TrackFormat.createInstanceFromPrefixList(['start','end','val']) TrackExtractor._extract(teSource, outTrackName, region, fn, append=append, globalCoords=True, addSuffix=True) append = True
def __init__(self, geSource):
    """
    Initialize the base GESourceManager, cache the TrackFormat of geSource
    and replace the statistics containers with two-level default dicts.
    """
    GESourceManager.__init__(self, geSource)
    self._tf = TrackFormat.createInstanceFromGeSource(geSource)

    def twoLevelDict(leafFactory):
        # defaultdict whose values are OrderedDefaultDicts of leafFactory()
        return defaultdict(partial(OrderedDefaultDict, leafFactory))

    self._numElements = twoLevelDict(int)
    self._valCategories = twoLevelDict(set)
    self._edgeWeightCategories = twoLevelDict(set)
    self._maxStrLens = twoLevelDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
    self._maxNumEdges = twoLevelDict(int)
def testExtra(self):
    # A points format with the extra columns 'a', 'b' and 'c'
    prefixList = ['start', 'a', 'b', 'c']
    trackFormat = TrackFormat.createInstanceFromPrefixList(prefixList, 'float64', 1, 'float64', 1)

    self.assertTrue(trackFormat.hasExtra(specificExtra='a'))
    self.assertFalse(trackFormat.hasExtra(specificExtra='d'))

    self.assertEqual(['a', 'b', 'c'], trackFormat.getExtraNames())

    # A requirement listing only 'a' and 'b' is expected to be incompatible
    formatReq = TrackFormatReq(interval=False, extra=['a', 'b'])
    self.assertFalse(formatReq.isCompatibleWith(trackFormat))
def _getRawTrackView(self, region, borderHandling, allowOverlaps):
    """
    Return a TrackView holding one placeholder element (two for dense
    interval formats) for every column in the prefix list of the GE source.

    Raises IncompatibleTracksError if allowOverlaps is set and the track is
    dense or has no overlapping elements.
    """
    assert region.start == 0 and region.end == 1

    from collections import OrderedDict
    from gold.track.CommonMemmapFunctions import findEmptyVal
    from gold.track.TrackView import TrackView
    import numpy as np

    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(self.trackName, region.genome)
    prefixList = geSource.getPrefixList()
    valDataType = geSource.getValDataType()
    valDim = geSource.getValDim()
    weightDataType = geSource.getEdgeWeightDataType()
    weightDim = geSource.getEdgeWeightDim()

    tf = TrackFormat.createInstanceFromPrefixList(prefixList, valDataType, valDim,
                                                  weightDataType, weightDim)

    if allowOverlaps and (tf.isDense() or geSource.hasNoOverlappingElements()):
        raise IncompatibleTracksError(prettyPrintTrackName(self.trackName) + ' with format: '
                                      + str(tf) + ' does not satisfy ' + str(self._trackFormatReq))

    denseAndInterval = tf.isDense() and tf.isInterval()
    numEls = 2 if denseAndInterval else 1

    # A bare string dtype is replaced by a two-character string dtype
    if valDataType == 'S':
        valDataType = 'S2'
    if weightDataType == 'S':
        weightDataType = 'S2'

    reserved = dict((name, None) for name in
                    ('start', 'end', 'val', 'strand', 'id', 'edges', 'weights'))
    extraLists = OrderedDict()

    for prefix in prefixList:
        if prefix == 'start':
            reserved['start'] = np.array([-1], dtype='int32')
        elif prefix == 'end':
            endValues = [0, 1] if denseAndInterval else [0]
            reserved['end'] = np.array(endValues, dtype='int32')
        elif prefix == 'val':
            valShape = (numEls, valDim) if valDim > 1 else numEls
            emptyVals = [findEmptyVal(valDataType)] * valDim * numEls
            reserved['val'] = np.array(emptyVals, dtype=valDataType).reshape(valShape)
        elif prefix == 'strand':
            reserved['strand'] = np.array([1] * numEls, dtype='int8')
        elif prefix == 'id':
            reserved['id'] = np.array([''] * numEls, dtype='S1')
        elif prefix == 'edges':
            reserved['edges'] = np.array([['']] * numEls, dtype='S1')
        elif prefix == 'weights':
            weightShape = (numEls, 1, weightDim) if weightDim > 1 else (numEls, 1)
            emptyWeights = [[[findEmptyVal(weightDataType)]]] * weightDim * numEls
            reserved['weights'] = np.array(emptyWeights, dtype=weightDataType).reshape(weightShape)
        else:
            # Any other prefix is treated as an extra column
            extraLists[prefix] = np.array([''] * numEls, dtype='S1')

    return TrackView(region, reserved['start'], reserved['end'], reserved['val'],
                     reserved['strand'], reserved['id'], reserved['edges'],
                     reserved['weights'], borderHandling, allowOverlaps, extraLists)
def _getRawTrackView(self, region, borderHandling, allowOverlaps):
    """
    Return a TrackView holding one placeholder element (two for dense
    interval formats) for every column in the prefix list of the GE source.

    The region must have length 1. Raises IncompatibleTracksError if
    allowOverlaps is set and the track is dense or has no overlapping
    elements.
    """
    assert len(region) == 1

    from collections import OrderedDict
    from gold.track.CommonMemmapFunctions import findEmptyVal
    from gold.track.TrackView import TrackView
    import numpy as np

    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(self.trackName, region.genome)
    prefixList = geSource.getPrefixList()
    valDataType = geSource.getValDataType()
    valDim = geSource.getValDim()
    weightDataType = geSource.getEdgeWeightDataType()
    weightDim = geSource.getEdgeWeightDim()

    tf = TrackFormat.createInstanceFromPrefixList(prefixList, valDataType, valDim,
                                                  weightDataType, weightDim)

    if allowOverlaps and (tf.isDense() or geSource.hasNoOverlappingElements()):
        raise IncompatibleTracksError(prettyPrintTrackName(self.trackName) + ' with format: '
                                      + str(tf) + ' does not satisfy ' + str(self._trackFormatReq))

    denseAndInterval = tf.isDense() and tf.isInterval()
    numEls = 2 if denseAndInterval else 1

    # A bare string dtype is replaced by a two-character string dtype
    if valDataType == 'S':
        valDataType = 'S2'
    if weightDataType == 'S':
        weightDataType = 'S2'

    reserved = dict((name, None) for name in
                    ('start', 'end', 'val', 'strand', 'id', 'edges', 'weights'))
    extraLists = OrderedDict()

    for prefix in prefixList:
        if prefix == 'start':
            reserved['start'] = np.array([-1], dtype='int32')
        elif prefix == 'end':
            endValues = [0, 1] if denseAndInterval else [0]
            reserved['end'] = np.array(endValues, dtype='int32')
        elif prefix == 'val':
            valShape = (numEls, valDim) if valDim > 1 else numEls
            emptyVals = [findEmptyVal(valDataType)] * valDim * numEls
            reserved['val'] = np.array(emptyVals, dtype=valDataType).reshape(valShape)
        elif prefix == 'strand':
            reserved['strand'] = np.array([1] * numEls, dtype='int8')
        elif prefix == 'id':
            reserved['id'] = np.array([''] * numEls, dtype='S1')
        elif prefix == 'edges':
            reserved['edges'] = np.array([['']] * numEls, dtype='S1')
        elif prefix == 'weights':
            weightShape = (numEls, 1, weightDim) if weightDim > 1 else (numEls, 1)
            emptyWeights = [[[findEmptyVal(weightDataType)]]] * weightDim * numEls
            reserved['weights'] = np.array(emptyWeights, dtype=weightDataType).reshape(weightShape)
        else:
            # Any other prefix is treated as an extra column
            extraLists[prefix] = np.array([''] * numEls, dtype='S1')

    return TrackView(region, reserved['start'], reserved['end'], reserved['val'],
                     reserved['strand'], reserved['id'], reserved['edges'],
                     reserved['weights'], borderHandling, allowOverlaps, extraLists)
def getOptionsBoxConversion(prevChoices):
    """
    Return one '<from> -> <to> (track type: <type>)' option per matching
    file format composer whose file format differs from that of the source.

    Returns None if no history element has been selected. This is a
    best-effort lookup: any ordinary error results in an empty list.
    KeyboardInterrupt and SystemExit are not swallowed.
    """
    if prevChoices.history:
        try:
            geSource = UniversalConverterTool._getGESource(prevChoices)
            trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
            matchingComposers = findMatchingFileFormatComposers(trackFormat)

            return ['%s -> %s (track type: %s)' %
                    (geSource.getFileFormatName(),
                     composerInfo.fileFormatName,
                     composerInfo.trackFormatName)
                    for composerInfo in matchingComposers
                    if geSource.getFileFormatName() != composerInfo.fileFormatName]
        except Exception:
            # Best effort: an option box without choices instead of a crash
            return []
def _calcTrackFormatHash(mergedPrefixInfoDictPerOverlapRule):
    """
    Return the hash of the tuple of TrackFormat objects created from the
    merged prefix info dict of each overlap rule, in key iteration order.
    """
    trackFormats = tuple([
        TrackFormat.createInstanceFromPrefixInfoDict(
            mergedPrefixInfoDictPerOverlapRule[overlapRule])
        for overlapRule in mergedPrefixInfoDictPerOverlapRule.keys()])
    return hash(trackFormats)
def testValTypes(self):
    # Valued points with a two-dimensional float128 value
    trackFormat = TrackFormat.createInstanceFromPrefixList(['start', 'val'],
                                                           'float128', 2, 'float64', 1)

    self.assertTrue(trackFormat.isValued(specificValType='mean_sd'))
    self.assertFalse(trackFormat.isValued(specificValType='number'))

    self.assertEqual('Mean and std.dev.', trackFormat.getValTypeName())
    self.assertEqual('Valued points', trackFormat.getFormatName())

    # A requirement for val type 'tc' is expected to be incompatible
    formatReq = TrackFormatReq(interval=False, val='tc')
    self.assertFalse(formatReq.isCompatibleWith(trackFormat))
def testWeightTypes(self):
    # Linked base pairs with a three-dimensional 'S8' edge weight
    trackFormat = TrackFormat.createInstanceFromPrefixList(['id', 'edges', 'weights'],
                                                           'float64', 1, 'S8', 3)

    self.assertTrue(trackFormat.isWeighted(specificWeightType='category_vector'))
    self.assertFalse(trackFormat.isWeighted(specificWeightType='number'))

    self.assertEqual('Vector of categories', trackFormat.getWeightTypeName())
    self.assertEqual('Linked base pairs', trackFormat.getFormatName())

    # A requirement for weight type 'number' is expected to be incompatible
    formatReq = TrackFormatReq(linked=True, weights='number')
    self.assertFalse(formatReq.isCompatibleWith(trackFormat))
def testCompatibilityWithExceptions(self):
    # The second argument of isCompatibleWith lists properties that are
    # exempt from the check (as exercised by the assertions below)
    trackFormat = TrackFormat.createInstanceFromPrefixList(['start', 'val'],
                                                           'float64', 1, 'float64', 1)

    def makeReq(valType):
        # A fresh requirement object for every assertion
        return TrackFormatReq(interval=True, strand=True, val=valType)

    self.assertFalse(makeReq('number').isCompatibleWith(trackFormat))
    self.assertFalse(makeReq('number').isCompatibleWith(trackFormat, ['interval']))
    self.assertTrue(makeReq('number').isCompatibleWith(trackFormat, ['interval', 'hasStrand']))
    self.assertFalse(makeReq('tc').isCompatibleWith(trackFormat, ['interval', 'hasStrand']))
def __new__(self, geSource, brRegionList):
    """
    Create the manager subclass instance that fits the track format of
    geSource: the sparse manager for formats without dense representation,
    the number function manager for dense formats with val type 'Number'.

    Raises NotSupportedError for dense formats with any other val type.
    """
    tf = TrackFormat.createInstanceFromGeSource(geSource)

    if not tf.reprIsDense():
        managerCls = SparseOneChrSortedNoOverlapsGESourceManager
        return managerCls.__new__(managerCls, geSource, brRegionList)

    if tf.getValTypeName() != 'Number':
        raise NotSupportedError

    managerCls = NumberFunctionOneChrSortedNoOverlapsGESourceManager
    return managerCls.__new__(managerCls, geSource, brRegionList)
def _getGESourceManagerFromGESource(self, geSource):
    """
    Return a GESourceManager for geSource, or a
    SkipExtraPassDenseGESourceManager if the format has a dense
    representation and a supported val type.

    Raises NotSupportedError for dense formats with any other val type.
    """
    tf = TrackFormat.createInstanceFromGeSource(geSource)

    if not tf.reprIsDense():
        return GESourceManager(geSource)

    supportedValTypes = ['Number', 'Number (integer)', 'Case-control']
    if tf.getValTypeName() not in supportedValTypes:
        raise NotSupportedError

    return SkipExtraPassDenseGESourceManager(geSource)
def getOptionsBoxConversion(prevChoices):
    """
    Return one '<from> -> <to> (track type: <type>)' option per matching
    file format composer whose file format differs from that of the source.

    Returns None if no history element has been selected. This is a
    best-effort lookup: any ordinary error results in an empty list.
    KeyboardInterrupt and SystemExit are not swallowed.
    """
    if prevChoices.history:
        try:
            geSource = UniversalConverterTool._getGESource(prevChoices)
            trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
            matchingComposers = findMatchingFileFormatComposers(trackFormat)

            return ['%s -> %s (track type: %s)' %
                    (geSource.getFileFormatName(),
                     composerInfo.fileFormatName,
                     composerInfo.trackFormatName)
                    for composerInfo in matchingComposers
                    if geSource.getFileFormatName() != composerInfo.fileFormatName]
        except Exception:
            # Best effort: an option box without choices instead of a crash
            return []
def testExtra(self):
    # A points format with the extra columns 'a', 'b' and 'c'
    prefixList = ['start', 'a', 'b', 'c']
    trackFormat = TrackFormat.createInstanceFromPrefixList(prefixList, 'float64', 1, 'float64', 1)

    self.assertTrue(trackFormat.hasExtra(specificExtra='a'))
    self.assertFalse(trackFormat.hasExtra(specificExtra='d'))

    self.assertEqual(['a', 'b', 'c'], trackFormat.getExtraNames())

    # A requirement listing only 'a' and 'b' is expected to be incompatible
    formatReq = TrackFormatReq(interval=False, extra=['a', 'b'])
    self.assertFalse(formatReq.isCompatibleWith(trackFormat))
def testWeightTypes(self):
    # Linked base pairs with a three-dimensional 'S8' edge weight
    trackFormat = TrackFormat.createInstanceFromPrefixList(['id', 'edges', 'weights'],
                                                           'float64', 1, 'S8', 3)

    self.assertTrue(trackFormat.isWeighted(specificWeightType='category_vector'))
    self.assertFalse(trackFormat.isWeighted(specificWeightType='number'))

    self.assertEqual('Vector of categories', trackFormat.getWeightTypeName())
    self.assertEqual('Linked base pairs', trackFormat.getFormatName())

    # A requirement for weight type 'number' is expected to be incompatible
    formatReq = TrackFormatReq(linked=True, weights='number')
    self.assertFalse(formatReq.isCompatibleWith(trackFormat))
def getOptionsBoxFormat(prevChoices):
    """
    Return the conversion options for the selected track: first a
    'no conversion' entry, then one entry per matching composer whose file
    format differs from that of the track. Returns None if no track has
    been selected.
    """
    if not prevChoices.track:
        return None

    geSource = etm.getGESourceFromGalaxyOrVirtualTN(prevChoices.track, prevChoices.genome)
    tf = TrackFormat.createInstanceFromGeSource(geSource)
    matchingComposers = findMatchingFileFormatComposers(tf)

    noConversion = geSource.getFileFormatName() + \
                   ' (no conversion, track type: %s)' % tf.getFormatName()
    conversions = [noConversion]

    for composerInfo in matchingComposers:
        if geSource.getFileFormatName() != composerInfo.fileFormatName:
            conversions.append('%s -> %s (track type: %s)' %
                               (geSource.getFileFormatName(),
                                composerInfo.fileFormatName,
                                composerInfo.trackFormatName))

    return conversions
def __iter__(self):
    """
    Return a shallow copy of this object with a fresh iterator over the
    wrapped GE source.

    Raises NotSupportedError unless the prefix list of the GE source
    contains both 'start' and 'end' (i.e. functions, partitions and points
    are not supported).
    """
    iterCopy = copy(self)

    hasStartAndEnd = all([prefix in iterCopy._geSource.getPrefixList()
                          for prefix in ['start', 'end']])
    if not hasStartAndEnd:
        geSource = iterCopy._geSource
        formatName = TrackFormat.createInstanceFromPrefixList(
            geSource.getPrefixList(),
            geSource.getValDataType(),
            geSource.getValDim(),
            geSource.getEdgeWeightDataType(),
            geSource.getEdgeWeightDim()).getFormatName()
        raise NotSupportedError('Binning file must be segments. Current file format: ' + formatName)

    iterCopy._geIter = iterCopy._geSource.__iter__()
    return iterCopy
def testFormats(self):
    # Checks the properties of the TrackFormat built from every combination
    # of present ([]) / absent (None) columns where at least one of start,
    # end, val and edges is present
    for start in [None, []]:
        for end in [None, []]:
            for val in [None, []]:
                for strand in [None, []]:
                    for idList, edges, weights in [(None, None, None),
                                                   ([], None, None),
                                                   ([], [], None),
                                                   ([], [], [])]:
                        for extra in [None, {'a': [], 'b': []}]:
                            if [] not in [start, end, val, edges]:
                                continue

                            tf = TrackFormat(start, end, val, strand, idList, edges, weights, extra)

                            if extra is not None:
                                extraNames = extra.keys()
                            else:
                                extraNames = []

                            self._assertTrackFormat(tf, start == [], end == [], val == [],
                                                    strand == [], idList == [], edges == [],
                                                    weights == [],
                                                    hasExtra=extra is not None,
                                                    extra=extraNames)
def _composeContents(self, out, hbColumns, columns, geSource, onlyNonDefault=True, singleDataLine=False):
    """
    Write header lines, the column specification line and then, for each
    bounding region tuple of geSource, an optional bounding region line
    followed by one data line per genome element.

    If singleDataLine is set, writing stops after the first data line (or
    after the first bounding region if it yields no data line).
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)

    out.write(self._composeHeaderLines(onlyNonDefault))
    out.write(self._composeColSpecLine(columns))

    brTuples = iterateOverBRTuplesWithContainedGEs(geSource, onlyAddTwoGEs=singleDataLine)
    for boundingRegion, elements in brTuples:
        if boundingRegion is not None:
            out.write(self._composeBoundingRegionLine(boundingRegion))

        dataElements = self._removeStartElementIfApplicable(trackFormat, elements)
        for index, element in enumerate(dataElements):
            lineNum = index + 1
            # The "last element" flag compares against the unfiltered list
            isLast = (lineNum == len(elements))
            out.write(self._composeDataLine(element, hbColumns, lineNum, isLast))

            if singleDataLine:
                return

        if singleDataLine:
            return
def _calcTrackStatistics(self, chr, allowOverlaps):
    """
    Update the track statistics with all elements of chr, unless statistics
    for chr are already registered in self._numElements[allowOverlaps].

    For dense interval formats (genome partitions and step functions) the
    elements are walked per bounding region, so that the first element of
    each bounding region can be flagged.
    """
    if chr in self._numElements[allowOverlaps]:
        return

    tf = TrackFormat.createInstanceFromGeSource(self._geSource)

    if not (tf.isDense() and tf.isInterval()):
        for el in self._getGEBuckets(allowOverlaps)[chr]:
            self._updateTrackStatistics(el, chr, allowOverlaps)
        return

    geList = self._getGEBuckets(allowOverlaps)[chr]
    brStart = 0
    for br in self._getBRBuckets(allowOverlaps)[chr]:
        brElements = geList[brStart:brStart + br.elCount]
        for index, el in enumerate(brElements):
            self._updateTrackStatistics(el, chr, allowOverlaps,
                                        firstElInPartitionBoundingRegion=(index == 0))
        brStart += br.elCount
def _allGESources(self, trackName):
    """
    Yield one wrapped GE source per region: all regions when preprocessing,
    otherwise only the first region.

    Formats with a dense representation are wrapped with a bounding region
    covering the whole region; other formats are wrapped with an element
    counter starting at 0.
    """
    # NOTE(review): the trackName parameter is not used; the GE source is
    # created from self._trackName - confirm that this is intended
    if self._preProcess:
        regionList = self._regionList
    else:
        regionList = [self._regionList[0]]

    for region in regionList:
        self._status = "Trying to create custom track geSource for region: {}".format(region)
        geSource = self._getGeSourceCallBackFunc(self._genome, self._trackName,
                                                 region, **self._callBackArgs)

        tf = TrackFormat.createInstanceFromGeSource(geSource)
        if tf.reprIsDense():
            denseBrList = [BoundingRegionTuple(region, len(region))]
            yield BrTuplesGESourceWrapper(geSource, denseBrList)
        else:
            sparseBrList = [BoundingRegionTuple(region, 0)]
            yield GEBoundingRegionElementCounter(geSource, sparseBrList)
def __iter__(self):
    """
    Return a shallow copy of this object with a fresh iterator over the
    wrapped GE source.

    Raises NotSupportedError unless the prefix list of the GE source
    contains both 'start' and 'end' (i.e. functions, partitions and points
    are not supported).
    """
    iterCopy = copy(self)

    hasStartAndEnd = all([prefix in iterCopy._geSource.getPrefixList()
                          for prefix in ['start', 'end']])
    if not hasStartAndEnd:
        geSource = iterCopy._geSource
        formatName = TrackFormat.createInstanceFromPrefixList(
            geSource.getPrefixList(),
            geSource.getValDataType(),
            geSource.getValDim(),
            geSource.getEdgeWeightDataType(),
            geSource.getEdgeWeightDim()).getFormatName()
        raise NotSupportedError('Binning file must be segments. Current file format: ' + formatName)

    iterCopy._geIter = iterCopy._geSource.__iter__()
    return iterCopy
def extractToFile(self, fn, outTrackName): append = False for region in GlobalBinSource(self._genome): print 'Creating segmentation for chr: ', region.chr trackView = PlainTrack(self._inTrackName).getTrackView(region) teSource = FunctionCategorizerWrapper(trackView, self._categorizerMethod, minSegLen=self._minSegLen) teSource.trackFormat = TrackFormat.createInstanceFromPrefixList( ['start', 'end', 'val']) TrackExtractor._extract(teSource, outTrackName, region, fn, append=append, globalCoords=True, addSuffix=True) append = True
def _getValueTypeName(choices, tnChoiceIndex=1, genomeChoiceIndex=0):
    """
    Return the lower-cased value type name of the selected track: read from
    the GE source for Galaxy tracks, the empty string for nmer tracks and
    otherwise the markType stored in TrackInfo.
    """
    genome = GeneralGuiTool._getGenomeChoice(choices, genomeChoiceIndex)[0]
    tn = GeneralGuiTool._getTrackChoice(choices, tnChoiceIndex)[0]

    from quick.application.GalaxyInterface import GalaxyInterface
    from gold.description.TrackInfo import TrackInfo
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.track.TrackFormat import TrackFormat

    if ExternalTrackManager.isGalaxyTrack(tn):
        geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(tn, genome)
        trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
        valTypeName = trackFormat.getValTypeName()
    elif GalaxyInterface.isNmerTrackName(genome, tn):
        valTypeName = ''
    else:
        valTypeName = TrackInfo(genome, tn).markType

    return valTypeName.lower()
def testSorting(self):
    # For every 'gtrack' test case (except those marked 'no_sort'): writes
    # the case to a file, sorts it and asserts that the sorted contents
    # yield the sorted expected elements and bounding regions.
    geSourceTest = self._commonSetup()

    for caseName in geSourceTest.cases:
        if not caseName.startswith('gtrack'):
            continue

        if 'no_sort' in caseName:
            print('Test case skipped: ' + caseName)
            continue

        print(caseName)
        print('')

        case = geSourceTest.cases[caseName]
        testFn = self._writeTestFile(case)

        # Close the file handle deterministically instead of leaking it
        with open(testFn) as testFile:
            print(testFile.read())
        print('')

        sortedContents = sortGtrackFileAndReturnContents(testFn, case.genome)
        print(sortedContents)

        usesDefaultSource = case.sourceClass is None
        sourceClass = GenomeElementSource if usesDefaultSource else case.sourceClass
        forPreProcessor = usesDefaultSource

        sortedGeSource = GEDependentAttributesHolder(
            sourceClass('sortedFile.gtrack', case.genome,
                        forPreProcessor=forPreProcessor,
                        printWarnings=False,
                        strToUseInsteadOfFn=sortedContents))

        reprIsDense = TrackFormat.createInstanceFromGeSource(sortedGeSource).reprIsDense()

        if not reprIsDense:
            self.assertEquals(sorted(case.assertElementList),
                              [ge for ge in sortedGeSource])
        else:
            # The source is iterated through without comparing elements
            for ge in sortedGeSource:
                pass

        self.assertEquals(sorted(case.boundingRegionsAssertList),
                          [br for br in sortedGeSource.getBoundingRegionTuples()])
def _compose(self, out):
    """
    Write the GE source to out in wiggle format: a track line, followed per
    bounding region by a fixedStep or variableStep declaration line and one
    line per element. Bounding regions without elements are skipped.
    """
    header = 'track type=wiggle_0'
    if self._geSource.getTrackName() is not None:
        name = ':'.join(self._geSource.getTrackName()).replace(' ', '_')
        header = header + ' name=%s' % name
    print >>out, header

    tf = TrackFormat.createInstanceFromGeSource(self._geSource)
    span = self._geSource.getFixedLength()
    step = self._geSource.getFixedGapSize() + span

    isFixedStep = (tf.reprIsDense() or step > 1 or (step == 1 and span != 1))

    for brt, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
        if len(geList) == 0:
            continue

        if isFixedStep:
            self._composeFixedStepDeclarationLine(out, brt.region, step, span)
        else:
            curChr, curSpan = self._composeVariableStepDeclarationLine(out, geList[0])

        for index, ge in enumerate(geList):
            # The first element of a dense interval track is skipped when the
            # GE source reports that it adds a start element
            if index == 0 and tf.isDense() and tf.isInterval() and \
                    self._geSource.addsStartElementToDenseIntervals():
                continue

            val = self._commonFormatNumberVal(ge.val)

            if isFixedStep:
                cols = [val]
            else:
                # A new declaration line is written when chr or span changes
                if ge.chr != curChr or self._getVariableSpan(ge) != curSpan:
                    curChr, curSpan = self._composeVariableStepDeclarationLine(out, ge)
                cols = [str(ge.start + 1), val]

            print >>out, '\t'.join([str(col) for col in cols])
def validateAndReturnErrors(choices):
    '''
    Validate the selected input parameters.

    Returns an error text explaining the problem if the parameters are not
    valid. The GUI then shows this text to the user (if not empty) and greys
    out the execute button (even if the text is empty). Returns None if all
    parameters are valid, which enables the execute button.
    '''
    if not choices.genome:
        return 'Please select genome'

    if not choices.catTrack:
        return 'Please select categorical track from history'

    trackNameParts = choices.catTrack.split(':')
    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(trackNameParts, choices.genome)
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

    # Only tracks with val type 'Category' are accepted
    if trackFormat.getValTypeName() != 'Category':
        return 'Please select <b>categorical</b> track from history, current is of type ' + \
               trackFormat.getValTypeName()
def _allGESourceManagers(self, trackName, allowOverlaps):
    """
    Yield the GE source managers to preprocess for trackName.

    Without overlaps, if the overlap rule True has already been finalized,
    a single manager created from the track itself is yielded. Otherwise one
    manager is yielded per GE source that should be preprocessed; with
    overlaps, generation stops at the first GE source that is dense or has
    no overlapping elements.
    """
    trackNameStr = ':'.join(trackName)
    statusTemplate = "Trying to create GESourceManager (trackName: {}, allowOverlaps: {})"
    self._status = statusTemplate.format(trackNameStr, allowOverlaps)

    collector = PreProcMetaDataCollector(self._genome, trackName)
    prepareMsg = 'Trying to prepare preprocessing for track "%s"'

    if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True):
        self._status = prepareMsg % trackNameStr + \
                       (' (allowOverlaps: %s)' % allowOverlaps)
        yield self._getGESourceManagerFromTrack(trackName)
        return

    for geSource in self._allGESources(trackName):
        if allowOverlaps == True:
            tf = TrackFormat.createInstanceFromGeSource(geSource)
            if tf.isDense() or geSource.hasNoOverlappingElements():
                return

        if geSource.hasOrigFile():
            fileNamePart = ' (filename: "%s")' % geSource.getFileName()
        else:
            fileNamePart = ''
        self._status = prepareMsg % trackNameStr + fileNamePart + \
                       (' (allowOverlaps: %s)' % allowOverlaps)

        if PreProcessUtils.shouldPreProcessGESource(trackName, geSource, allowOverlaps):
            yield self._getGESourceManagerFromGESource(geSource)
def getTrackFormat(self):
    """
    Return a TrackFormat created from the prefix list, value type and
    edge weight type stored in self._tempTrackInfo.
    """
    info = self._tempTrackInfo
    return TrackFormat.createInstanceFromPrefixList(info.prefixList,
                                                    info.valDataType,
                                                    info.valDim,
                                                    info.weightDataType,
                                                    info.weightDim)
def getOptionsBoxFormat(prevChoices):
    """
    Return the file format names of all composers matching a track format
    with 'start' and 'end' columns. prevChoices is not used.
    """
    segmentsFormat = TrackFormat.createInstanceFromPrefixList(['start', 'end'])

    formatNames = []
    for composer in findMatchingFileFormatComposers(segmentsFormat):
        formatNames.append(composer.fileFormatName)
    return formatNames
def __new__(self, geSource):
    """
    Create a DenseStdGESourceManager instance if the track format of
    geSource has a dense representation, else a SparseStdGESourceManager.
    """
    tf = TrackFormat.createInstanceFromGeSource(geSource)

    if not tf.reprIsDense():
        return SparseStdGESourceManager.__new__(SparseStdGESourceManager, geSource)

    return DenseStdGESourceManager.__new__(DenseStdGESourceManager, geSource)
class TrackView(object):
    '''
    View of the elements of a track inside one genome region (the genome
    anchor).

    The elements are held as parallel lists (starts, ends, vals, strands,
    ids, edges, weights and any number of named extra lists). Every list
    that is not None has the same number of elements. The view can be
    iterated (yielding a shared TrackElement object), sliced relative to the
    anchor, and exported as numpy arrays with positions made relative to the
    anchor start.
    '''

    def _handlePointsAndPartitions(self):
        '''
        Normalize the element lists according to the track format.

        For partitions and step functions the start list is derived from the
        end list (each element starts where the previous one ends), and the
        first element of every other list is dropped so all lists line up.
        For points the end list is replaced by a VirtualPointEnd wrapping
        the start list.
        '''
        if self.trackFormat.isPartitionOrStepFunction():
            self._startList = self._endList[:-1]
            self._endList = self._endList[1:]
            if self._valList is not None:
                self._valList = self._valList[1:]
            if self._strandList is not None:
                self._strandList = self._strandList[1:]
            if self._idList is not None:
                self._idList = self._idList[1:]
            if self._edgesList is not None:
                self._edgesList = self._edgesList[1:]
            if self._weightsList is not None:
                self._weightsList = self._weightsList[1:]
            for key, extraList in self._extraLists.items():
                if extraList is not None:
                    self._extraLists[key] = extraList[1:]
        if self.trackFormat.isPoints():
            self._endList = VirtualPointEnd(self._startList)

    def __init__(self, genomeAnchor, startList=None, endList=None, valList=None, strandList=None, idList=None,
                 edgesList=None, weightsList=None, borderHandling='crop', allowOverlaps=False, extraLists=None):
        '''
        genomeAnchor   -- region the view covers; a copy is stored.
        startList, endList, valList, strandList, idList, edgesList,
        weightsList    -- parallel element lists, or None when absent. At
                          least one of startList, endList, valList and
                          edgesList must be given.
        borderHandling -- only 'crop' is supported.
        allowOverlaps  -- whether elements may overlap; enables the stowaway
                          filtering done during iteration and array export.
        extraLists     -- mapping from name to extra element list. Defaults
                          to an empty OrderedDict. The mapping is copied, so
                          later slicing does not alter the caller's object.
        '''
        # A fresh mapping per call instead of one default instance shared by all calls.
        if extraLists is None:
            extraLists = OrderedDict()

        assert (startList is not None) or (endList is not None) or (valList is not None) or (edgesList is not None)
        assert borderHandling in ['crop']

        self.genomeAnchor = genomeAnchor.getCopy()
        self.trackFormat = TrackFormat(startList, endList, valList, strandList, idList, edgesList, weightsList, extraLists)
        self.borderHandling = borderHandling
        self.allowOverlaps = allowOverlaps

        self._trackElement = TrackElement(self)

        self._startList = startList
        self._endList = endList
        self._valList = valList
        self._strandList = strandList
        self._idList = idList
        self._edgesList = edgesList
        self._weightsList = weightsList
        self._extraLists = copy(extraLists)

        self._handlePointsAndPartitions()

        # Accessors of absent lists are replaced by noneFunc on the shared track element.
        if self._startList is None:
            self._trackElement.start = noneFunc
        if self._endList is None:
            self._trackElement.end = noneFunc
        if self._valList is None:
            self._trackElement.val = noneFunc
        if self._strandList is None:
            self._trackElement.strand = noneFunc
        if self._idList is None:
            self._trackElement.id = noneFunc
        if self._edgesList is None:
            self._trackElement.edges = noneFunc
        if self._weightsList is None:
            self._trackElement.weights = noneFunc

        self._updateNumListElements()

        allLists = [self._startList, self._endList, self._valList, self._strandList,
                    self._idList, self._edgesList, self._weightsList] \
                   + [extraList for extraList in self._extraLists.values()]
        for i, elementList in enumerate(allLists):
            assert elementList is None or len(elementList) == self._numListElements, \
                'List (%s): ' % i + str(elementList) + ' (expected %s elements, found %s)' % (self._numListElements, len(elementList))

    def __iter__(self):
        '''
        Restart iteration and return self. The position is kept in the
        shared track element, so only one iteration can be active at a time.
        '''
        self._trackElement._index = -1
        return self

    def _updateNumListElements(self):
        '''
        Recompute the number of stored elements (_numListElements) and the
        number of elements seen when iterating (_numIterElements). The two
        can only differ when overlaps are allowed.
        '''
        self._numListElements = self._computeNumListElements()
        if self.allowOverlaps and self._numListElements > 0:
            self._numIterElements = self._computeNumIterElements()
        else:
            self._numIterElements = self._numListElements

    def _computeNumListElements(self):
        '''
        Return the length of the first list among starts, ends, vals and
        edges that is not None. Raises ShouldNotOccurError if all are None.
        '''
        for coreList in [self._startList, self._endList, self._valList, self._edgesList]:
            if coreList is not None:
                return len(coreList)
        raise ShouldNotOccurError

    def _computeNumIterElements(self):
        '''
        Return the number of elements left after stowaways are removed.
        Numpy arrays are filtered directly; other list types are counted by
        iterating over the view. Raises ShouldNotOccurError if starts, ends,
        vals and edges are all None.
        '''
        for coreList in [self._startList, self._endList, self._valList, self._edgesList]:
            if coreList is not None:
                if isinstance(coreList, numpy.ndarray):
                    return len(self._removeStowawaysFromNumpyArray(coreList))
                else:
                    return sum(1 for x in self)
        raise ShouldNotOccurError

    def __len__(self):
        '''Return the length of the genome anchor (not the number of elements).'''
        return self._bpSize()

    def getNumElements(self):
        '''Return the number of elements seen when iterating over the view.'''
        return self._numIterElements

    def _bpSize(self):
        '''Return the length of the genome anchor.'''
        return len(self.genomeAnchor)

    def next(self):
        '''
        Advance to the next element and return the shared track element.
        Raises StopIteration when no elements are left.
        '''
        self._trackElement._index += 1

        # To remove any blind passengers - segments entirely in front of genomeanchor,
        # but sorted after a larger segment crossing the border
        if self.allowOverlaps and not self.trackFormat.reprIsDense():
            while self._trackElement._index < self._numListElements and \
                    self._endList[self._trackElement._index] <= self.genomeAnchor.start:
                self._trackElement._index += 1

        if self._trackElement._index < self._numListElements:
            return self._trackElement
        else:
            raise StopIteration

    def _findLeftIndex(self):
        '''Return the index of the first element that ends after the anchor start.'''
        leftIndex = 0
        # remove track elements entirely to the left of the anchor
        while leftIndex < len(self._endList) and self._endList[leftIndex] <= self.genomeAnchor.start:
            leftIndex += 1
        return leftIndex

    def _findRightIndex(self):
        '''Return one past the index of the last element that starts before the anchor end.'''
        rightIndex = self._numListElements
        while rightIndex > 0 and self._startList[rightIndex-1] >= self.genomeAnchor.end:
            rightIndex -= 1
        return rightIndex

    def sliceElementsAccordingToGenomeAnchor(self):
        '''
        Drop, in place, the elements outside the genome anchor. Only valid
        for track formats whose representation is not dense.
        '''
        assert( not self.trackFormat.reprIsDense() )
        self._doScatteredSlicing()

    def _doScatteredSlicing(self):
        '''
        Restrict all lists, in place, to the index range of elements that
        touch the genome anchor. An anchor of length zero keeps no elements.
        Lists that are None (extra lists included) are left as None.
        '''
        leftIndex = self._findLeftIndex()
        rightIndex = self._findRightIndex()

        if self._bpSize() == 0:
            rightIndex = leftIndex

        self._startList = self._startList[leftIndex:rightIndex]
        self._endList = self._endList[leftIndex:rightIndex]

        if self._valList is not None:
            self._valList = self._valList[leftIndex:rightIndex]
        if self._strandList is not None:
            self._strandList = self._strandList[leftIndex:rightIndex]
        if self._idList is not None:
            self._idList = self._idList[leftIndex:rightIndex]
        if self._edgesList is not None:
            self._edgesList = self._edgesList[leftIndex:rightIndex]
        if self._weightsList is not None:
            self._weightsList = self._weightsList[leftIndex:rightIndex]
        for key, extraList in self._extraLists.items():
            # None extra lists are accepted by __init__, so they must survive slicing too.
            if extraList is not None:
                self._extraLists[key] = extraList[leftIndex:rightIndex]
        self._updateNumListElements()

    def _doDenseSlicing(self, i, j):
        '''
        Slice all value-carrying lists, in place, to [i:j]. The start and
        end lists are not touched. Lists that are None (extra lists
        included) are left as None.
        '''
        if self._valList is not None:
            self._valList = self._valList[i:j]
        if self._strandList is not None:
            self._strandList = self._strandList[i:j]
        if self._idList is not None:
            self._idList = self._idList[i:j]
        if self._edgesList is not None:
            self._edgesList = self._edgesList[i:j]
        if self._weightsList is not None:
            self._weightsList = self._weightsList[i:j]
        for key, extraList in self._extraLists.items():
            # None extra lists are accepted by __init__, so they must survive slicing too.
            if extraList is not None:
                self._extraLists[key] = extraList[i:j]
        self._updateNumListElements()

    def __getslice__(self, i, j):
        '''
        Return a new TrackView for trackView[i:j], where i and j are offsets
        relative to the start of this view's genome anchor. A negative j is
        counted from the anchor end. This view is not modified.
        '''
        slicedTV = TrackView(self.genomeAnchor, self._startList, self._endList, \
                             self._valList, self._strandList, self._idList, \
                             self._edgesList, self._weightsList, \
                             self.borderHandling, self.allowOverlaps, \
                             extraLists=self._extraLists)
        # Keep this view's format instead of the one derived from the already normalized lists.
        slicedTV.trackFormat = self.trackFormat

        slicedTV.genomeAnchor.start += i
        if j>=0:
            try:
                slicedTV.genomeAnchor.end = min(self.genomeAnchor.end, self.genomeAnchor.start + j)
            except FloatingPointError: # Caused by trackView[:] with self.genomeAnchor.start > 0
                slicedTV.genomeAnchor.end = self.genomeAnchor.end
        if j<0:
            slicedTV.genomeAnchor.end += j

        if self.trackFormat.reprIsDense():
            slicedTV._doDenseSlicing(i,j)
        else:
            slicedTV._doScatteredSlicing()
        return slicedTV

    def _getBpLevelModificationArray(self, indexes, vals):
        '''
        Return an array of length (anchor length + 1) holding, at each
        position in indexes, the sum of the corresponding vals, and zero
        elsewhere.
        '''
        bpLevelMod = numpy.bincount(indexes, vals)
        origLen = len(bpLevelMod)
        bpLevelMod.resize(self._bpSize()+1)
        bpLevelMod[origLen:] = 0
        return bpLevelMod

    def _commonGetBpLevelArray(self, vals):
        '''
        Return one value per position of the anchor.

        For dense representations vals is returned unchanged (overlaps are
        not allowed there and raise ShouldNotOccurError). Otherwise vals are
        added at the element starts, subtracted at the element ends, and the
        cumulative sum gives the per-position total of the covering elements.
        '''
        if self.trackFormat.reprIsDense():
            if self.allowOverlaps:
                raise ShouldNotOccurError()
            return vals
        else:
            bpLevelArray = numpy.zeros(self._bpSize()+1)
            numElements = self.getNumElements()
            if numElements > 0:
                bpLevelArray += self._getBpLevelModificationArray(self.startsAsNumpyArray(), vals)
                bpLevelArray -= self._getBpLevelModificationArray(self.endsAsNumpyArray(), vals)
                bpLevelArray = bpLevelArray.cumsum(dtype='float64')
            # The extra last position only absorbs ends equal to the anchor length.
            return bpLevelArray[:-1]

    def getBinaryBpLevelArray(self):
        '''Return a boolean array telling, per position, whether any element covers it.'''
        vals = numpy.ones(self.getNumElements(), dtype='int32')
        return numpy.array(self._commonGetBpLevelArray(vals), dtype='bool8')

    def getCoverageBpLevelArray(self):
        '''Return an int32 array with, per position, the number of elements covering it.'''
        vals = numpy.ones(self.getNumElements(), dtype='int32')
        return numpy.array(self._commonGetBpLevelArray(vals), dtype='int32')

    def getValueBpLevelArray(self, voidValue=0):
        '''
        Creates a bp-level function of any valued track. In case of scattered tracks,
        uncovered areas are filled with voidValue (which would typically be set to 0 or numpy.nan).
        In the case of overlapping regions, the values are added.
        '''
        assert self.trackFormat.isValued('number'), self.trackFormat
        vals = self.valsAsNumpyArray()
        bpLevelArray = numpy.array(self._commonGetBpLevelArray(vals), dtype=vals.dtype)
        if voidValue != 0:
            bpLevelArray[~self.getBinaryBpLevelArray()] = voidValue
        return bpLevelArray

    def _removeStowawaysFromNumpyArray(self, numpyArray):
        '''
        To remove any stowaways - segments entirely in front of genomeanchor,
        but sorted after a larger segment crossing the border.
        '''
        if self.allowOverlaps and len(numpyArray) > 0:
            numpyArray = numpyArray[numpy.where(self._endList > self.genomeAnchor.start)]
        return numpyArray

    def _commonAsNumpyArray(self, numpyArray, numpyArrayModMethod, name):
        '''
        Return numpyArray with stowaways removed and, if given,
        numpyArrayModMethod applied. Returns None when numpyArray is None.
        The name argument is not used.
        '''
        assert(self.borderHandling in ['crop'])
        if numpyArray is None:
            return None

        numpyArray = self._removeStowawaysFromNumpyArray(numpyArray)

        if numpyArrayModMethod is not None:
            return numpyArrayModMethod(numpyArray)
        else:
            return numpyArray

    def startsAsNumpyArray(self):
        '''Return the element starts relative to the anchor start, cropped at 0.'''
        return self._commonAsNumpyArray(self._startList, self._startListModMethod, 'starts')

    def _startListModMethod(self, startList):
        '''Make starts relative to the anchor start; negative results become 0.'''
        return numpy.maximum(startList - self.genomeAnchor.start, \
                             numpy.zeros(len(startList), dtype='int32'))

    def endsAsNumpyArray(self):
        '''Return the element ends relative to the anchor start, cropped at the anchor length.'''
        return self._commonAsNumpyArray(self._endList, self._endListModMethod, 'ends')

    def _endListModMethod(self, endList):
        '''Make ends relative to the anchor start; results past the anchor length are cut to it.'''
        return numpy.minimum(endList - self.genomeAnchor.start, \
                             numpy.zeros(len(endList), dtype='int32') + len(self.genomeAnchor))

    def valsAsNumpyArray(self):
        '''Return the element values (stowaways removed), or None if absent.'''
        return self._commonAsNumpyArray(self._valList, None, 'vals')

    def strandsAsNumpyArray(self):
        '''Return the element strands (stowaways removed), or None if absent.'''
        return self._commonAsNumpyArray(self._strandList, None, 'strands')

    def idsAsNumpyArray(self):
        '''Return the element ids (stowaways removed), or None if absent.'''
        return self._commonAsNumpyArray(self._idList, None, 'ids')

    def edgesAsNumpyArray(self):
        '''Return the element edges (stowaways removed), or None if absent.'''
        return self._commonAsNumpyArray(self._edgesList, None, 'edges')

    def weightsAsNumpyArray(self):
        '''Return the element weights (stowaways removed), or None if absent.'''
        return self._commonAsNumpyArray(self._weightsList, None, 'weights')

    def extrasAsNumpyArray(self, key):
        '''
        Return the extra list stored under key (stowaways removed), or None
        if that list is None. The key must exist, see hasExtra.
        '''
        assert self.hasExtra(key)
        return self._commonAsNumpyArray(self._extraLists[key], None, 'extras')

    def allExtrasAsDictOfNumpyArrays(self):
        '''Return an OrderedDict mapping every extra key to its array, in stored order.'''
        return OrderedDict([(key,self.extrasAsNumpyArray(key)) for key in self._extraLists])

    def hasExtra(self, key):
        '''Return True if an extra list is stored under key.'''
        return key in self._extraLists
def _init(self):
    '''
    Set self._allValsAreBedVals: True only when the value type name of the
    genome element source is 'Number (integer)' and every element value
    lies in the range 0..1000 (inclusive); False otherwise.
    '''
    self._allValsAreBedVals = False

    valTypeName = TrackFormat.createInstanceFromGeSource(self._geSource).getValTypeName()
    if valTypeName != 'Number (integer)':
        return

    self._allValsAreBedVals = all(0 <= element.val <= 1000 for element in self._geSource)
def _init(self):
    '''
    Decide whether all element values fit the range 0..1000 (inclusive).

    self._allValsAreBedVals stays False unless the value type name of the
    genome element source is 'Number (integer)', in which case it is set to
    whether every element value of the source is within that range.
    '''
    self._allValsAreBedVals = False

    sourceFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
    hasIntegerVals = sourceFormat.getValTypeName() == 'Number (integer)'
    if hasIntegerVals:
        self._allValsAreBedVals = all(
            (0 <= genomeElement.val <= 1000) for genomeElement in self._geSource)
# NOTE(review): the `def` line of the method that owns the statements below
# lies outside this chunk; `self` and `case` are assumed to come from it.

# Write the test case to a file and print the file contents.
testFn = self._writeTestFile(case)
print open(testFn).read()
print

# Sort the GTrack file and print the sorted contents.
sortedContents = sortGtrackFileAndReturnContents(
    testFn, case.genome)
print sortedContents

# Without an explicit source class on the test case, use GenomeElementSource
# with forPreProcessor=True.
sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass
forPreProcessor = True if case.sourceClass is None else False
# The sorted contents are handed over as a string
# (presumably parsed instead of reading 'sortedFile.gtrack' -- TODO confirm).
sortedGeSource = GEDependentAttributesHolder(sourceClass('sortedFile.gtrack', case.genome, \
                                                         forPreProcessor=forPreProcessor, \
                                                         printWarnings=False, \
                                                         strToUseInsteadOfFn=sortedContents))

reprIsDense = TrackFormat.createInstanceFromGeSource(
    sortedGeSource).reprIsDense()

if not reprIsDense:
    # Compare the sorted elements with the expected elements in sorted order.
    self.assertEquals(sorted(case.assertElementList),
                      [ge for ge in sortedGeSource])
else:
    # Dense representation: elements are not compared, but the source is
    # still iterated to the end
    # (presumably needed before the bounding regions are read -- TODO confirm).
    for ge in sortedGeSource:
        pass

# NOTE(review): indentation was lost in this chunk; this check is assumed to
# run for both branches above -- confirm against the file.
self.assertEquals(
    sorted(case.boundingRegionsAssertList),
    [br for br in sortedGeSource.getBoundingRegionTuples()])

def runTest(self):
    # Does nothing.
    pass
def getTrackFormat(self):
    '''
    Return a TrackFormat created from this object's prefix list and its
    value/weight data types and dimensions.
    '''
    formatArgs = (self._prefixList,
                  self._valDataType,
                  self._valDim,
                  self._weightDataType,
                  self._weightDim)
    return TrackFormat.createInstanceFromPrefixList(*formatArgs)
def _assertIsCompatibleWith(self, tfReq, reqList):
    '''
    Check tfReq.isCompatibleWith against every generated track format.

    All combinations of absent (None) and present (empty) start, end, val,
    strand, id/edges/weights and extra lists are tried, skipping those where
    none of start, end and val is present. For each resulting TrackFormat
    the expected answer is computed from reqList: a requirement of None
    matches anything, any other requirement must equal the corresponding
    property of the format.
    '''
    for start in [None, []]:
        for end in [None, []]:
            for val in [None, []]:
                for strand in [None, []]:
                    for idList, edges, weights in [(None, None, None), ([], None, None), ([], [], None), ([], [], [])]:
                        for extra in [None, {'a': [], 'b': []}]:
                            # At least one of start, end and val has to be present.
                            if [] not in [start, end, val]:
                                continue

                            tf = TrackFormat(start, end, val, strand, idList, edges, weights, extra)

                            # Empty type names / extra names are reported as False.
                            propList = [
                                tf.isDense(),
                                tf.isValued(),
                                tf.isInterval(),
                                tf.isLinked(),
                                tf.hasStrand(),
                                tf.hasId(),
                                tf.isWeighted(),
                                tf.hasExtra(),
                                tf.getValTypeName() if tf.getValTypeName() != '' else False,
                                tf.getWeightTypeName() if tf.getWeightTypeName() != '' else False,
                                tf.getExtraNames() if tf.getExtraNames() != [] else False,
                            ]

                            requirementMatches = [(req == None or req == prop)
                                                  for req, prop in zip(reqList, propList)]
                            expectedCompatible = False not in requirementMatches

                            self.assertEqual(expectedCompatible, tfReq.isCompatibleWith(tf))