def __init__(self, geSource):
    """Set up empty statistics containers for ``geSource``.

    The source is stored in decorated form (see ``_decorateGESource``).
    Whether values and edge weights are categorical is read from the track
    format of the undecorated source. All counters start empty and
    ``_hasCalculatedStats`` starts as False.
    """
    self._geSource = self._decorateGESource(geSource)
    self._boundingRegionsAndGEsCorrespond = None

    # Build the track format once and reuse it for both categorical checks.
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
    self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
    self._areEdgeWeightsCategorical = trackFormat.getWeightTypeName() == 'Category'

    self._valCategories = set()
    self._edgeWeightCategories = set()

    self._numElements = OrderedDefaultDict(int)
    # Missing keys are initialised by _initMaxStrLens, bound to the key list.
    self._maxStrLens = OrderedDefaultDict(
        partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def __init__(self, geSource):
    """Set up empty statistics containers for ``geSource``.

    The source is stored in decorated form (see ``_decorateGESource``).
    Whether values and edge weights are categorical is read from the track
    format of the undecorated source. All counters start empty and
    ``_hasCalculatedStats`` starts as False.
    """
    self._geSource = self._decorateGESource(geSource)
    self._boundingRegionsAndGEsCorresponds = None

    # Build the track format once and reuse it for both categorical checks.
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
    self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
    self._areEdgeWeightsCategorical = trackFormat.getWeightTypeName() == 'Category'

    self._valCategories = set()
    self._edgeWeightCategories = set()

    self._numElements = OrderedDefaultDict(int)
    # Missing keys are initialised by _initMaxStrLens, bound to the key list.
    self._maxStrLens = OrderedDefaultDict(
        partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def _composeContents(self, out, hbColumns, columns, geSource, onlyNonDefault=True, singleDataLine=False):
    """Write header lines, the column specification line and data lines to ``out``.

    Elements are taken per bounding region from ``geSource``; a bounding
    region line is written whenever the region is not None. With
    ``singleDataLine`` set, writing stops after the first data line of the
    first bounding region tuple.
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)

    out.write(self._composeHeaderLines(onlyNonDefault))
    out.write(self._composeColSpecLine(columns))

    regionTuples = iterateOverBRTuplesWithContainedGEs(
        geSource, onlyYieldTwoGEs=singleDataLine)

    for boundingRegion, elements in regionTuples:
        if boundingRegion is not None:
            out.write(self._composeBoundingRegionLine(boundingRegion))

        dataElements = self._removeStartElementIfApplicable(trackFormat, elements)
        for lineNumber, element in enumerate(dataElements, 1):
            # NOTE(review): the last-line flag compares against the length of
            # the unfiltered element list - confirm this is intended when a
            # start element has been removed.
            isLast = lineNumber == len(elements)
            out.write(self._composeDataLine(element, hbColumns, lineNumber, isLast))
            if singleDataLine:
                break

        if singleDataLine:
            break
def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=[]):
    """
    Build a TrackView over ``region`` from the arrays in ``trackData``.

    trackData : see TrackSource.getTrackData {'id' : smartmemmap}
    region : see GenomeRegion
    borderHandling, allowOverlaps : passed on to the TrackView constructor
    trackName : not used by this function (kept for signature compatibility)

    Raises IOError if the track is not dense and ``trackData`` lacks the
    'leftIndex' or 'rightIndex' array.
    """
    #brShelve = BoundingRegionShelve(region.genome, trackName, allowOverlaps)
    brShelve = trackData.boundingRegionShelve
    brInfo = brShelve.getBoundingRegionInfo(region) if brShelve is not None else None

    # Names that are not "extra" arrays. Built once as a set, so it is neither
    # reconstructed per array name nor dependent on keys() returning a list.
    nonExtraNames = set(RESERVED_PREFIXES)
    nonExtraNames.update(['leftIndex', 'rightIndex'])
    extraArrayNames = [arrayName for arrayName in trackData
                       if arrayName not in nonExtraNames]

    reservedArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                      for arrayName in RESERVED_PREFIXES]
    extraArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                   for arrayName in extraArrayNames]
    trackFormat = TrackFormat(
        *(reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]))
    isDense = trackFormat.reprIsDense()

    if isDense:
        # Dense arrays are indexed directly by position, relative to the
        # bounding region start when bounding region info is available.
        if brInfo is None:
            leftIndex = region.start
            rightIndex = region.end
        else:
            leftIndex = region.start - brInfo.start
            rightIndex = region.end - brInfo.start
    else:
        leftBin = CompBinManager.getBinNumber(region.start)
        rightBin = CompBinManager.getBinNumber(region.end - 1)
        #leftBin = region.start/COMP_BIN_SIZE
        #rightBin = (region.end-1)/COMP_BIN_SIZE

        if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
            raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

        # Element index bounds are looked up per bin in the index arrays.
        leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
        rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

    slicedReservedArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                            for array in reservedArrays]
    slicedExtraArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                         for array in extraArrays]

    argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + \
              [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
    tv = TrackView(*argList)

    if not isDense:
        tv.sliceElementsAccordingToGenomeAnchor()
        #tv._doScatteredSlicing()
    return tv
def testExtra(self):
    """Check extra-column queries on a format built from prefixes start, a, b, c."""
    prefixes = ["start", "a", "b", "c"]
    trackFormat = TrackFormat.createInstanceFromPrefixList(
        prefixes, "float64", 1, "float64", 1)

    knownExtraFound = trackFormat.hasExtra(specificExtra="a")
    self.assertTrue(knownExtraFound)
    unknownExtraFound = trackFormat.hasExtra(specificExtra="d")
    self.assertFalse(unknownExtraFound)

    self.assertEqual(["a", "b", "c"], trackFormat.getExtraNames())

    # A requirement listing only extras a and b is expected to be incompatible.
    requirement = TrackFormatReq(interval=False, extra=["a", "b"])
    self.assertFalse(requirement.isCompatibleWith(trackFormat))
def __init__(self, genomeAnchor, startList, endList, valList, strandList, idList, edgesList, \
             weightsList, borderHandling, allowOverlaps, extraLists=OrderedDict()):
    """Create a view over the given element lists, anchored at ``genomeAnchor``.

    At least one of startList, endList, valList and edgesList must be given
    (not None); ``borderHandling`` must be 'crop'. A copy of the anchor and a
    shallow copy of ``extraLists`` are stored. After point/partition handling
    every non-None list must have the same number of elements.
    """
    # Identity checks: the lists may be numpy arrays, for which `!= None`
    # is an elementwise comparison rather than a None test.
    assert (startList is not None or endList is not None
            or valList is not None or edgesList is not None)
    assert borderHandling in ['crop']

    self.genomeAnchor = genomeAnchor.getCopy()
    self.trackFormat = TrackFormat(startList, endList, valList, strandList, idList, edgesList, weightsList, extraLists)
    self.borderHandling = borderHandling
    self.allowOverlaps = allowOverlaps

    self._trackElement = TrackElement(self)
    #self._bpLevelArray = None

    self._startList = startList
    self._endList = endList
    self._valList = valList
    self._strandList = strandList
    self._idList = idList
    self._edgesList = edgesList
    self._weightsList = weightsList
    self._extraLists = copy(extraLists)

    self._handlePointsAndPartitions()

    # Accessors for missing lists are replaced by noneFunc on the element.
    if self._startList is None:
        self._trackElement.start = noneFunc
    if self._endList is None:
        self._trackElement.end = noneFunc
    if self._valList is None:
        self._trackElement.val = noneFunc
    if self._strandList is None:
        self._trackElement.strand = noneFunc
    if self._idList is None:
        self._trackElement.id = noneFunc
    if self._edgesList is None:
        self._trackElement.edges = noneFunc
    if self._weightsList is None:
        self._trackElement.weights = noneFunc

    self._updateNumListElements()

    allLists = [self._startList, self._endList, self._valList, self._strandList,
                self._idList, self._edgesList, self._weightsList] \
        + [extraList for extraList in self._extraLists.values()]
    for i, elementList in enumerate(allLists):
        assert elementList is None or len(elementList) == self._numListElements, \
            'List (%s): ' % i + str(elementList) + \
            ' (expected %s elements, found %s)' % (self._numListElements, len(elementList))
def _getRawTrackView(self, region, borderHandling, allowOverlaps):
    """Build a placeholder TrackView for ``region`` from the source's format.

    One small numpy array (two elements for dense interval formats, otherwise
    one) is created per prefix reported by the genome element source.
    ``region`` must have length 1.

    Raises IncompatibleTracksError when overlaps are requested but the format
    is dense or the source reports no overlapping elements.
    """
    assert len(region) == 1

    from collections import OrderedDict
    from gtrackcore.track.memmap.CommonMemmapFunctions import findEmptyVal
    from gtrackcore.track.core.TrackView import TrackView
    import numpy as np

    source = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(self.trackName, region.genome)
    prefixes = source.getPrefixList()
    valDataType = source.getValDataType()
    valDim = source.getValDim()
    weightDataType = source.getEdgeWeightDataType()
    weightDim = source.getEdgeWeightDim()

    startList = endList = valList = strandList = None
    idList = edgesList = weightsList = None
    extraLists = OrderedDict()

    trackFormat = TrackFormat.createInstanceFromPrefixList(
        prefixes, valDataType, valDim, weightDataType, weightDim)
    if allowOverlaps and (trackFormat.isDense() or source.hasNoOverlappingElements()):
        raise IncompatibleTracksError(
            prettyPrintTrackName(self.trackName) + ' with format: '
            + str(trackFormat) + ' does not satisfy ' + str(self._trackFormatReq))

    denseAndInterval = trackFormat.isDense() and trackFormat.isInterval()
    numElements = 2 if denseAndInterval else 1

    # A bare 'S' dtype is replaced by the two-character string dtype 'S2'.
    if valDataType == 'S':
        valDataType = 'S2'
    if weightDataType == 'S':
        weightDataType = 'S2'

    for prefix in prefixes:
        if prefix == 'start':
            startList = np.array([-1], dtype='int32')
        elif prefix == 'end':
            endValues = [0, 1] if denseAndInterval else [0]
            endList = np.array(endValues, dtype='int32')
        elif prefix == 'val':
            valShape = (numElements, valDim) if valDim > 1 else numElements
            flatVals = np.array([findEmptyVal(valDataType)] * valDim * numElements,
                                dtype=valDataType)
            valList = flatVals.reshape(valShape)
        elif prefix == 'strand':
            strandList = np.array([1] * numElements, dtype='int8')
        elif prefix == 'id':
            idList = np.array([''] * numElements, dtype='S1')
        elif prefix == 'edges':
            edgesList = np.array([['']] * numElements, dtype='S1')
        elif prefix == 'weights':
            weightShape = (numElements, 1, weightDim) if weightDim > 1 else (numElements, 1)
            flatWeights = np.array(
                [[[findEmptyVal(weightDataType)]]] * weightDim * numElements,
                dtype=weightDataType)
            weightsList = flatWeights.reshape(weightShape)
        else:
            # Any other prefix is treated as an extra column of empty strings.
            extraLists[prefix] = np.array([''] * numElements, dtype='S1')

    return TrackView(region, startList, endList, valList, strandList, idList,
                     edgesList, weightsList, borderHandling, allowOverlaps, extraLists)
def testCompatibilityWithExceptions(self):
    """Check isCompatibleWith against a start/val format with exception lists."""
    trackFormat = TrackFormat.createInstanceFromPrefixList(
        ["start", "val"], "float64", 1, "float64", 1)

    # (required val type, extra positional args to isCompatibleWith, expected)
    expectations = [
        ("number", (), False),
        ("number", (["interval"],), False),
        ("number", (["interval", "hasStrand"],), True),
        ("tc", (["interval", "hasStrand"],), False),
    ]

    for valType, extraArgs, expected in expectations:
        requirement = TrackFormatReq(interval=True, strand=True, val=valType)
        check = self.assertTrue if expected else self.assertFalse
        check(requirement.isCompatibleWith(trackFormat, *extraArgs))
def testExtra(self):
    """Verify hasExtra, getExtraNames and requirement matching for extras."""
    extraNames = ['a', 'b', 'c']
    fmt = TrackFormat.createInstanceFromPrefixList(
        ['start'] + extraNames, 'float64', 1, 'float64', 1)

    self.assertTrue(
        fmt.hasExtra(specificExtra='a'))
    self.assertFalse(
        fmt.hasExtra(specificExtra='d'))
    self.assertEqual(['a', 'b', 'c'],
                     fmt.getExtraNames())

    # The test expects a requirement on extras a and b only to be rejected.
    req = TrackFormatReq(interval=False, extra=['a', 'b'])
    compatible = req.isCompatibleWith(fmt)
    self.assertFalse(compatible)
def _getGESourceManagerFromGESource(self, geSource):
    """Return a RegionBasedGESourceManager configured for ``geSource``.

    Non-dense representations get statistics calculated in an extra pass and
    elements counted in bounding regions; dense ones get neither.

    Raises NotSupportedError for a dense representation whose value type is
    not 'Number', 'Number (integer)' or 'Case-control'.
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

    if not trackFormat.reprIsDense():
        return RegionBasedGESourceManager(geSource, self._regionList,
                                          calcStatsInExtraPass=True,
                                          countElsInBoundingRegions=True)

    supportedValTypes = ['Number', 'Number (integer)', 'Case-control']
    if trackFormat.getValTypeName() not in supportedValTypes:
        raise NotSupportedError

    return RegionBasedGESourceManager(geSource, self._regionList,
                                      calcStatsInExtraPass=False,
                                      countElsInBoundingRegions=False)
def testValTypes(self):
    """Check value-type reporting for a float128 value of dimension 2."""
    trackFormat = TrackFormat.createInstanceFromPrefixList(
        ["start", "val"], "float128", 2, "float64", 1)

    isMeanSd = trackFormat.isValued(specificValType="mean_sd")
    self.assertTrue(isMeanSd)
    isNumber = trackFormat.isValued(specificValType="number")
    self.assertFalse(isNumber)

    self.assertEqual("Mean and std.dev.", trackFormat.getValTypeName())
    self.assertEqual("Valued points", trackFormat.getFormatName())

    # A requirement for value type "tc" is expected to be incompatible.
    requirement = TrackFormatReq(interval=False, val="tc")
    self.assertFalse(requirement.isCompatibleWith(trackFormat))
def testWeightTypes(self):
    """Check weight-type reporting for an S8 edge weight of dimension 3."""
    trackFormat = TrackFormat.createInstanceFromPrefixList(
        ["id", "edges", "weights"], "float64", 1, "S8", 3)

    isCategoryVector = trackFormat.isWeighted(specificWeightType="category_vector")
    self.assertTrue(isCategoryVector)
    isNumber = trackFormat.isWeighted(specificWeightType="number")
    self.assertFalse(isNumber)

    self.assertEqual("Vector of categories", trackFormat.getWeightTypeName())
    self.assertEqual("Linked base pairs", trackFormat.getFormatName())

    # A requirement for numeric weights is expected to be incompatible.
    requirement = TrackFormatReq(linked=True, weights="number")
    self.assertFalse(requirement.isCompatibleWith(trackFormat))
def testWeightTypes(self):
    """Verify weight type names and requirement matching for linked formats."""
    prefixes = ['id', 'edges', 'weights']
    fmt = TrackFormat.createInstanceFromPrefixList(prefixes, 'float64', 1, 'S8', 3)

    self.assertTrue(
        fmt.isWeighted(specificWeightType='category_vector'))
    self.assertFalse(
        fmt.isWeighted(specificWeightType='number'))

    weightTypeName = fmt.getWeightTypeName()
    self.assertEqual('Vector of categories', weightTypeName)
    formatName = fmt.getFormatName()
    self.assertEqual('Linked base pairs', formatName)

    # The test expects a numeric-weight requirement to be rejected.
    req = TrackFormatReq(linked=True, weights='number')
    compatible = req.isCompatibleWith(fmt)
    self.assertFalse(compatible)
def testValTypes(self):
    """Verify value type names and requirement matching for valued formats."""
    prefixes = ['start', 'val']
    fmt = TrackFormat.createInstanceFromPrefixList(prefixes, 'float128', 2, 'float64', 1)

    self.assertTrue(
        fmt.isValued(specificValType='mean_sd'))
    self.assertFalse(
        fmt.isValued(specificValType='number'))

    valTypeName = fmt.getValTypeName()
    self.assertEqual('Mean and std.dev.', valTypeName)
    formatName = fmt.getFormatName()
    self.assertEqual('Valued points', formatName)

    # The test expects a requirement for value type 'tc' to be rejected.
    req = TrackFormatReq(interval=False, val='tc')
    compatible = req.isCompatibleWith(fmt)
    self.assertFalse(compatible)
def _getGESourceManagerFromGESource(self, geSource):
    """Create the RegionBasedGESourceManager matching the source's format.

    Dense representations are only accepted with value type 'Number',
    'Number (integer)' or 'Case-control' (otherwise NotSupportedError is
    raised) and are created with both statistics options switched off; all
    other representations are created with both switched on.
    """
    fmt = TrackFormat.createInstanceFromGeSource(geSource)

    if fmt.reprIsDense():
        allowedValTypes = ('Number', 'Number (integer)', 'Case-control')
        if fmt.getValTypeName() not in allowedValTypes:
            raise NotSupportedError
        needsStats = False
    else:
        needsStats = True

    return RegionBasedGESourceManager(geSource, self._regionList,
                                      calcStatsInExtraPass=needsStats,
                                      countElsInBoundingRegions=needsStats)
def testCompatibilityWithExceptions(self):
    """Verify how exception lists affect requirement compatibility."""
    fmt = TrackFormat.createInstanceFromPrefixList(
        ['start', 'val'], 'float64', 1, 'float64', 1)

    # Without exceptions the requirement is expected to fail.
    numberReq = TrackFormatReq(interval=True, strand=True, val='number')
    self.assertFalse(numberReq.isCompatibleWith(fmt))

    # Excepting only 'interval' is expected to fail as well.
    numberReq = TrackFormatReq(interval=True, strand=True, val='number')
    self.assertFalse(numberReq.isCompatibleWith(fmt, ['interval']))

    # Excepting both 'interval' and 'hasStrand' is expected to pass.
    numberReq = TrackFormatReq(interval=True, strand=True, val='number')
    self.assertTrue(numberReq.isCompatibleWith(fmt, ['interval', 'hasStrand']))

    # With value type 'tc' it is expected to fail despite the exceptions.
    tcReq = TrackFormatReq(interval=True, strand=True, val='tc')
    self.assertFalse(tcReq.isCompatibleWith(fmt, ['interval', 'hasStrand']))
def __iter__(self):
    """Return a copy of this object set up to iterate over its source.

    Raises NotSupportedError unless both 'start' and 'end' are in the
    source's prefix list; the message names the current format.
    """
    iterCopy = copy(self)

    #does not support function, partitions and points:
    requiredPrefixes = ['start', 'end']
    prefixPresent = [prefix in iterCopy._geSource.getPrefixList()
                     for prefix in requiredPrefixes]
    if not all(prefixPresent):
        source = iterCopy._geSource
        currentFormat = TrackFormat.createInstanceFromPrefixList(
            source.getPrefixList(),
            source.getValDataType(),
            source.getValDim(),
            source.getEdgeWeightDataType(),
            source.getEdgeWeightDim())
        raise NotSupportedError('Binning file must be segments. Current file format: ' +
                                currentFormat.getFormatName())

    iterCopy._geIter = iterCopy._geSource.__iter__()
    return iterCopy
def testFormats(self):
    """Check TrackFormat properties for combinations of present/absent lists.

    Each list is either None (absent) or [] (present). Only combinations
    with at least one of start, end, val or edges present are checked.
    """
    linkCombinations = [(None, None, None), ([], None, None), ([], [], None), ([], [], [])]

    for start in [None, []]:
        for end in [None, []]:
            for val in [None, []]:
                for strand in [None, []]:
                    for ids, edges, weights in linkCombinations:
                        for extra in [None, {'a': [], 'b': []}]:
                            if [] not in [start, end, val, edges]:
                                continue

                            trackFormat = TrackFormat(start, end, val, strand,
                                                      ids, edges, weights, extra)
                            extraNames = extra.keys() if extra is not None else []
                            self._assertTrackFormat(
                                trackFormat,
                                start == [], end == [], val == [], strand == [],
                                ids == [], edges == [], weights == [],
                                hasExtra=extra is not None,
                                extra=extraNames)
def _composeContents(self, out, hbColumns, columns, geSource, onlyNonDefault=True, singleDataLine=False):
    """Compose the full contents into ``out``: headers, column spec, data.

    For every bounding region tuple from ``geSource`` a bounding region line
    is written when the region is not None, followed by one data line per
    element. When ``singleDataLine`` is set, composing ends after at most one
    data line of the first tuple.
    """
    fmt = TrackFormat.createInstanceFromGeSource(self._geSource)

    headerLines = self._composeHeaderLines(onlyNonDefault)
    out.write(headerLines)
    colSpecLine = self._composeColSpecLine(columns)
    out.write(colSpecLine)

    for region, regionElements in iterateOverBRTuplesWithContainedGEs(
            geSource, onlyYieldTwoGEs=singleDataLine):
        if region is not None:
            out.write(self._composeBoundingRegionLine(region))

        for index, element in enumerate(
                self._removeStartElementIfApplicable(fmt, regionElements)):
            lineCount = index + 1
            # NOTE(review): compared against the length of the original list,
            # not the list after start-element removal - confirm intent.
            dataLine = self._composeDataLine(
                element, hbColumns, lineCount, lineCount == len(regionElements))
            out.write(dataLine)
            if singleDataLine:
                break

        if singleDataLine:
            break
def __iter__(self):
    """Start iteration on a copy of this object and return that copy.

    The source must provide both 'start' and 'end' prefixes; otherwise
    NotSupportedError is raised with the name of the current format.
    """
    clone = copy(self)

    #does not support function, partitions and points:
    hasRequiredPrefixes = [name in clone._geSource.getPrefixList()
                           for name in ['start', 'end']]
    if False in hasRequiredPrefixes:
        formatName = TrackFormat.createInstanceFromPrefixList(
            clone._geSource.getPrefixList(),
            clone._geSource.getValDataType(),
            clone._geSource.getValDim(),
            clone._geSource.getEdgeWeightDataType(),
            clone._geSource.getEdgeWeightDim()).getFormatName()
        message = 'Binning file must be segments. Current file format: ' + formatName
        raise NotSupportedError(message)

    clone._geIter = clone._geSource.__iter__()
    return clone
def testSorting(self):
    """Sort each gtrack test case and compare the result with expectations.

    Cases whose name does not start with "gtrack" are ignored and cases
    containing "no_sort" are skipped with a message. For non-dense formats
    the sorted elements are compared with the sorted expected elements; the
    bounding region tuples are compared in every case.
    """
    geSourceTest = self._commonSetup()

    for caseName in geSourceTest.cases:
        if not caseName.startswith("gtrack"):
            continue

        if "no_sort" in caseName:
            print("Test case skipped: " + caseName)
            continue

        print(caseName)
        print("")

        case = geSourceTest.cases[caseName]
        testFn = self._writeTestFile(case)
        # Read through a context manager so the file handle is closed.
        with open(testFn) as testFile:
            print(testFile.read())
        print("")

        sortedContents = sortGtrackFileAndReturnContents(testFn, case.genome)
        print(sortedContents)

        usesDefaultSource = case.sourceClass is None
        sourceClass = GenomeElementSource if usesDefaultSource else case.sourceClass
        forPreProcessor = usesDefaultSource
        sortedGeSource = GEDependentAttributesHolder(
            sourceClass(
                "sortedFile.gtrack",
                case.genome,
                forPreProcessor=forPreProcessor,
                printWarnings=False,
                strToUseInsteadOfFn=sortedContents,
            )
        )

        reprIsDense = TrackFormat.createInstanceFromGeSource(sortedGeSource).reprIsDense()

        if not reprIsDense:
            self.assertEqual(sorted(case.assertElementList), [ge for ge in sortedGeSource])
        else:
            # Elements are not compared for dense formats, but the source is
            # still iterated to the end before bounding regions are read.
            for ge in sortedGeSource:
                pass

        self.assertEqual(
            sorted(case.boundingRegionsAssertList),
            [br for br in sortedGeSource.getBoundingRegionTuples()]
        )
def __init__(self, genomeAnchor, startList, endList, valList, strandList, idList, edgesList, \
             weightsList, borderHandling, allowOverlaps, extraLists=OrderedDict()):
    """Initialise the view from per-element lists anchored at ``genomeAnchor``.

    Requires at least one of startList, endList, valList and edgesList to be
    non-None and ``borderHandling`` to be 'crop'. Stores a copy of the anchor
    and a shallow copy of ``extraLists``, and asserts that all non-None lists
    end up with the same length.
    """
    coreLists = (startList, endList, valList, edgesList)
    assert not all(coreList is None for coreList in coreLists)
    assert borderHandling in ['crop']

    self.genomeAnchor = genomeAnchor.getCopy()
    self.trackFormat = TrackFormat(startList, endList, valList, strandList,
                                   idList, edgesList, weightsList, extraLists)
    self.borderHandling = borderHandling
    self.allowOverlaps = allowOverlaps

    self._trackElement = TrackElement(self)
    #self._bpLevelArray = None

    self._startList, self._endList = startList, endList
    self._valList, self._strandList = valList, strandList
    self._idList, self._edgesList = idList, edgesList
    self._weightsList = weightsList
    self._extraLists = copy(extraLists)

    self._handlePointsAndPartitions()

    # For every list that is absent, the matching element accessor is
    # replaced by noneFunc.
    accessorByListAttr = [('_startList', 'start'), ('_endList', 'end'),
                          ('_valList', 'val'), ('_strandList', 'strand'),
                          ('_idList', 'id'), ('_edgesList', 'edges'),
                          ('_weightsList', 'weights')]
    for listAttr, accessorName in accessorByListAttr:
        if getattr(self, listAttr) is None:
            setattr(self._trackElement, accessorName, noneFunc)

    self._updateNumListElements()

    listsToCheck = [self._startList, self._endList, self._valList, self._strandList,
                    self._idList, self._edgesList, self._weightsList]
    listsToCheck = listsToCheck + [extraList for extraList in self._extraLists.values()]
    for position, elementList in enumerate(listsToCheck):
        assert elementList is None or len(elementList) == self._numListElements, \
            'List (%s): ' % position + str(elementList) + \
            ' (expected %s elements, found %s)' % (self._numListElements, len(elementList))
def _compose(self, out):
    """Write the source's elements to ``out`` as a wiggle_0 track.

    A 'track type=wiggle_0' line is written first, with a name derived from
    the track name when one exists. Fixed-step output is used when the
    representation is dense or the fixed step/span say so; otherwise
    variable-step output is used, with a new declaration line whenever the
    chromosome or span changes.
    """
    if self._geSource.getTrackName() is not None:
        name = ':'.join(self._geSource.getTrackName()).replace(' ', '_')
    else:
        name = None

    nameSuffix = ' name=%s' % name if name is not None else ''
    print >> out, 'track type=wiggle_0' + nameSuffix

    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
    span = self._geSource.getFixedLength()
    step = self._geSource.getFixedGapSize() + span

    isFixedStep = (trackFormat.reprIsDense() or step > 1 or (step == 1 and span != 1))

    for regionTuple, elements in iterateOverBRTuplesWithContainedGEs(self._geSource):
        if len(elements) == 0:
            continue

        if isFixedStep:
            self._composeFixedStepDeclarationLine(out, regionTuple.region, step, span)
        else:
            curChr, curSpan = self._composeVariableStepDeclarationLine(out, elements[0])

        for index, element in enumerate(elements):
            # Skip the first element of dense interval formats when the source
            # reports that it adds a start element.
            if index == 0 and trackFormat.isDense() and trackFormat.isInterval() and \
                    self._geSource.addsStartElementToDenseIntervals():
                continue

            val = self._commonFormatNumberVal(element.val)

            if isFixedStep:
                cols = [val]
            else:
                if element.chr != curChr or self._getVariableSpan(element) != curSpan:
                    curChr, curSpan = self._composeVariableStepDeclarationLine(out, element)
                # Start positions are written with 1 added.
                cols = [str(element.start + 1), val]

            print >> out, '\t'.join([str(col) for col in cols])
def _allGESourceManagers(self, trackName, allowOverlaps):
    """Yield the GE source managers that should be preprocessed for a track.

    When overlaps are disallowed and the collector reports the overlap rule
    as finalized, a single manager built from the track itself is yielded.
    Otherwise one manager is yielded per GE source that should be
    preprocessed; with overlaps allowed, generation stops at the first source
    that is dense or has no overlapping elements. ``self._status`` is updated
    before each candidate.
    """
    collector = PreProcMetaDataCollector(self._genome, trackName)

    if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True):
        self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \
                       (' (allowOverlaps: %s)' % allowOverlaps)
        yield self._getGESourceManagerFromTrack(trackName)
        return

    for geSource in self._allGESources(trackName):
        if allowOverlaps == True:
            trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
            if trackFormat.isDense() or geSource.hasNoOverlappingElements():
                return

        fileNamePart = (' (filename: "%s")' % geSource.getFileName()
                        if geSource.hasOrigFile() else '')
        self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \
                       fileNamePart + \
                       (' (allowOverlaps: %s)' % allowOverlaps)

        if PreProcessUtils.shouldPreProcessGESource(trackName, geSource, allowOverlaps):
            yield self._getGESourceManagerFromGESource(geSource)
def _allGESourceManagers(self, trackName, allowOverlaps):
    """Generate GE source managers for preprocessing ``trackName``.

    If overlaps are disallowed and the overlap rule is reported as finalized,
    exactly one manager (from the track) is generated. Otherwise each GE
    source is considered in turn; with overlaps allowed, a dense source or a
    source without overlapping elements ends the generator. The status text
    in ``self._status`` is refreshed for every candidate.
    """
    statusPrefix = 'Trying to prepare preprocessing for track "%s"'
    metaData = PreProcMetaDataCollector(self._genome, trackName)
    useTrackDirectly = allowOverlaps == False and metaData.overlapRuleHasBeenFinalized(True)

    if useTrackDirectly:
        self._status = statusPrefix % ':'.join(trackName) + \
                       (' (allowOverlaps: %s)' % allowOverlaps)
        yield self._getGESourceManagerFromTrack(trackName)
    else:
        for source in self._allGESources(trackName):
            if allowOverlaps == True:
                fmt = TrackFormat.createInstanceFromGeSource(source)
                if fmt.isDense() or source.hasNoOverlappingElements():
                    return

            status = statusPrefix % ':'.join(trackName)
            status += (' (filename: "%s")' % source.getFileName() if source.hasOrigFile() else '')
            status += (' (allowOverlaps: %s)' % allowOverlaps)
            self._status = status

            if PreProcessUtils.shouldPreProcessGESource(trackName, source, allowOverlaps):
                yield self._getGESourceManagerFromGESource(source)
def _compose(self, out):
    """Compose a wiggle_0 track from the source's elements and print it to ``out``.

    The track line carries a name (track name joined by ':' with spaces
    replaced by '_') when the source has a track name. Data is written as
    fixed step when the representation is dense, the step is larger than 1,
    or the step is 1 with a span other than 1; otherwise as variable step.
    """
    trackName = self._geSource.getTrackName()
    name = None
    if trackName is not None:
        joinedName = ':'.join(self._geSource.getTrackName())
        name = joinedName.replace(' ', '_')

    print >>out, 'track type=wiggle_0' + (' name=%s' % name if name is not None else '')

    fmt = TrackFormat.createInstanceFromGeSource(self._geSource)
    span = self._geSource.getFixedLength()
    step = self._geSource.getFixedGapSize() + span

    isFixedStep = (fmt.reprIsDense() or step > 1 or (step == 1 and span != 1))

    for brTuple, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
        if len(geList) == 0:
            continue

        if not isFixedStep:
            curChr, curSpan = self._composeVariableStepDeclarationLine(out, geList[0])
        else:
            self._composeFixedStepDeclarationLine(out, brTuple.region, step, span)

        for position, ge in enumerate(geList):
            isAddedStartElement = (position == 0 and fmt.isDense() and fmt.isInterval()
                                   and self._geSource.addsStartElementToDenseIntervals())
            if isAddedStartElement:
                continue

            formattedVal = self._commonFormatNumberVal(ge.val)

            if not isFixedStep:
                # A changed chromosome or span needs a new declaration line.
                if ge.chr != curChr or self._getVariableSpan(ge) != curSpan:
                    curChr, curSpan = self._composeVariableStepDeclarationLine(out, ge)
                columns = [str(ge.start + 1), formattedVal]
            else:
                columns = [formattedVal]

            print >>out, '\t'.join([str(column) for column in columns])
def _getRawTrackView(self, region, borderHandling, allowOverlaps):
    """Return a TrackView for ``region`` filled with placeholder arrays.

    The arrays are derived from the prefix list and data types of the genome
    element source for this track: one element per array, or two when the
    format is both dense and interval. ``region`` must have length 1.

    Raises IncompatibleTracksError if ``allowOverlaps`` is set while the
    format is dense or the source has no overlapping elements.
    """
    assert len(region) == 1

    from collections import OrderedDict
    from gtrackcore.track.memmap.CommonMemmapFunctions import findEmptyVal
    from gtrackcore.track.core.TrackView import TrackView
    import numpy as np

    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(
        self.trackName, region.genome)
    prefixList = geSource.getPrefixList()
    valType = geSource.getValDataType()
    valDim = geSource.getValDim()
    weightType = geSource.getEdgeWeightDataType()
    weightDim = geSource.getEdgeWeightDim()

    starts, ends, vals, strands, ids, edges, weights = (None,) * 7
    extras = OrderedDict()

    fmt = TrackFormat.createInstanceFromPrefixList(
        prefixList, valType, valDim, weightType, weightDim)
    overlapsImpossible = fmt.isDense() or geSource.hasNoOverlappingElements() \
        if allowOverlaps else False
    if allowOverlaps and overlapsImpossible:
        message = prettyPrintTrackName(self.trackName) + ' with format: ' \
            + str(fmt) + ' does not satisfy ' + str(self._trackFormatReq)
        raise IncompatibleTracksError(message)

    denseAndInterval = fmt.isDense() and fmt.isInterval()
    if denseAndInterval:
        count = 2
    else:
        count = 1

    # Bare string dtypes are widened to two characters.
    if valType == 'S':
        valType = 'S2'
    if weightType == 'S':
        weightType = 'S2'

    for prefix in prefixList:
        if prefix == 'start':
            starts = np.array([-1], dtype='int32')
        elif prefix == 'end':
            if denseAndInterval:
                ends = np.array([0, 1], dtype='int32')
            else:
                ends = np.array([0], dtype='int32')
        elif prefix == 'val':
            emptyVals = [findEmptyVal(valType)] * valDim * count
            vals = np.array(emptyVals, dtype=valType).reshape(
                (count, valDim) if valDim > 1 else count)
        elif prefix == 'strand':
            strands = np.array([1] * count, dtype='int8')
        elif prefix == 'id':
            ids = np.array([''] * count, dtype='S1')
        elif prefix == 'edges':
            edges = np.array([['']] * count, dtype='S1')
        elif prefix == 'weights':
            emptyWeights = [[[findEmptyVal(weightType)]]] * weightDim * count
            weights = np.array(emptyWeights, dtype=weightType).reshape(
                (count, 1, weightDim) if weightDim > 1 else (count, 1))
        else:
            # Remaining prefixes become extra columns of empty strings.
            extras[prefix] = np.array([''] * count, dtype='S1')

    return TrackView(region, starts, ends, vals, strands, ids, edges,
                     weights, borderHandling, allowOverlaps, extras)
def _init(self):
    """Record whether every value of an integer-valued source is in 0..1000.

    ``_allValsAreBedVals`` stays False unless the source's value type name is
    'Number (integer)' and all element values lie between 0 and 1000
    inclusive.
    """
    self._allValsAreBedVals = False

    valTypeName = TrackFormat.createInstanceFromGeSource(self._geSource).getValTypeName()
    if valTypeName != 'Number (integer)':
        return

    self._allValsAreBedVals = all(
        (0 <= element.val <= 1000) for element in self._geSource)
def getTrackFormat(self):
    """Return a TrackFormat built from this object's prefix list and data types."""
    formatArgs = (
        self._prefixList,
        self._valDataType,
        self._valDim,
        self._weightDataType,
        self._weightDim,
    )
    return TrackFormat.createInstanceFromPrefixList(*formatArgs)
def _assertIsCompatibleWith(self, tfReq, reqList):
    """Assert that ``tfReq.isCompatibleWith`` agrees with ``reqList``.

    TrackFormats are built for the combinations of present ([]) and absent
    (None) lists in which at least one of start, end and val is present.
    A format is expected to be compatible when every entry of ``reqList``
    is either None or equal to the format property at the same position
    (see ``propList`` for the order).
    """
    linkCombinations = [(None, None, None), ([], None, None), ([], [], None), ([], [], [])]

    for start in [None, []]:
        for end in [None, []]:
            for val in [None, []]:
                for strand in [None, []]:
                    for ids, edges, weights in linkCombinations:
                        for extra in [None, {'a': [], 'b': []}]:
                            if [] not in [start, end, val]:
                                continue

                            tf = TrackFormat(start, end, val, strand, ids, edges, weights, extra)

                            # Boolean properties first, then the three
                            # name-valued ones; an empty name or list is
                            # represented by False.
                            propList = [tf.isDense(), tf.isValued(), tf.isInterval(),
                                        tf.isLinked(), tf.hasStrand(), tf.hasId(),
                                        tf.isWeighted(), tf.hasExtra()]
                            valTypeName = tf.getValTypeName()
                            propList.append(valTypeName if valTypeName != '' else False)
                            weightTypeName = tf.getWeightTypeName()
                            propList.append(weightTypeName if weightTypeName != '' else False)
                            extraNames = tf.getExtraNames()
                            propList.append(extraNames if extraNames != [] else False)

                            isCompatible = all(r is None or r == p
                                               for r, p in zip(reqList, propList))
                            self.assertEqual(isCompatible, tfReq.isCompatibleWith(tf))
def _init(self):
    """Set ``_allValsAreBedVals`` for the current source.

    The flag becomes True only when the source's value type name is
    'Number (integer)' and no element value falls outside 0..1000
    (inclusive); in all other cases it is False.
    """
    self._allValsAreBedVals = False

    fmt = TrackFormat.createInstanceFromGeSource(self._geSource)
    if fmt.getValTypeName() == 'Number (integer)':
        withinRange = True
        for ge in self._geSource:
            if not (0 <= ge.val <= 1000):
                # Stop at the first value outside the range.
                withinRange = False
                break
        self._allValsAreBedVals = withinRange
class TrackView(object):
    """View of track elements relative to a genome region (the anchor).

    Holds parallel per-element lists (starts, ends, values, strands, ids,
    edges, weights and named extras) and offers iteration, slicing and
    numpy-array accessors. Absent lists are None. All None tests use
    ``is``/``is not``, because the lists may be numpy arrays, for which
    ``!= None`` is an elementwise comparison.
    """

    def _handlePointsAndPartitions(self):
        """Derive the missing start or end list from the track format."""
        if self.trackFormat.isDense() and not self.trackFormat.reprIsDense():
            # Consecutive entries of the end list become (start, end) pairs;
            # the first entry of every other list is dropped to match.
            self._startList = self._endList[:-1]
            self._endList = self._endList[1:]
            if self._valList is not None:
                self._valList = self._valList[1:]
            if self._strandList is not None:
                self._strandList = self._strandList[1:]
            if self._idList is not None:
                self._idList = self._idList[1:]
            if self._edgesList is not None:
                self._edgesList = self._edgesList[1:]
            if self._weightsList is not None:
                self._weightsList = self._weightsList[1:]
            for key, extraList in self._extraLists.items():
                if extraList is not None:
                    self._extraLists[key] = extraList[1:]
        if not self.trackFormat.isDense() and not self.trackFormat.isInterval():
            self._endList = VirtualPointEnd(self._startList)

    def __init__(self, genomeAnchor, startList, endList, valList, strandList, idList, edgesList, \
                 weightsList, borderHandling, allowOverlaps, extraLists=OrderedDict()):
        """Create a view over the given lists, anchored at ``genomeAnchor``.

        At least one of startList, endList, valList and edgesList must be
        non-None and ``borderHandling`` must be 'crop'. A copy of the anchor
        and a shallow copy of ``extraLists`` are stored. All non-None lists
        must have the same length after point/partition handling.
        """
        assert (startList is not None or endList is not None
                or valList is not None or edgesList is not None)
        assert borderHandling in ['crop']

        self.genomeAnchor = genomeAnchor.getCopy()
        self.trackFormat = TrackFormat(startList, endList, valList, strandList, idList, edgesList, weightsList, extraLists)
        self.borderHandling = borderHandling
        self.allowOverlaps = allowOverlaps

        self._trackElement = TrackElement(self)
        #self._bpLevelArray = None

        self._startList = startList
        self._endList = endList
        self._valList = valList
        self._strandList = strandList
        self._idList = idList
        self._edgesList = edgesList
        self._weightsList = weightsList
        self._extraLists = copy(extraLists)

        self._handlePointsAndPartitions()

        # Accessors for absent lists are replaced by noneFunc on the element.
        if self._startList is None:
            self._trackElement.start = noneFunc
        if self._endList is None:
            self._trackElement.end = noneFunc
        if self._valList is None:
            self._trackElement.val = noneFunc
        if self._strandList is None:
            self._trackElement.strand = noneFunc
        if self._idList is None:
            self._trackElement.id = noneFunc
        if self._edgesList is None:
            self._trackElement.edges = noneFunc
        if self._weightsList is None:
            self._trackElement.weights = noneFunc

        self._updateNumListElements()

        allLists = [self._startList, self._endList, self._valList, self._strandList,
                    self._idList, self._edgesList, self._weightsList] \
            + [extraList for extraList in self._extraLists.values()]
        for i, elementList in enumerate(allLists):
            assert elementList is None or len(elementList) == self._numListElements, \
                'List (%s): ' % i + str(elementList) + \
                ' (expected %s elements, found %s)' % (self._numListElements, len(elementList))

    def __iter__(self):
        """Reset the shared track element's index and return self as iterator."""
        self._trackElement._index = -1
        return self

    def _updateNumListElements(self):
        """Recompute the stored list-element and iteration-element counts."""
        self._numListElements = self._computeNumListElements()
        if self.allowOverlaps and self._numListElements > 0:
            self._numIterElements = self._computeNumIterElements()
        else:
            self._numIterElements = self._numListElements

    def _computeNumListElements(self):
        """Return the length of the first non-None core list.

        Raises ShouldNotOccurError if start, end, val and edges lists are
        all None.
        """
        for elementList in [self._startList, self._endList, self._valList, self._edgesList]:
            if elementList is not None:
                return len(elementList)
        raise ShouldNotOccurError

    def _computeNumIterElements(self):
        """Return the number of elements iteration yields (blind passengers excluded)."""
        for elementList in [self._startList, self._endList, self._valList, self._edgesList]:
            if elementList is not None:
                if isinstance(elementList, numpy.ndarray):
                    return len(self._removeBlindPassengersFromNumpyArray(elementList))
                else:
                    return sum(1 for x in self)
        raise ShouldNotOccurError

    def __len__(self):
        """Return the length of the genome anchor (not the element count)."""
        return self._bpSize()

    def getNumElements(self):
        """Return the stored number of elements that iteration yields."""
        return self._numIterElements

    def _bpSize(self):
        """Return ``len(self.genomeAnchor)``."""
        return len(self.genomeAnchor)

    def next(self):
        """Advance to the next element and return the shared track element.

        Raises StopIteration when the index moves past the last list element.
        """
        self._trackElement._index += 1

        #To remove any blind passengers - segments entirely in front of genomeanchor,
        # but sorted after a larger segment crossing the border
        if self.allowOverlaps and not self.trackFormat.reprIsDense():
            while self._trackElement._index < self._numListElements and \
                    self._endList[self._trackElement._index] <= self.genomeAnchor.start:
                self._trackElement._index += 1

        if self._trackElement._index < self._numListElements:
            return self._trackElement
        else:
            raise StopIteration

    def _findLeftIndex(self):
        """Return the index of the first element ending after the anchor start."""
        leftIndex = 0
        #remove track elements entirely to the left of the anchor
        while leftIndex < len(self._endList) and self._endList[leftIndex] <= self.genomeAnchor.start:
            leftIndex += 1
        return leftIndex

    def _findRightIndex(self):
        """Return one past the index of the last element starting before the anchor end."""
        rightIndex = self._numListElements
        while rightIndex > 0 and self._startList[rightIndex-1] >= self.genomeAnchor.end:
            rightIndex -= 1
        return rightIndex

    def sliceElementsAccordingToGenomeAnchor(self):
        """Restrict the element lists to the anchor; only for non-dense representations."""
        assert( not self.trackFormat.reprIsDense() )
        self._doScatteredSlicing()

    def _doScatteredSlicing(self):
        """Slice all lists to the index range found from the anchor."""
        leftIndex = self._findLeftIndex()
        rightIndex = self._findRightIndex()

        # An anchor of length zero gets an empty slice.
        if self._bpSize() == 0:
            rightIndex = leftIndex

        self._startList = self._startList[leftIndex:rightIndex]
        self._endList = self._endList[leftIndex:rightIndex]

        if self._valList is not None:
            self._valList = self._valList[leftIndex:rightIndex]
        if self._strandList is not None:
            self._strandList = self._strandList[leftIndex:rightIndex]
        if self._idList is not None:
            self._idList = self._idList[leftIndex:rightIndex]
        if self._edgesList is not None:
            self._edgesList = self._edgesList[leftIndex:rightIndex]
        if self._weightsList is not None:
            self._weightsList = self._weightsList[leftIndex:rightIndex]
        for key, extraList in self._extraLists.items():
            self._extraLists[key] = extraList[leftIndex:rightIndex]
        self._updateNumListElements()

    def _doDenseSlicing(self, i, j):
        """Slice every non-None list to ``[i:j]`` and update the counts.

        The start and end lists are not sliced here.
        """
        if self._valList is not None:
            self._valList = self._valList[i:j]
        if self._strandList is not None:
            self._strandList = self._strandList[i:j]
        if self._idList is not None:
            self._idList = self._idList[i:j]
        if self._edgesList is not None:
            self._edgesList = self._edgesList[i:j]
        if self._weightsList is not None:
            self._weightsList = self._weightsList[i:j]
        for key, extraList in self._extraLists.items():
            self._extraLists[key] = extraList[i:j]
        self._updateNumListElements()

    def __getslice__(self, i, j):
        """Return a new TrackView whose anchor covers positions ``i`` to ``j``.

        ``i`` and ``j`` are relative to the anchor start; a negative ``j`` is
        added to the anchor end. The element lists are sliced densely or
        scattered, depending on the track format.
        """
        slicedTV = TrackView(self.genomeAnchor, self._startList, self._endList, \
                             self._valList, self._strandList, self._idList, \
                             self._edgesList, self._weightsList, \
                             self.borderHandling, self.allowOverlaps, \
                             extraLists=self._extraLists)
        slicedTV.trackFormat = self.trackFormat

        slicedTV.genomeAnchor.start += i
        if j>=0:
            try:
                slicedTV.genomeAnchor.end = min(self.genomeAnchor.end, self.genomeAnchor.start + j)
            except FloatingPointError: # Caused by trackView[:] with self.genomeAnchor.start > 0
                slicedTV.genomeAnchor.end = self.genomeAnchor.end
        if j<0:
            slicedTV.genomeAnchor.end += j

        if self.trackFormat.reprIsDense():
            slicedTV._doDenseSlicing(i,j)
        else:
            slicedTV._doScatteredSlicing()
        return slicedTV

    def _getBpLevelModificationArray(self, indexes, vals):
        """Return ``vals`` summed per index, zero-padded to anchor length + 1."""
        bpLevelMod = numpy.bincount(indexes, vals)
        origLen = len(bpLevelMod)
        bpLevelMod.resize(self._bpSize()+1)
        bpLevelMod[origLen:] = 0
        return bpLevelMod

    def _commonGetBpLevelArray(self, vals):
        """Return a per-position array built from per-element ``vals``.

        Dense representations return ``vals`` unchanged (and raise
        ShouldNotOccurError if overlaps are allowed). Otherwise each value is
        added at its element's start and subtracted at its end, and the
        cumulative sum gives the per-position total.
        """
        if self.trackFormat.reprIsDense():
            if self.allowOverlaps:
                raise ShouldNotOccurError()
            return vals
        else:
            bpLevelArray = numpy.zeros(self._bpSize()+1)
            numElements = self.getNumElements()
            if numElements > 0:
                bpLevelArray += self._getBpLevelModificationArray(self.startsAsNumpyArray(), vals)
                bpLevelArray -= self._getBpLevelModificationArray(self.endsAsNumpyArray(), vals)
                bpLevelArray = bpLevelArray.cumsum()
            return bpLevelArray[:-1]

    def getBinaryBpLevelArray(self):
        """Return a boolean per-position array built from a value of 1 per element."""
        vals = numpy.ones(self.getNumElements(), dtype='int32')
        # 'bool' is the portable spelling of the former 'bool8' dtype alias.
        return numpy.array(self._commonGetBpLevelArray(vals), dtype='bool')

    def getCoverageBpLevelArray(self):
        """Return an int32 per-position array built from a value of 1 per element."""
        vals = numpy.ones(self.getNumElements(), dtype='int32')
        return numpy.array(self._commonGetBpLevelArray(vals), dtype='int32')

    def getValueBpLevelArray(self, voidValue=0):
        '''
        Creates a bp-level function of any valued track. In case of scattered tracks,
        uncovered areas are filled with voidValue (which would typically be set to 0 or numpy.nan).
        In the case of overlapping regions, the values are added.'''
        assert self.trackFormat.isValued('number'), self.trackFormat
        vals = self.valsAsNumpyArray()
        bpLevelArray = numpy.array(self._commonGetBpLevelArray(vals), dtype=vals.dtype)
        if voidValue != 0:
            bpLevelArray[~self.getBinaryBpLevelArray()] = voidValue
        return bpLevelArray

    def _removeBlindPassengersFromNumpyArray(self, numpyArray):
        '''
        To remove any blind passengers - segments entirely in front of genomeanchor,
        but sorted after a larger segment crossing the border.
        '''
        if self.allowOverlaps and len(numpyArray) > 0:
            numpyArray = numpyArray[numpy.where(self._endList > self.genomeAnchor.start)]
        return numpyArray

    def _commonAsNumpyArray(self, numpyArray, numpyArrayModMethod, name):
        """Return ``numpyArray`` without blind passengers, optionally transformed.

        Returns None when ``numpyArray`` is None. ``name`` is not used.
        """
        assert(self.borderHandling in ['crop'])
        if numpyArray is None:
            return None

        numpyArray = self._removeBlindPassengersFromNumpyArray(numpyArray)

        if numpyArrayModMethod is not None:
            return numpyArrayModMethod(numpyArray)
        else:
            return numpyArray

    def startsAsNumpyArray(self):
        """Return element starts relative to the anchor start, floored at 0."""
        return self._commonAsNumpyArray(self._startList, self._startListModMethod, 'starts')

    def _startListModMethod(self, startList):
        """Shift starts by the anchor start and clip negative results to 0."""
        return numpy.maximum(startList - self.genomeAnchor.start, \
                             numpy.zeros(len(startList), dtype='int32'))

    def endsAsNumpyArray(self):
        """Return element ends relative to the anchor start, capped at the anchor length."""
        return self._commonAsNumpyArray(self._endList, self._endListModMethod, 'ends')

    def _endListModMethod(self, endList):
        """Shift ends by the anchor start and clip them to the anchor length."""
        return numpy.minimum(endList - self.genomeAnchor.start, \
                             numpy.zeros(len(endList), dtype='int32') + len(self.genomeAnchor))

    def valsAsNumpyArray(self):
        """Return the values (blind passengers removed), or None if absent."""
        return self._commonAsNumpyArray(self._valList, None, 'vals')

    def strandsAsNumpyArray(self):
        """Return the strands (blind passengers removed), or None if absent."""
        return self._commonAsNumpyArray(self._strandList, None, 'strands')

    def idsAsNumpyArray(self):
        """Return the ids (blind passengers removed), or None if absent."""
        return self._commonAsNumpyArray(self._idList, None, 'ids')

    def edgesAsNumpyArray(self):
        """Return the edges (blind passengers removed), or None if absent."""
        return self._commonAsNumpyArray(self._edgesList, None, 'edges')

    def weightsAsNumpyArray(self):
        """Return the weights (blind passengers removed), or None if absent."""
        return self._commonAsNumpyArray(self._weightsList, None, 'weights')

    def extrasAsNumpyArray(self, key):
        """Return the extra list stored under ``key``; the key must exist."""
        assert self.hasExtra(key)
        return self._commonAsNumpyArray(self._extraLists[key], None, 'extras')

    def allExtrasAsDictOfNumpyArrays(self):
        """Return an OrderedDict mapping each extra name to its array."""
        return OrderedDict([(key,self.extrasAsNumpyArray(key)) for key in self._extraLists])

    def hasExtra(self, key):
        """Return True if an extra list is stored under ``key``."""
        return key in self._extraLists
testFn = self._writeTestFile(case) print open(testFn).read() print sortedContents = sortGtrackFileAndReturnContents( testFn, case.genome) print sortedContents sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass forPreProcessor = True if case.sourceClass is None else False sortedGeSource = GEDependentAttributesHolder(sourceClass('sortedFile.gtrack', case.genome, \ forPreProcessor=forPreProcessor, \ printWarnings=False, \ strToUseInsteadOfFn=sortedContents)) reprIsDense = TrackFormat.createInstanceFromGeSource( sortedGeSource).reprIsDense() if not reprIsDense: self.assertEquals(sorted(case.assertElementList), [ge for ge in sortedGeSource]) else: for ge in sortedGeSource: pass self.assertEquals( sorted(case.boundingRegionsAssertList), [br for br in sortedGeSource.getBoundingRegionTuples()])

def runTest(self):
    # Intentionally empty: this method performs no checks of its own.
    # NOTE(review): presumably kept so the test case can be instantiated under
    # unittest's default method name ('runTest') - confirm against the callers.
    pass
def _assertIsCompatibleWith(self, tfReq, reqList):
    """Check ``tfReq.isCompatibleWith`` against every constructible TrackFormat.

    Enumerates all combinations of present (empty container) and absent
    (``None``) start, end, val, strand, id/edges/weights and extra arguments,
    skipping those where start, end and val are all absent. For each resulting
    TrackFormat the expected answer is derived from ``reqList`` and compared
    with what ``tfReq.isCompatibleWith`` reports.

    reqList holds one required value per property, in this order: isDense,
    isValued, isInterval, isLinked, hasStrand, hasId, isWeighted, hasExtra,
    val type name, weight type name, extra names. ``None`` means "no
    requirement"; for the last three an empty name / empty list is
    represented as ``False``.
    """
    from itertools import product

    # id, edges and weights are only combined in these nested configurations
    linkCombos = [(None, None, None), ([], None, None), ([], [], None), ([], [], [])]
    extraOptions = [None, {"a": [], "b": []}]

    # product() iterates with the rightmost argument varying fastest, i.e. in
    # the same order as the equivalent nested for-loops.
    # NOTE(review): placeholder containers are reused across combinations;
    # this assumes TrackFormat does not mutate its arguments - confirm.
    for start, end, val, strand, (idList, edges, weights), extra in product(
            [None, []], [None, []], [None, []], [None, []], linkCombos, extraOptions):
        if start is None and end is None and val is None:
            continue

        tf = TrackFormat(start, end, val, strand, idList, edges, weights, extra)
        valTypeName = tf.getValTypeName()
        weightTypeName = tf.getWeightTypeName()
        extraNames = tf.getExtraNames()
        propList = [
            tf.isDense(),
            tf.isValued(),
            tf.isInterval(),
            tf.isLinked(),
            tf.hasStrand(),
            tf.hasId(),
            tf.isWeighted(),
            tf.hasExtra(),
            valTypeName if valTypeName != "" else False,
            weightTypeName if weightTypeName != "" else False,
            extraNames if extraNames != [] else False,
        ]

        # A requirement of None matches anything; otherwise it must equal the property
        isCompatible = all(req is None or req == prop
                           for req, prop in zip(reqList, propList))
        self.assertEqual(isCompatible, tfReq.isCompatibleWith(tf))