def __init__(self, geSource):
    """Set up the bookkeeping state for one genome element source.

    The source is stored after being passed through ``_decorateGESource``.
    The category sets and the per-key counters start out empty and
    ``_hasCalculatedStats`` starts as False.

    geSource -- the genome element source to wrap. Its track format decides
                whether values and edge weights are treated as categorical.
    """
    self._geSource = self._decorateGESource(geSource)
    # NOTE(review): a sibling variant of this constructor spells this
    # attribute `_boundingRegionsAndGEsCorrespond` (no trailing 's') --
    # confirm which name the readers of this attribute expect.
    self._boundingRegionsAndGEsCorresponds = None

    # Build the track format once (from the undecorated source, as before)
    # instead of constructing it separately for each of the two lookups.
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
    self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
    self._areEdgeWeightsCategorical = \
        trackFormat.getWeightTypeName() == 'Category'

    self._valCategories = set()
    self._edgeWeightCategories = set()

    # Default-valued mappings: missing keys start at int() == 0, or at
    # whatever _initMaxStrLens builds from the max-string-length keys.
    self._numElements = OrderedDefaultDict(int)
    self._maxStrLens = OrderedDefaultDict(
        partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def __init__(self, geSource):
    """Initialise the statistics-tracking state around ``geSource``.

    Stores the decorated source (see ``_decorateGESource``), records
    whether the track's values and edge weights are of type 'Category',
    and creates empty category sets and default-valued counters.

    geSource -- the genome element source whose elements will be tracked.
    """
    self._geSource = self._decorateGESource(geSource)
    self._boundingRegionsAndGEsCorrespond = None

    # One TrackFormat instance serves both type-name lookups; it is created
    # from the original (undecorated) source, exactly as before.
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
    self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
    self._areEdgeWeightsCategorical = \
        trackFormat.getWeightTypeName() == 'Category'

    self._valCategories = set()
    self._edgeWeightCategories = set()

    # Counters default to 0; max string lengths default to the structure
    # produced by _initMaxStrLens for the configured keys.
    self._numElements = OrderedDefaultDict(int)
    self._maxStrLens = OrderedDefaultDict(
        partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def _composeContents(self, out, hbColumns, columns, geSource,
                     onlyNonDefault=True, singleDataLine=False):
    """Write header lines, column spec line and data lines to ``out``.

    out            -- object with a ``write`` method receiving the text.
    hbColumns      -- passed through to ``_composeDataLine``.
    columns        -- passed to ``_composeColSpecLine``.
    geSource       -- source iterated for bounding regions and elements.
    onlyNonDefault -- passed to ``_composeHeaderLines``.
    singleDataLine -- when true, stop after the first data line of the
                      first bounding-region tuple.
    """
    # The track format comes from the composer's own source, not from the
    # `geSource` argument that is iterated below.
    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)

    out.write(self._composeHeaderLines(onlyNonDefault))
    out.write(self._composeColSpecLine(columns))

    brTuples = iterateOverBRTuplesWithContainedGEs(
        geSource, onlyYieldTwoGEs=singleDataLine)

    for boundingRegion, geList in brTuples:
        if boundingRegion is not None:
            out.write(self._composeBoundingRegionLine(boundingRegion))

        elements = self._removeStartElementIfApplicable(trackFormat, geList)
        for lineNumber, ge in enumerate(elements, 1):
            # NOTE(review): the "last element" flag compares against the
            # length of the unfiltered geList -- confirm this is intended
            # when a start element has been removed.
            isLast = lineNumber == len(geList)
            out.write(
                self._composeDataLine(ge, hbColumns, lineNumber, isLast))
            if singleDataLine:
                break

        if singleDataLine:
            # Nothing follows the outer loop, so returning here is the
            # same as breaking out of it.
            return
def _getGESourceManagerFromGESource(self, geSource):
    """Return a RegionBasedGESourceManager for ``geSource``.

    For a track whose representation is dense, only the value types
    'Number', 'Number (integer)' and 'Case-control' are accepted, and the
    manager is created with the extra statistics pass and bounding-region
    element counting switched off. Any other track gets both switched on.

    Raises NotSupportedError for a dense representation with any other
    value type.
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
    isDense = trackFormat.reprIsDense()

    supportedDenseValTypes = ('Number', 'Number (integer)', 'Case-control')
    if isDense and trackFormat.getValTypeName() not in supportedDenseValTypes:
        raise NotSupportedError

    # Both flags are off for dense representations and on otherwise.
    needsElementStats = not isDense
    return RegionBasedGESourceManager(
        geSource, self._regionList,
        calcStatsInExtraPass=needsElementStats,
        countElsInBoundingRegions=needsElementStats)
def _getGESourceManagerFromGESource(self, geSource):
    """Build the region-based source manager matching the track format.

    Non-dense representations: manager with ``calcStatsInExtraPass`` and
    ``countElsInBoundingRegions`` both True.
    Dense representations: both False, and only for value types 'Number',
    'Number (integer)' or 'Case-control'; otherwise NotSupportedError is
    raised.
    """
    trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

    if not trackFormat.reprIsDense():
        return RegionBasedGESourceManager(
            geSource, self._regionList,
            calcStatsInExtraPass=True,
            countElsInBoundingRegions=True)

    valTypeName = trackFormat.getValTypeName()
    if valTypeName not in ('Number', 'Number (integer)', 'Case-control'):
        raise NotSupportedError

    return RegionBasedGESourceManager(
        geSource, self._regionList,
        calcStatsInExtraPass=False,
        countElsInBoundingRegions=False)
def _composeContents(self, out, hbColumns, columns, geSource,
                     onlyNonDefault=True, singleDataLine=False):
    """Compose the full file contents and write them to ``out``.

    Output order: header lines, the column specification line, then for
    each (bounding region, element list) tuple an optional bounding region
    line followed by one data line per element.

    With ``singleDataLine`` set, writing stops after the first data line
    and no further bounding-region tuples are processed.
    """
    # Note: the format is derived from self._geSource, while the elements
    # come from the `geSource` parameter.
    fmt = TrackFormat.createInstanceFromGeSource(self._geSource)

    headerText = self._composeHeaderLines(onlyNonDefault)
    out.write(headerText)
    colSpecText = self._composeColSpecLine(columns)
    out.write(colSpecText)

    for region, regionElements in iterateOverBRTuplesWithContainedGEs(
            geSource, onlyYieldTwoGEs=singleDataLine):
        if region is not None:
            out.write(self._composeBoundingRegionLine(region))

        # 1-based counter; the last-line flag is evaluated against the
        # length of the list as yielded, before start-element removal.
        # NOTE(review): confirm that comparing against the unfiltered
        # length is intended.
        for count, element in enumerate(
                self._removeStartElementIfApplicable(fmt, regionElements), 1):
            dataLine = self._composeDataLine(
                element, hbColumns, count, count == len(regionElements))
            out.write(dataLine)
            if singleDataLine:
                break

        if singleDataLine:
            break
def testSorting(self): geSourceTest = self._commonSetup() for caseName in geSourceTest.cases: if not caseName.startswith("gtrack"): continue if "no_sort" in caseName: print "Test case skipped: " + caseName continue print caseName print case = geSourceTest.cases[caseName] testFn = self._writeTestFile(case) print open(testFn).read() print sortedContents = sortGtrackFileAndReturnContents(testFn, case.genome) print sortedContents sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass forPreProcessor = True if case.sourceClass is None else False sortedGeSource = GEDependentAttributesHolder( sourceClass( "sortedFile.gtrack", case.genome, forPreProcessor=forPreProcessor, printWarnings=False, strToUseInsteadOfFn=sortedContents, ) ) reprIsDense = TrackFormat.createInstanceFromGeSource(sortedGeSource).reprIsDense() if not reprIsDense: self.assertEquals(sorted(case.assertElementList), [ge for ge in sortedGeSource]) else: for ge in sortedGeSource: pass self.assertEquals( sorted(case.boundingRegionsAssertList), [br for br in sortedGeSource.getBoundingRegionTuples()] )
def _compose(self, out):
    """Write the source as a 'wiggle_0' track to ``out``.

    Emits the track line (with a ``name=`` part when the source has a
    track name), then for each non-empty bounding region a fixedStep or
    variableStep declaration line followed by one data line per element.

    out -- file-like object that the lines are printed to.
    """
    trackName = self._geSource.getTrackName()
    if trackName is not None:
        # Reuse the name fetched above rather than asking the source again.
        name = ':'.join(trackName).replace(' ', '_')
    else:
        name = None

    print >> out, 'track type=wiggle_0' + \
        (' name=%s' % name if name is not None else '')

    tf = TrackFormat.createInstanceFromGeSource(self._geSource)
    span = self._geSource.getFixedLength()
    step = self._geSource.getFixedGapSize() + span

    # Fixed step is used for dense representations and whenever the source
    # reports a step/span combination other than step == span == 1.
    isFixedStep = (tf.reprIsDense() or step > 1 or
                   (step == 1 and span != 1))

    for brt, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
        if len(geList) == 0:
            continue

        if isFixedStep:
            self._composeFixedStepDeclarationLine(
                out, brt.region, step, span)
        else:
            curChr, curSpan = self._composeVariableStepDeclarationLine(
                out, geList[0])

        for i, ge in enumerate(geList):
            # Skip the first element of a region when the source reports
            # that it adds a start element to dense interval tracks.
            if i == 0 and tf.isDense() and tf.isInterval() and \
                    self._geSource.addsStartElementToDenseIntervals():
                continue

            val = self._commonFormatNumberVal(ge.val)

            if isFixedStep:
                cols = [val]
            else:
                # A change of chromosome or span needs a new declaration
                # line before the data line.
                if ge.chr != curChr or self._getVariableSpan(ge) != curSpan:
                    curChr, curSpan = \
                        self._composeVariableStepDeclarationLine(out, ge)
                # Start is written as ge.start + 1 (presumably converting
                # to 1-based WIG positions -- TODO confirm).
                cols = [str(ge.start + 1), val]

            print >> out, '\t'.join([str(x) for x in cols])
def _allGESourceManagers(self, trackName, allowOverlaps):
    """Yield the GE source managers to preprocess for ``trackName``.

    If ``allowOverlaps`` equals False and the collector reports that the
    overlap rule ``True`` has been finalized, a single manager built from
    the track itself is yielded. Otherwise every GE source of the track is
    visited and a manager is yielded for each one that
    ``PreProcessUtils.shouldPreProcessGESource`` accepts. When
    ``allowOverlaps`` equals True, the first source that is dense or has no
    overlapping elements ends the generator.

    ``self._status`` is updated before each candidate is handled.
    """
    metaData = PreProcMetaDataCollector(self._genome, trackName)

    # The explicit '== False' / '== True' comparisons are kept on purpose:
    # a value such as None matches neither.
    if allowOverlaps == False and metaData.overlapRuleHasBeenFinalized(True):
        self._status = ''.join([
            'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName),
            ' (allowOverlaps: %s)' % allowOverlaps,
        ])
        yield self._getGESourceManagerFromTrack(trackName)
        return

    for geSource in self._allGESources(trackName):
        if allowOverlaps == True:
            trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
            if trackFormat.isDense() or geSource.hasNoOverlappingElements():
                # Stops the whole generator, not just this source.
                return

        statusParts = [
            'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName)]
        if geSource.hasOrigFile():
            statusParts.append(' (filename: "%s")' % geSource.getFileName())
        statusParts.append(' (allowOverlaps: %s)' % allowOverlaps)
        self._status = ''.join(statusParts)

        if PreProcessUtils.shouldPreProcessGESource(
                trackName, geSource, allowOverlaps):
            yield self._getGESourceManagerFromGESource(geSource)
def _allGESourceManagers(self, trackName, allowOverlaps):
    """Generate the source managers that need preprocessing for a track.

    Two paths:
    * ``allowOverlaps == False`` and the overlap rule ``True`` is already
      finalized according to PreProcMetaDataCollector: yield one manager
      obtained from the track via ``_getGESourceManagerFromTrack``.
    * otherwise: walk the track's GE sources, yielding a manager for each
      source that ``PreProcessUtils.shouldPreProcessGESource`` approves.
      With ``allowOverlaps == True`` the generator finishes as soon as a
      source is dense or reports no overlapping elements.

    ``self._status`` is set to a progress message before each yield
    decision.
    """
    collector = PreProcMetaDataCollector(self._genome, trackName)

    # Equality (not truthiness) tests are intentional; see the docstring.
    useExistingTrack = (allowOverlaps == False and
                        collector.overlapRuleHasBeenFinalized(True))

    if useExistingTrack:
        trackPart = \
            'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName)
        overlapPart = ' (allowOverlaps: %s)' % allowOverlaps
        self._status = trackPart + overlapPart
        yield self._getGESourceManagerFromTrack(trackName)
    else:
        for geSource in self._allGESources(trackName):
            if allowOverlaps == True:
                fmt = TrackFormat.createInstanceFromGeSource(geSource)
                if fmt.isDense() or geSource.hasNoOverlappingElements():
                    # Ends the generator for all remaining sources too.
                    return

            trackPart = \
                'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName)
            if geSource.hasOrigFile():
                filePart = ' (filename: "%s")' % geSource.getFileName()
            else:
                filePart = ''
            overlapPart = ' (allowOverlaps: %s)' % allowOverlaps
            self._status = trackPart + filePart + overlapPart

            shouldPreProcess = PreProcessUtils.shouldPreProcessGESource(
                trackName, geSource, allowOverlaps)
            if shouldPreProcess:
                yield self._getGESourceManagerFromGESource(geSource)
def _compose(self, out):
    """Print the source to ``out`` as a 'wiggle_0' track.

    The first line is the track line, including ``name=`` when the source
    provides a track name (parts joined by ':' and spaces replaced by
    '_'). Each non-empty bounding region then gets a declaration line
    (fixedStep or variableStep) and its data lines.

    out -- file-like target of the ``print >>`` statements.
    """
    trackName = self._geSource.getTrackName()
    # The track name is fetched once and reused; the earlier code asked the
    # source for it a second time.
    name = None if trackName is None \
        else ':'.join(trackName).replace(' ', '_')

    print >>out, 'track type=wiggle_0' + \
        (' name=%s' % name if name is not None else '')

    tf = TrackFormat.createInstanceFromGeSource(self._geSource)
    span = self._geSource.getFixedLength()
    step = self._geSource.getFixedGapSize() + span

    isFixedStep = (tf.reprIsDense() or step > 1 or
                   (step == 1 and span != 1))

    for brt, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
        if len(geList) == 0:
            continue

        if isFixedStep:
            self._composeFixedStepDeclarationLine(
                out, brt.region, step, span)
        else:
            # Remember the chromosome/span of the current declaration so a
            # change can trigger a new declaration line below.
            curChr, curSpan = self._composeVariableStepDeclarationLine(
                out, geList[0])

        for i, ge in enumerate(geList):
            # The leading element is dropped when the source says it adds
            # a start element to dense interval tracks.
            if i == 0 and tf.isDense() and tf.isInterval() and \
                    self._geSource.addsStartElementToDenseIntervals():
                continue

            val = self._commonFormatNumberVal(ge.val)

            if isFixedStep:
                cols = [val]
            else:
                if ge.chr != curChr or self._getVariableSpan(ge) != curSpan:
                    curChr, curSpan = \
                        self._composeVariableStepDeclarationLine(out, ge)
                # ge.start + 1: presumably 0-based start converted to the
                # 1-based WIG position -- TODO confirm.
                cols = [str(ge.start + 1), val]

            print >>out, '\t'.join([str(x) for x in cols])
testFn = self._writeTestFile(case) print open(testFn).read() print sortedContents = sortGtrackFileAndReturnContents( testFn, case.genome) print sortedContents sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass forPreProcessor = True if case.sourceClass is None else False sortedGeSource = GEDependentAttributesHolder(sourceClass('sortedFile.gtrack', case.genome, \ forPreProcessor=forPreProcessor, \ printWarnings=False, \ strToUseInsteadOfFn=sortedContents)) reprIsDense = TrackFormat.createInstanceFromGeSource( sortedGeSource).reprIsDense() if not reprIsDense: self.assertEquals(sorted(case.assertElementList), [ge for ge in sortedGeSource]) else: for ge in sortedGeSource: pass self.assertEquals( sorted(case.boundingRegionsAssertList), [br for br in sortedGeSource.getBoundingRegionTuples()]) def runTest(self): pass
def _init(self):
    """Determine whether every value lies in the range 0..1000.

    ``_allValsAreBedVals`` ends up True only when the track's value type
    is 'Number (integer)' and every element's ``val`` satisfies
    0 <= val <= 1000 (presumably the BED score range -- TODO confirm).
    In all other cases it is False.
    """
    self._allValsAreBedVals = False

    trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
    if trackFormat.getValTypeName() != 'Number (integer)':
        return

    # Stop at the first out-of-range value; the flag then stays False.
    for element in self._geSource:
        if not (0 <= element.val <= 1000):
            return

    self._allValsAreBedVals = True
def _init(self):
    """Set ``_allValsAreBedVals`` from the source's values.

    The flag defaults to False. For tracks whose value type is
    'Number (integer)' the source is scanned, and the flag becomes True
    exactly when each element's ``val`` is within 0..1000 inclusive
    (presumably the valid BED score range -- TODO confirm).
    """
    self._allValsAreBedVals = False

    valTypeName = TrackFormat.createInstanceFromGeSource(
        self._geSource).getValTypeName()

    if valTypeName == 'Number (integer)':
        # Lazy generator: all() stops consuming at the first failure.
        withinRange = (0 <= element.val <= 1000
                       for element in self._geSource)
        self._allValsAreBedVals = all(withinRange)