def _assertSplitUserBin(self, compBins, start, end):
    """Check CompBinManager.splitUserBin against an expected list of pieces.

    compBins -- iterable of (start, end) pairs describing the expected regions.
    start, end -- borders of the hg18 chr1 region that is handed to
                  CompBinManager.splitUserBin.
    """
    userBin = GenomeRegion('hg18', 'chr1', start, end)

    expectedBins = []
    for binStart, binEnd in compBins:
        expectedBins.append(GenomeRegion('hg18', 'chr1', binStart, binEnd))

    actualBins = CompBinManager.splitUserBin(userBin)
    AssertList(expectedBins, actualBins, self.assertEqual)
def testBoundingRegionsChrInUnsortedOrder(self):
    """Store bounding regions listed in a different chromosome order than the chr list.

    The tuples put chrM before chr21, while the chromosome list passed to
    storeBoundingRegions is ['chr21', 'chrM']. Nothing is asserted: the test
    passes as long as the sparse store call does not raise.
    """
    self._setUpShelve()

    # (chr, start, end, element count)
    regionSpecs = (
        ('chrM', 1000, 2000, 5),
        ('chr21', 0, 1000000, 10),
        ('chr21', 2000000, 2500000, 20),
    )
    brTuples = [
        BoundingRegionTuple(GenomeRegion('TestGenome', chrName, start, end), elCount)
        for chrName, start, end, elCount in regionSpecs
    ]

    self._brShelve.storeBoundingRegions(brTuples, ['chr21', 'chrM'], sparse=True)
def testBoundingRegionsUnsortedInChr(self):
    """Storing chr21 regions in descending start order must raise InvalidFormatError."""
    self._setUpShelve()

    # (start, end, element count) -- the later region is deliberately listed first.
    chr21Specs = (
        (2000000, 2500000, 20),
        (0, 1000000, 10),
    )
    brTuples = [
        BoundingRegionTuple(GenomeRegion('TestGenome', 'chr21', start, end), elCount)
        for start, end, elCount in chr21Specs
    ]

    self.assertRaises(InvalidFormatError,
                      self._brShelve.storeBoundingRegions,
                      brTuples, ['chr21'], sparse=True)
def _assertIntersect(self, assertRegs, chr, regs1, regs2):
    """Check BoundingRegionUserBinSource.getAllIntersectingRegions on plain border pairs.

    assertRegs -- expected intersections as (start, end) pairs.
    chr        -- chromosome name used for every region.
    regs1, regs2 -- the two input region lists as (start, end) pairs.
    """
    def toRegions(borderPairs):
        # All regions in this helper live on the same TestGenome chromosome.
        return [GenomeRegion('TestGenome', chr, start, end)
                for start, end in borderPairs]

    genomeRegs1 = toRegions(regs1)
    genomeRegs2 = toRegions(regs2)
    expectedRegs = toRegions(assertRegs)

    actualRegs = BoundingRegionUserBinSource.getAllIntersectingRegions(
        'TestGenome', chr, genomeRegs1, genomeRegs2)

    self.assertListsOrDicts(expectedRegs, actualRegs)
def testIsCompBin(self):
    """Check CompBinManager.isCompBin on chr21 regions that are / are not comp bins.

    NOTE(review): the expected values presumably rely on a test configuration
    with a comp bin size of 100 and a TestGenome chr21 length of 46944323 --
    confirm against the test setup.
    """
    compBinBorders = (
        (0, 100),
        (200, 300),
        (46944300, 46944323),
    )
    nonCompBinBorders = (
        (0, 40),
        (10, 100),
        (10, 200),
        (100, 300),
        (46944300, 46944322),
        (46944300, 46944324),
    )

    for start, end in compBinBorders:
        region = GenomeRegion('TestGenome', 'chr21', start, end)
        self.assertTrue(CompBinManager.isCompBin(region))

    for start, end in nonCompBinBorders:
        region = GenomeRegion('TestGenome', 'chr21', start, end)
        self.assertFalse(CompBinManager.isCompBin(region))
def testBoundingRegionIncorrectCountDense(self):
    """A dense store with a mismatching element count must raise InvalidFormatError.

    The chrM region spans 1000 positions but declares 500 elements; presumably
    that mismatch is what the dense store rejects -- confirm against
    storeBoundingRegions.
    """
    self._setUpShelve()

    # (chr, start, end, element count)
    regionSpecs = (
        ('chr21', 0, 1000000, 1000000),
        ('chr21', 2000000, 2500000, 500000),
        ('chrM', 1000, 2000, 500),
    )
    brTuples = [
        BoundingRegionTuple(GenomeRegion('TestGenome', chrName, start, end), elCount)
        for chrName, start, end, elCount in regionSpecs
    ]

    self.assertRaises(InvalidFormatError,
                      self._brShelve.storeBoundingRegions,
                      brTuples, ['chr21', 'chrM'], sparse=False)
def testGetNumOfBins(self):
    """Check CompBinManager.getNumOfBins for a handful of hg18 chr1 regions."""
    # (expected number of bins, region start, region end)
    cases = (
        (0, 0, 0),
        (1, 0, 100),
        (2, 200, 400),
        (4, 67, 314),
    )
    for expectedCount, start, end in cases:
        region = GenomeRegion('hg18', 'chr1', start, end)
        self.assertEqual(expectedCount, CompBinManager.getNumOfBins(region))
def splitUserBin(region):
    """Split a region into compBins, based on borders as defined by getCompBinSize.

    The first piece starts at region.start and the last one ends at region.end;
    every border in between is a multiple of CompBinManager.getCompBinSize().

    region -- object exposing genome, chr, start and end.
    Returns a list of GenomeRegion objects; the list is empty when the
    rounded-down start is not smaller than region.end.
    """
    binSize = CompBinManager.getCompBinSize()

    # Round the start *down* to a whole compBin border. Floor division keeps
    # the border an integer even where "/" means true division.
    start = (int(region.start) // binSize) * binSize

    compBins = []
    while start < region.end:
        part = GenomeRegion(region.genome, region.chr)
        # Clip the bin to the borders of the requested region.
        part.start = max(start, region.start)
        part.end = min(start + binSize, region.end)
        compBins.append(part)
        start += binSize
    return compBins
def nextBin(self):
    """Generate GenomeRegion bins for every region in self._userBinSource.

    A missing start is treated as 0. The end is the smaller of the region's own
    end and the chromosome length (the latter is only looked up when the region
    has a genome). With self._binLen set to None each region is yielded as a
    single bin; otherwise it is cut into consecutive bins of at most
    self._binLen positions.
    """
    for userBin in self._userBinSource:
        binStart = 0 if userBin.start is None else userBin.start

        if userBin.genome is None:
            chrLen = None
        else:
            chrLen = GenomeInfo.getChrLen(userBin.genome, userBin.chr)

        knownEnds = [border for border in (userBin.end, chrLen) if border is not None]
        regEnd = min(knownEnds)

        if self._binLen is None:
            yield GenomeRegion(userBin.genome, userBin.chr, binStart, regEnd)
            continue

        while binStart < regEnd:
            binEnd = min(binStart + self._binLen, regEnd)
            yield GenomeRegion(userBin.genome, userBin.chr, binStart, binEnd)
            binStart += self._binLen
def _doRandTest(self, origTV, randTrackClasses):
    """Run every randomization class 100 times and compare its output with origTV.

    For each randomized track view the test checks that
      * the sorted element lengths equal those of the original,
      * the inter-segment lengths (or their sum) are preserved,
      * every element lies inside the bin with a positive length,
      * the sorted contents equal those of the original.

    origTV -- the original track view; its genomeAnchor defines the bin.
    randTrackClasses -- iterable of callables invoked as
        randClass(origTrack, anchorReg, i, trackNameIntensity='dummy_intensity').
    """
    anchorStart = origTV.genomeAnchor.start
    anchorEnd = origTV.genomeAnchor.end
    anchor = [anchorStart, anchorEnd]

    # fixme (kept from the original): the intensity track view is not yet used.
    intensityTV = SampleTV_Num(vals=range(anchorEnd - anchorStart), anchor=anchor)
    MyPlainTrack._origTV = intensityTV
    # Replaces PlainTrack inside the SegsSampledByIntensityTrack module with the test double.
    gtrackcore.track.random.SegsSampledByIntensityTrack.PlainTrack = MyPlainTrack

    origTrack = SampleTrack(origTV)
    anchorReg = GenomeRegion('TestGenome', 'chr21', anchorStart, anchorEnd)
    binLen = len(anchorReg)

    for randClass in randTrackClasses:
        for randIndex in range(100):
            randTrack = randClass(origTrack, anchorReg, randIndex,
                                  trackNameIntensity='dummy_intensity')
            randTV = randTrack.getTrackView(anchorReg)

            origSegLens = sorted(len(el) for el in origTV)
            randSegLens = sorted(len(el) for el in randTV)
            self.assertListsOrDicts(origSegLens, randSegLens)

            origInterSegLens = self._getInterSegLens(origTV, binLen)
            randInterSegLens = self._getInterSegLens(randTV, binLen)
            # NOTE(review): randClass is the class itself, not an instance, so
            # this isinstance() test is presumably always False and only the
            # sum comparison below ever runs. issubclass(randClass, ...) or
            # isinstance(randTrack, ...) may have been intended -- confirm
            # before changing, since the stricter check could fail.
            if isinstance(randClass, PermutedSegsAndIntersegsTrack):
                self.assertEqual(origInterSegLens, randInterSegLens)
            else:
                self.assertEqual(sum(origInterSegLens), sum(randInterSegLens))

            for el in randTV:
                assert (0 <= el.start() < el.end() <= binLen)

            self.assertListsOrDicts(self._createSortedContents(origTV),
                                    self._createSortedContents(randTV))
def _createTrackView(self, starts, ends, vals, strands, ids, edges, weights, extras, sourceRegion, allowOverlaps, sliceFull=False):
    """Build a TrackView from plain sequences.

    Every per-element argument may be None, in which case None is passed on to
    TrackView; otherwise it is converted with array() (vals as float64).

    extras       -- mapping of extra-column name to sequence, or None for no extras.
    sourceRegion -- (start, end) pair used as the genome anchor on self.genome / self.chr.
    sliceFull    -- when true, tv.sliceElementsAccordingToGenomeAnchor() is called
                    before the view is returned.
    """
    def _arrayOrNone(seq, **arrayKwArgs):
        if seq is None:
            return None
        return array(seq, **arrayKwArgs)

    genomeAnchor = GenomeRegion(genome=self.genome, chr=self.chr,
                                start=sourceRegion[0], end=sourceRegion[1])

    startArray = _arrayOrNone(starts)
    endArray = _arrayOrNone(ends)
    valArray = _arrayOrNone(vals, dtype='float64')
    strandArray = _arrayOrNone(strands)
    idArray = _arrayOrNone(ids)
    edgeArray = _arrayOrNone(edges)
    weightArray = _arrayOrNone(weights)

    extraLists = OrderedDict()
    if extras is not None:
        for key, extra in extras.iteritems():
            extraLists[key] = array(extra)

    tv = TrackView(genomeAnchor, startArray, endArray, valArray, strandArray,
                   idArray, edgeArray, weightArray, 'crop', allowOverlaps,
                   extraLists=extraLists)

    if sliceFull:
        tv.sliceElementsAccordingToGenomeAnchor()
    return tv
def _appendBoundingRegionTuple(self):
    """Append a BoundingRegionTuple for the current bounding region.

    The region runs from self._start to self._getEnd(self._getFixedStepCurElStart()).
    The element count is self._curElCountInBoundingRegion, plus one when
    self._isStepFunction is true.
    """
    regionKwArgs = dict(genome=self._genome, chr=self._chr, start=self._start)
    regionKwArgs['end'] = self._getEnd(self._getFixedStepCurElStart())
    boundingRegion = GenomeRegion(**regionKwArgs)

    if self._isStepFunction:
        extraEl = 1
    else:
        extraEl = 0
    elCount = self._curElCountInBoundingRegion + extraEl

    brTuple = BoundingRegionTuple(boundingRegion, elCount)
    self._boundingRegionTuples.append(brTuple)
def testPreProcessFasta(self):
    """Run the shared pre-processing check for the FASTA genome element source.

    Only the no-overlaps variant is specified (the with-overlaps arguments are
    None); chr21 is read through a custom bin covering positions 0-9804.
    """
    chr21Bin = GenomeRegion(self.GENOME, 'chr21', 0, 9804)
    expectedElCounts = {'chr21': 9804, 'chrM': 0}

    self._preProcess(['FastaGenomeElementSource'],
                     noOverlapsFileCount=2,
                     withOverlapsFileCount=None,
                     noOverlapsChrElCount=expectedElCounts,
                     withOverlapsChrElCount=None,
                     customBins={'chr21': chr21Bin})
def _appendBoundingRegionTuple(self):
    """Append a bounding region covering 0..self._elCount on the current chromosome.

    Nothing is appended while self._chr is None.
    """
    if self._chr is None:
        return

    brRegion = GenomeRegion(self._genome, self._chr, 0, self._elCount)
    brTuple = BoundingRegionTuple(brRegion, self._elCount)
    self._boundingRegionTuples.append(brTuple)
def _getIter(elList, valDataType, valDim, edgeWeightDataType, edgeWeightDim, brList=[]):
    """Build a MyGeIter populated with genome elements and bounding regions.

    elList -- sequence of element specs: (genome, chr, start, end) optionally
              followed by a fifth item, a mapping of attribute name to value
              that is set on the GenomeElement.
    brList -- sequence of (genome, chr, start, end, elCount) specs; each becomes
              a BoundingRegionTuple. The default list is only read, never modified.
    """
    geIter = MyGeIter(valDataType, valDim, edgeWeightDataType, edgeWeightDim)

    for elSpec in elList:
        ge = GenomeElement(genome=elSpec[0], chr=elSpec[1],
                           start=elSpec[2], end=elSpec[3])
        if len(elSpec) == 5:
            extraAttrs = elSpec[4]
            for prefix in extraAttrs:
                setattr(ge, prefix, extraAttrs[prefix])
        geIter.iter.append(ge)

    for brSpec in brList:
        br = GenomeRegion(genome=brSpec[0], chr=brSpec[1],
                          start=brSpec[2], end=brSpec[3])
        geIter.boundingRegionTuples.append(BoundingRegionTuple(br, brSpec[4]))

    return geIter
def _testGetBoundingInfoOutsideCommon(self, sparse):
    """Check getBoundingRegionInfo for regions not covered by the stored bounding regions.

    Three queries on chr21 / chrM must raise OutsideBoundingRegionError. The
    chr2 query is expected to return an info object that just mirrors the
    queried borders (presumably because nothing is stored for chr2 -- see
    _commonStoreBoundingRegions).

    sparse -- forwarded to self._commonStoreBoundingRegions.
    """
    self._setUpShelve()
    self._commonStoreBoundingRegions(sparse=sparse)

    # (chr, start, end) of queries that must be rejected.
    outsideQueries = (
        ('chr21', 50000, 1052000),
        ('chr21', 1000000, 1052000),
        ('chrM', 1500, 3000),
    )
    for chrName, start, end in outsideQueries:
        self.assertRaises(OutsideBoundingRegionError,
                          self._brShelve.getBoundingRegionInfo,
                          GenomeRegion('TestGenome', chrName, start, end))

    expectedInfo = BoundingRegionInfo(100000, 110000, 0, 0, 0, 0)
    actualInfo = self._brShelve.getBoundingRegionInfo(
        GenomeRegion('TestGenome', 'chr2', 100000, 110000))
    self.assertEqual(expectedInfo, actualInfo)
def getAllIntersectingRegions(cls, genome, chr, regList1, regList2):
    """Return the regions where a region of regList1 and a region of regList2 overlap.

    All borders are sorted; a running sum adds +1 at every start and -1 at
    every end. Positions where the sum equals 2 start an intersection, which
    lasts until the next border.

    NOTE(review): this assumes the regions within each list do not overlap
    each other, so that a sum of 2 means "inside one region of each list" --
    confirm with callers.

    regList1, regList2 -- iterables of objects with start and end attributes.
    Returns a list of GenomeRegion(genome, chr, start, end); the list is empty
    when either input is empty.
    """
    borders1 = [(reg.start, reg.end) for reg in regList1]
    borders2 = [(reg.start, reg.end) for reg in regList2]
    if not borders1 or not borders2:
        return []

    allBorders = borders1 + borders2
    allStarts = [start for start, end in allBorders]
    allEnds = [end for start, end in allBorders]

    # Ends are placed before starts so that the stable sort below keeps an end
    # ahead of a start at the same position; regions that merely touch are
    # therefore not reported as intersecting.
    borderArray = numpy.array(allEnds + allStarts)
    stepArray = numpy.array([-1] * len(allEnds) + [1] * len(allStarts))

    # Use merge sort, as it is stable
    sortOrder = borderArray.argsort(kind='merge')
    sortedBorders = borderArray[sortOrder]
    sortedSteps = stepArray[sortOrder]

    insideBoth = numpy.add.accumulate(sortedSteps) == 2
    intersectStarts = sortedBorders[insideBoth]
    # The border following each intersection start closes that intersection.
    intersectEnds = sortedBorders[1:][insideBoth[:-1]]
    assert len(intersectStarts) == len(intersectEnds)

    intersections = []
    for start, end in zip(intersectStarts, intersectEnds):
        intersections.append(GenomeRegion(genome, chr, start, end))
    return intersections
def testStdGetBoundingInfoDense(self):
    """Check getBoundingRegionInfo for queries inside densely stored bounding regions."""
    self._setUpShelve()
    self._commonStoreBoundingRegions(sparse=False)

    # (arguments of the expected BoundingRegionInfo, (chr, start, end) of the query)
    cases = (
        ((0, 1000000, 0, 1000000, 0, 0), ('chr21', 50000, 52000)),
        ((2000000, 2500000, 1000000, 1500000, 0, 0), ('chr21', 2050000, 2052000)),
        ((1000, 2000, 1500000, 1501000, 0, 0), ('chrM', 1000, 2000)),
    )
    for infoArgs, (chrName, start, end) in cases:
        expectedInfo = BoundingRegionInfo(*infoArgs)
        queryRegion = GenomeRegion('TestGenome', chrName, start, end)
        self.assertEqual(expectedInfo,
                         self._brShelve.getBoundingRegionInfo(queryRegion))
def __iter__(self):
    """Yield user bins derived from the bounding regions of the two tracks.

    Per chromosome of GenomeInfo.getExtendedChrList(self.genome):
      * no shelve for track 1: the whole chromosome is yielded;
      * no shelve for track 2, or only track 2 consists of whole-chromosome
        bounding regions: the bounding regions of track 1 are yielded;
      * only track 1 consists of whole-chromosome bounding regions: the
        bounding regions of track 2 are yielded;
      * otherwise: the intersections of both tracks' bounding regions.

    NOTE(review): when track 1 has no shelve, track 2's bounding regions are
    never consulted -- confirm this asymmetry is intended.
    """
    brShelve1 = self._getBoundingRegionShelve(self._trackName1)
    brShelve2 = self._getBoundingRegionShelve(self._trackName2)

    if brShelve1 is not None:
        allBrsAreWholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(brShelve1)
    else:
        allBrsAreWholeChrs1 = False

    if brShelve2 is not None:
        allBrsAreWholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(brShelve2)
    else:
        allBrsAreWholeChrs2 = False

    for chr in GenomeInfo.getExtendedChrList(self.genome):
        if brShelve1 is None:
            yield GenomeRegion(self.genome, chr, 0,
                               GenomeInfo.getChrLen(self.genome, chr))
            continue

        brList1 = brShelve1.getAllBoundingRegionsForChr(chr)

        if brShelve2 is None or (allBrsAreWholeChrs2 and not allBrsAreWholeChrs1):
            regions = brList1
        else:
            brList2 = brShelve2.getAllBoundingRegionsForChr(chr)
            if allBrsAreWholeChrs1 and not allBrsAreWholeChrs2:
                regions = brList2
            else:
                regions = self.getAllIntersectingRegions(self.genome, chr,
                                                         brList1, brList2)

        for reg in regions:
            yield reg
def _assertTrackViewLoading_Segments(self, trackData, indexList, start, end):
    """Load a cropped track view of segments and compare it with trackData.

    trackData -- mapping with 'start', 'end', 'val', 'strand', 'id', 'edges',
                 'weights', 'a' and 'b' columns.
    indexList -- indices into trackData of the elements expected in the view,
                 in iteration order.
    start, end -- borders of the loaded region; expected element coordinates are
                  clipped to the region and expressed relative to start.
    """
    region = GenomeRegion(genome='TestGenome', start=start, end=end)
    trackView = self.trackViewLoader.loadTrackView(trackData, region, 'crop', False)

    i = -1  # stays -1 for an empty view, so the final count check expects 0 elements
    for i, el in enumerate(trackView):
        if i >= len(indexList):
            # The view holds more elements than expected.
            self.fail()
        index = indexList[i]

        self.assertEqual(max(0, trackData['start'][index] - start), el.start())
        self.assertEqual(min(end, trackData['end'][index]) - start, el.end())
        self.assertAlmostEqual(trackData['val'][index], el.val())
        self.assertEqual(trackData['strand'][index], el.strand())
        self.assertEqual(trackData['id'][index], el.id())
        self.assertListsOrDicts(trackData['edges'][index], el.edges())
        self.assertListsOrDicts(trackData['weights'][index], el.weights())
        self.assertEqual(trackData['a'][index], el.a())
        self.assertEqual(trackData['b'][index], el.b())

        # Index attributes must not be available on elements of this view.
        self.assertRaises(AttributeError, lambda: el.leftIndex)
        self.assertRaises(AttributeError, lambda: el.rightIndex)

    self.assertEqual(len(indexList), i + 1)
def splitUserBin(region):
    """Split a region into compBins, based on borders as defined by getCompBinSize.

    The first piece starts at region.start and the last one ends at region.end;
    every border in between is a multiple of CompBinManager.getCompBinSize().

    region -- object exposing genome, chr, start and end.
    Returns a list of GenomeRegion objects; the list is empty when the
    rounded-down start is not smaller than region.end.
    """
    binSize = CompBinManager.getCompBinSize()

    # Round the start *down* to a whole compBin border. Floor division keeps
    # the border an integer even where "/" means true division.
    start = (int(region.start) // binSize) * binSize

    compBins = []
    while start < region.end:
        part = GenomeRegion(region.genome, region.chr)
        # Clip the bin to the borders of the requested region.
        part.start = max(start, region.start)
        part.end = min(start + binSize, region.end)
        compBins.append(part)
        start += binSize
    return compBins
def testNoBoundingRegions(self):
    """With nothing stored, getBoundingRegionInfo must mirror the queried borders.

    Checked for both the dense (sparse=False) and the sparse store.
    """
    for sparse in (False, True):
        self._setUpShelve()
        self._brShelve.storeBoundingRegions([], [], sparse)

        expectedInfo = BoundingRegionInfo(50000, 52000, 0, 0, 0, 0)
        queryRegion = GenomeRegion('TestGenome', 'chr21', 50000, 52000)
        self.assertEqual(expectedInfo,
                         self._brShelve.getBoundingRegionInfo(queryRegion))
def __new__(cls, genome):
    """Return a one-element list holding a minimal region of *genome*.

    The region covers positions 0 to 1 of the first chromosome reported by
    GenomeInfo.getChrList(genome). Note that a plain list (or None), not an
    instance of cls, is returned.

    Returns None when the genome has no chromosomes.
    """
    # Imported locally, as in the original; presumably to avoid an import
    # cycle -- confirm before moving to module level.
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    chrList = GenomeInfo.getChrList(genome)
    if len(chrList) > 0:
        # Reuse the list fetched above instead of querying GenomeInfo again.
        return [GenomeRegion(genome, chrList[0], 0, 1)]
    return None
def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
    """Check the number of track elements per chromosome.

    chrElCountDict -- mapping of chromosome name to the expected element count.
    customBins     -- mapping of chromosome name to the region to load; any
                      chromosome not listed is loaded in full
                      (0 .. GenomeInfo.getChrLen).
    """
    for chr, expectedCount in chrElCountDict.items():
        region = customBins[chr] if chr in customBins else \
            GenomeRegion(self.GENOME, chr, 0, GenomeInfo.getChrLen(self.GENOME, chr))

        tv = self._getTrackView(trackName, region, allowOverlaps)
        actualCount = sum(1 for _ in tv)
        self.assertEqual(expectedCount, actualCount)
def testBoundingRegionsNotPositive(self):
    """Storing a bounding region with start == end (0, 0) must raise InvalidFormatError."""
    self._setUpShelve()

    emptyRegion = GenomeRegion('TestGenome', 'chr21', 0, 0)
    brTuples = [BoundingRegionTuple(emptyRegion, 1)]

    self.assertRaises(InvalidFormatError,
                      self._brShelve.storeBoundingRegions,
                      brTuples, ['chr21'], sparse=True)
def getAllBoundingRegionsForChr(self, chr):
    """Generate a GenomeRegion for every bounding region stored for *chr*.

    self._updateContentsIfNecessary(chr) is called first. Nothing is yielded
    when the chromosome is absent from self._contents.
    """
    self._updateContentsIfNecessary(chr)
    if chr not in self._contents:
        return

    brInfoHolder = self._contents[chr]
    # Marked "Temporary" in the original: the holder may be a plain dict
    # instead of an object exposing brInfos.
    if isinstance(brInfoHolder, dict):
        brInfosForChr = brInfoHolder.values()
    else:
        brInfosForChr = brInfoHolder.brInfos

    for brInfo in brInfosForChr:
        yield GenomeRegion(self._genome, chr, brInfo.start, brInfo.end)
def __cmp__(self, other):
    """Python 2 three-way comparison on all element attributes.

    Compares genome, chr, start, end, val, strand, id, edges, weights and
    extra, in that order. If that comparison raises (for instance because
    *other* lacks some of these attributes) and *other* is a GenomeRegion,
    the comparison falls back to GenomeRegion.__cmp__.

    Returns -1 when *other* is None.
    """
    if other is None:
        return -1

    attrNames = ('genome', 'chr', 'start', 'end', 'val',
                 'strand', 'id', 'edges', 'weights', 'extra')
    try:
        return cmp([getattr(self, name) for name in attrNames],
                   [getattr(other, name) for name in attrNames])
    except Exception:
        # Deliberately broad, but no longer a bare "except:", so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        if isinstance(other, GenomeRegion):
            return GenomeRegion.__cmp__(self, other)

    # NOTE(review): as in the original, a failed comparison against something
    # that is not a GenomeRegion returns None here -- confirm whether raising
    # would be preferable.
    return None
def _assertLists(self, tv, starts, ends, vals, strands, ids, edges, weights, extras, region):
    """Compare a track view with the expected per-element columns.

    Checks, in order: the length of every given column against the number of
    elements in tv, the genome anchor, every element's accessors, and finally
    the *AsNumpyArray() accessors of the view.

    tv      -- the track view under test.
    starts, ends, vals, strands, ids, edges, weights
            -- expected columns; None means the accessor must return None.
    extras  -- mapping of extra-column name to expected column, or None.
    region  -- (start, end) pair of the expected genome anchor on
               self.genome / self.chr.
    """
    if extras is None:
        extras = OrderedDict()

    # Count the elements once instead of re-iterating the view per column.
    numElements = sum(1 for _ in tv)

    # NOTE(review): edges is not part of this length check in the original;
    # possibly an oversight -- confirm before adding it.
    for attr in [starts, ends, vals, strands, ids, weights] + list(extras.values()):
        # Identity test: "!= None" is ambiguous for numpy arrays.
        if attr is not None:
            self.assertEqual(len(attr), numElements)

    self.assertEqual(GenomeRegion(genome=self.genome, chr=self.chr,
                                  start=region[0], end=region[1]),
                     tv.genomeAnchor)

    for i, el in enumerate(tv):
        self.assertEqual(starts[i] if starts is not None else None, el.start())
        self.assertEqual(ends[i] if ends is not None else None, el.end())

        if vals is None:
            self.assertEqual(None, el.val())
        else:
            self.assertAlmostEqual(vals[i], el.val())

        self.assertEqual(strands[i] if strands is not None else None, el.strand())
        self.assertEqual(ids[i] if ids is not None else None, el.id())
        self.assertListsOrDicts(edges[i] if edges is not None else None, el.edges())
        self.assertListsOrDicts(weights[i] if weights is not None else None, el.weights())

        # extras is never None at this point (see the default above).
        for key in extras:
            self.assertEqual(extras[key][i], getattr(el, key)())

    self._smartAssertListWithNone(starts, tv.startsAsNumpyArray())
    self._smartAssertListWithNone(ends, tv.endsAsNumpyArray())
    self._smartAssertListWithNone(vals, tv.valsAsNumpyArray())
    self._smartAssertListWithNone(strands, tv.strandsAsNumpyArray())
    self._smartAssertListWithNone(ids, tv.idsAsNumpyArray())
    self._smartAssertListWithNone(edges, tv.edgesAsNumpyArray())
    self._smartAssertListWithNone(weights, tv.weightsAsNumpyArray())

    for key in extras:
        self._smartAssertListWithNone(extras[key], tv.extrasAsNumpyArray(key))
def _assertTrackViewLoading_Numbers(self, trackData, start, end):
    """Load a cropped track view and compare every column with trackData[start:end].

    trackData -- mapping with 'val', 'strand', 'id', 'edges', 'weights', 'a'
                 and 'b' columns; each is sliced with [start:end] to obtain
                 the expected values.
    start, end -- borders of the loaded region.
    """
    region = GenomeRegion(genome='TestGenome', start=start, end=end)
    trackView = self.trackViewLoader.loadTrackView(trackData, region, 'crop', False)

    # Each column name doubles as the name of the element accessor method.
    for column in ('val', 'strand', 'id', 'edges', 'weights', 'a', 'b'):
        expected = trackData[column][start:end]
        actual = [getattr(el, column)() for el in trackView]
        self.assertListsOrDicts(expected, actual)
def getBoundingRegionTuples(self):
    """Return the bounding region tuples, falling back to whole chromosomes.

    Tuples whose region has no chromosome are dropped. If any remain they are
    returned and self._boundingRegionsAndGEsCorrespond is set to True.
    Otherwise one tuple per chromosome of self.getAllChrs() is created,
    spanning 0 to the chromosome length with self.getNumElementsForChr(chr)
    elements, and the flag is set to False.
    """
    boundingRegionTuples = [brTuple for brTuple in self._getBoundingRegionTuples()
                            if brTuple.region.chr is not None]

    if boundingRegionTuples:
        self._boundingRegionsAndGEsCorrespond = True
        return boundingRegionTuples

    from gtrackcore.input.core.GenomeElementSource import BoundingRegionTuple
    from gtrackcore.track.core.GenomeRegion import GenomeRegion
    from gtrackcore.metadata.GenomeInfo import GenomeInfo

    boundingRegionTuples = []
    for chr in self.getAllChrs():
        # NOTE(review): the region is created without a genome, as in the
        # original -- confirm this is intended.
        wholeChr = GenomeRegion(chr=chr, start=0,
                                end=GenomeInfo.getChrLen(self._geSource.genome, chr))
        boundingRegionTuples.append(
            BoundingRegionTuple(wholeChr, self.getNumElementsForChr(chr)))

    self._boundingRegionsAndGEsCorrespond = False
    return boundingRegionTuples
def __init__(self, vals=True, strands=True, anchor=None, valDType='float64'):
    """Create a sample track view on TestGenome chr21 with values and strands only.

    vals    -- the values, or True (requires *anchor*, since the element count
               cannot then be taken from vals).
    strands -- the strands, or True.
    anchor  -- [start, end] of the genome anchor; when omitted it becomes
               [10, 10 + len(vals)].
    valDType -- dtype passed to self._createList for the values.

    NOTE(review): presumably self._createList substitutes the random list when
    its first argument is True -- confirm in _createList.
    """
    # Identity tests: "!=" / "==" against a numpy array would give an
    # element-wise result whose truth value is ambiguous.
    assert vals is not True or anchor is not None

    if anchor is None:
        numElements = len(vals)
        anchor = [10, 10 + numElements]
    else:
        numElements = anchor[1] - anchor[0]

    vals = self._createList(vals, getRandValList(numElements), valDType)
    strands = self._createList(strands, getRandStrandList(numElements), 'bool8')

    genomeAnchor = GenomeRegion('TestGenome', 'chr21', anchor[0], anchor[1])
    TrackView.__init__(self, genomeAnchor, None, None, vals, strands,
                       None, None, None, 'crop', False)