def _next(self, line):
    """Parse one tab-separated BED data line into a GenomeElement.

    Lines starting with '#' are skipped (None is returned). The number of
    columns seen on the first data line is stored in self._numCols; later
    lines must have exactly that many columns.

    Raises:
        InvalidFormatError: if the column count differs from earlier data
            lines, or lies outside [MIN_NUM_COLS, MAX_NUM_COLS].
    """
    if line.startswith('#'):
        return

    ge = GenomeElement(self._genome)
    cols = line.split('\t')
    colCount = len(cols)

    if self._numCols is None:
        # First data line: remember its width for all following lines.
        self._numCols = colCount
    elif colCount != self._numCols:
        raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')

    # NOTE(review): the original indentation was lost; this range check is
    # assumed to run for every data line, not only the first one - confirm.
    if not (self.MIN_NUM_COLS <= self._numCols <= self.MAX_NUM_COLS):
        raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

    ge.chr = self._checkValidChr(cols[0])
    ge.start = self._checkValidStart(ge.chr, int(cols[1]))
    validEnd = self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start)
    self._parseEnd(ge, validEnd)
    self._parseName(ge, cols)
    self._parseVal(ge, cols)

    if self._numCols >= 6:
        ge.strand = self._getStrandFromString(cols[5])

    # Extra columns start at index 6 (the 7th column) and are copied verbatim.
    for colIdx, extraCol in enumerate(self.BED_EXTRA_COLUMNS, 6):
        if colIdx < self._numCols:
            setattr(ge, extraCol, cols[colIdx])

    return ge
def _next(self, line):
    """Parse a tab-separated line (chr, start, end, value) into a GenomeElement.

    The chromosome is validated with self._checkValidChr, start and end are
    converted with int(), and the fourth column is handed to self._parseVal.
    """
    fields = line.split('\t')

    element = GenomeElement(self._genome)
    element.chr = self._checkValidChr(fields[0])
    element.start = int(fields[1])
    element.end = int(fields[2])
    self._parseVal(element, fields[3])

    return element
def _next(self, line):
    """Handle one FASTA line.

    A header line (starting with '>') closes the current bounding region,
    resets the element count and sets the current chromosome to the first
    whitespace-separated token after '>'; nothing is returned for it.
    Any other line is returned as a GenomeElement whose val is the line
    converted to an array of single characters.

    Raises:
        InvalidFormatError: if a sequence line appears before any header.
    """
    if not line.startswith('>'):
        if self._chr is None:
            raise InvalidFormatError('FASTA file does not start with the ">" character.')

        self._elCount += len(line)
        seqElement = GenomeElement(self._genome, self._chr)
        seqElement.val = np.fromstring(line, dtype='S1')
        return seqElement

    # Header line: finish the previous sequence and start a new one.
    self._appendBoundingRegionTuple()
    self._elCount = 0
    headerTokens = line[1:].split()
    self._chr = self._checkValidChr(headerTokens[0])
def _next(self, line):
    """Process a single line of a FASTA file.

    Header lines ('>' prefix) append the bounding region tuple for the
    sequence read so far, zero the element counter and update self._chr
    from the first token of the header; they yield no element.
    Sequence lines add their length to the element counter and are returned
    as a GenomeElement carrying the characters of the line in `val`.

    Raises:
        InvalidFormatError: if sequence data is seen while self._chr is None.
    """
    isHeader = line.startswith('>')

    if isHeader:
        self._appendBoundingRegionTuple()
        self._elCount = 0
        chrName = line[1:].split()[0]
        self._chr = self._checkValidChr(chrName)
        return None

    if self._chr is None:
        raise InvalidFormatError(
            'FASTA file does not start with the ">" character.')

    self._elCount += len(line)
    ge = GenomeElement(self._genome, self._chr)
    ge.val = np.fromstring(line, dtype='S1')
    return ge
def next(self): self._curPos += 1 if self._curPos % 10e6 == 0: print '.', if self._curPos >= len(self._tv.genomeAnchor): raise StopIteration if self._exhausted: return None if self._curEl is None: try: self._curEl = self._tvIter.next() except StopIteration: self._exhausted = True return None if self._curPos == self._curEl.start(): trackEl = self._curEl genome = self._tv.genomeAnchor.genome chr = self._tv.genomeAnchor.chr #print 'EL: ',GenomeElement(genome,chr, trackEl.start(), trackEl.end(), trackEl.val(), trackEl.strand()) outEl = GenomeElement(genome, chr, trackEl.start(), trackEl.end(), trackEl.val(), trackEl.strand()) self._curEl = None return outEl else: #print self._curPos,' AND ', self._curEl.start() #print 'None' return None
def _getIter(elList, valDataType, valDim, edgeWeightDataType, edgeWeightDim, brList=None):
    """Build a MyGeIter filled with genome elements and bounding regions.

    Args:
        elList: sequence of indexable entries (genome, chr, start, end) with
            an optional fifth item, a mapping of attribute name -> value that
            is set on the created GenomeElement.
        valDataType, valDim, edgeWeightDataType, edgeWeightDim: passed
            unchanged to the MyGeIter constructor.
        brList: optional sequence of entries (genome, chr, start, end, x);
            the first four items build a GenomeRegion and the fifth is passed
            as second argument to BoundingRegionTuple. Defaults to no
            bounding regions.

    Returns:
        The populated MyGeIter instance.
    """
    # A None default avoids sharing one mutable list object between calls.
    if brList is None:
        brList = ()

    geIter = MyGeIter(valDataType, valDim, edgeWeightDataType, edgeWeightDim)

    for el in elList:
        ge = GenomeElement(genome=el[0], chr=el[1], start=el[2], end=el[3])
        if len(el) == 5:
            extraAttrs = el[4]
            for prefix in extraAttrs:
                setattr(ge, prefix, extraAttrs[prefix])
        geIter.iter.append(ge)

    for brSpec in brList:
        br = GenomeRegion(genome=brSpec[0], chr=brSpec[1], start=brSpec[2], end=brSpec[3])
        geIter.boundingRegionTuples.append(BoundingRegionTuple(br, brSpec[4]))

    return geIter
def _parseDeclarationLine(self, line):
    """Update parser state from a declaration line.

    Reads chr/start/step/span from the line, updates the fixed-step, span,
    step and function-type flags on self, and starts a new bounding region
    when self._shouldExpandBoundingRegion says the current one cannot be
    extended.

    Returns:
        A blank GenomeElement (val=NaN, isBlankElement=True) ending at the
        new start when a new bounding region is started for a step function;
        otherwise None.

    NOTE(review): the original indentation was lost; the nesting below is a
    reconstruction and should be confirmed against the upstream file.
    """
    blankGE = None

    chrom, start, step, span = self._getDeclarationLineAttrValues(line)
    self._fixedStep = self._checkFixedStep(line, start, step)

    chrom = self._handleChr(chrom)
    self._span = self._handleSpan(span)
    self._isPoints = (self._span == 1)

    if self._fixedStep:
        start = self._handleStart(chrom, start)
        self._step = self._handleStep(step)

        stepEqualsSpan = (self._step == self._span)
        self._isStepFunction = (stepEqualsSpan and self._step > 1)
        self._isFunction = (stepEqualsSpan and self._step == 1)

        if self._isFunction:
            self._genomeElement.chr = chrom

        if not self._shouldExpandBoundingRegion(chrom, start):
            # self._chr is still the chromosome of the previous declaration
            # line, so the region being closed belongs to that chromosome.
            if self._chr is not None:
                self._appendBoundingRegionTuple()

            self._start = start
            self._curElCountInBoundingRegion = 0

            if self._isStepFunction:
                blankGE = GenomeElement(genome=self._genome, chr=chrom,
                                        end=self._start, val=numpy.nan,
                                        isBlankElement=True)

    self._chr = chrom
    return blankGE
def next(self):
    """Return the single GenomeElement of this iterator, once.

    The first call returns an element built from self._genome, the
    chromosome of self._region and self._valSlice; every later call raises
    StopIteration.
    """
    if not self._returnedOneElement:
        self._returnedOneElement = True
        return GenomeElement(genome=self._genome,
                             chr=self._region.chr,
                             val=self._valSlice)

    raise StopIteration
def testAssignAndRetrieve(self):
    """Check attribute access on GenomeElement for core and extra fields.

    Covers extras given through the `extra` dict, extras given as keyword
    arguments or assigned afterwards, and the AttributeError raised for an
    attribute that was never set.
    """
    viaExtraDict = GenomeElement('TestGenome', start=5, val=1.0,
                                 extra={'a':1,'b':2}, orderedExtraKeys=['a','b'])
    expectedAttrs = [
        ('genome', 'TestGenome'),
        ('chr', None),
        ('start', 5),
        ('end', None),
        ('val', 1.0),
        ('strand', None),
        ('a', 1),
        ('b', 2),
        ('extra', {'a':1,'b':2}),
        ('orderedExtraKeys', ['a', 'b']),
    ]
    for attrName, expected in expectedAttrs:
        self.assertEqual(getattr(viaExtraDict, attrName), expected)

    viaKeyword = GenomeElement('TestGenome', a=1)
    viaKeyword.b = 2
    expectedAttrs = [
        ('genome', 'TestGenome'),
        ('a', 1),
        ('b', 2),
        ('extra', {'a':1,'b':2}),
        ('orderedExtraKeys', ['a', 'b']),
    ]
    for attrName, expected in expectedAttrs:
        self.assertEqual(getattr(viaKeyword, attrName), expected)

    self.assertRaises(AttributeError, getattr, viaKeyword, 'nonExisting')
def testAssignAndRetrieve(self):
    """Verify that GenomeElement exposes core and extra values as attributes.

    First an element created with an explicit `extra` dict is inspected,
    then one whose extras come from a keyword argument and a later
    assignment. Reading an attribute that was never set must raise
    AttributeError.
    """
    extraKeys = ["a", "b"]
    extraValues = {"a": 1, "b": 2}

    elem = GenomeElement("TestGenome", start=5, val=1.0,
                         extra={"a": 1, "b": 2}, orderedExtraKeys=["a", "b"])
    self.assertEqual(elem.genome, "TestGenome")
    for unsetName in ("chr",):
        self.assertEqual(getattr(elem, unsetName), None)
    self.assertEqual(elem.start, 5)
    self.assertEqual(elem.end, None)
    self.assertEqual(elem.val, 1.0)
    self.assertEqual(elem.strand, None)
    self.assertEqual(elem.a, 1)
    self.assertEqual(elem.b, 2)
    self.assertEqual(elem.extra, extraValues)
    self.assertEqual(elem.orderedExtraKeys, extraKeys)

    elem = GenomeElement("TestGenome", a=1)
    elem.b = 2
    self.assertEqual(elem.genome, "TestGenome")
    self.assertEqual(elem.a, 1)
    self.assertEqual(elem.b, 2)
    self.assertEqual(elem.extra, extraValues)
    self.assertEqual(elem.orderedExtraKeys, extraKeys)

    self.assertRaises(AttributeError, getattr, elem, "nonExisting")
def _next(self, line):
    """Parse one GFF line into a GenomeElement.

    A '##FASTA' line ends iteration; other lines starting with '#' are
    skipped (None is returned). Data lines must have nine tab-separated
    columns, each of which is URL-unquoted. The start column is reduced by
    one before validation. 'ID' and 'Name' entries (case-insensitive keys)
    of the attribute column are additionally stored as ge.id and ge.name.

    Raises:
        StopIteration: on a '##FASTA' line.
        InvalidFormatError: if the line does not have nine columns.
    """
    if line.startswith('##FASTA'):
        raise StopIteration

    if line.startswith('#'):
        return None

    rawCols = line.split('\t')
    cols = [unquote(rawCol) for rawCol in rawCols]
    if len(cols) != 9:
        raise InvalidFormatError("Error: GFF files must contain 9 tab-separated columns")

    seqid, source, thirdCol, startStr, endStr, sixthCol, strandStr, phase, attributes = cols

    ge = GenomeElement(self._genome)
    ge.chr = self._checkValidChr(seqid)
    ge.source = source
    self._parseThirdCol(ge, thirdCol)
    ge.start = self._checkValidStart(ge.chr, int(startStr) - 1)
    ge.end = self._checkValidEnd(ge.chr, int(endStr), start=ge.start)
    self._parseSixthCol(ge, sixthCol)
    ge.strand = self._getStrandFromString(strandStr)
    ge.phase = phase
    ge.attributes = attributes

    # Split the raw (still quoted) attribute column, so that escaped ';' and
    # '=' characters inside values do not act as separators.
    for attr in rawCols[8].split(';'):
        keyAndVal = attr.split('=')
        if len(keyAndVal) != 2:
            continue
        key, val = keyAndVal
        loweredKey = key.lower()
        if loweredKey == 'id':
            ge.id = unquote(val)
        elif loweredKey == 'name':
            ge.name = unquote(val)

    return ge
def __init__(self, fn, genome=None, trackName=None, external=False, printWarnings=True, strToUseInsteadOfFn='', *args, **kwArgs):
    """Store the constructor arguments and initialise bookkeeping fields.

    A GenomeElement bound to `genome` is created and kept in
    self._genomeElement. The prefix list and the last warning start out as
    None. Additional positional and keyword arguments are accepted but not
    used here.
    """
    # Where the data comes from.
    self._fn = fn
    self._strToUseInsteadOfFn = strToUseInsteadOfFn
    self._external = external

    # What the data describes.
    self._genome = genome
    self._genomeElement = GenomeElement(genome)
    self._trackName = trackName
    self._prefixList = None

    # Warning handling.
    self._printWarnings = printWarnings
    self._lastWarning = None
def next(self):
    """Return the next GenomeElement assembled from the per-field lists.

    Each field is read from its list at the current index; a list that is
    too short for the index contributes None for that field.

    Raises:
        StopIteration: when the index reaches len(self).
    """
    self._index += 1
    if self._index >= len(self):
        raise StopIteration

    idx = self._index

    def valueAt(values):
        # Lists may be shorter than the iterator; missing entries become None.
        return values[idx] if idx < len(values) else None

    return GenomeElement(start=valueAt(self._startList),
                         end=valueAt(self._endList),
                         strand=valueAt(self._strandList),
                         val=valueAt(self._valList),
                         id=valueAt(self._idList),
                         edges=valueAt(self._edgesList),
                         weights=valueAt(self._weightsList),
                         extra=valueAt(self._extraList))
def _next(self, line):
    """Parse one line of a WIG-style file.

    Declaration lines are delegated to self._parseDeclarationLine; its
    result is returned only when it is not None. Lines starting with '#'
    are skipped. Data lines are turned into a GenomeElement:

    * fixed step: the value is column 0; for "function" tracks the shared
      self._genomeElement is updated and returned, otherwise the start is
      taken from self._getFixedStepCurElStart().
    * variable step: column 0 is the start (reduced by one) and column 1
      the value.

    `end` is only set when the track is not a points track, and `start` is
    dropped (None) for step functions.

    Returns:
        A GenomeElement, or None when the line produces no element.
    """
    if self._isDeclarationLine(line):
        declGE = self._parseDeclarationLine(line)
        if declGE is not None:
            return declGE
        return None

    if line.startswith('#'):
        return None

    cols = line.split()
    self._checkDataLineCols(cols)

    # The builtin float is used instead of numpy.float: the latter was only
    # an alias of the builtin and has been removed from recent NumPy releases.
    if self._fixedStep:
        self._curElCountInBoundingRegion += 1
        val = float(self._handleNan(cols[0]))

        if self._isFunction:
            # Function tracks reuse one element object and only swap its value.
            self._genomeElement.val = val
            return self._genomeElement

        start = self._checkValidStart(self._chr, self._getFixedStepCurElStart())
    else:
        start = self._checkValidStart(self._chr, int(cols[0]) - 1)
        val = float(self._handleNan(cols[1]))

    end = None
    if not self._isPoints:
        end = self._checkValidEnd(self._chr, self._getEnd(start), start)

    if self._isStepFunction:
        start = None

    return GenomeElement(genome=self._genome, chr=self._chr, start=start, end=end, val=val)
def _wrappedTrackElsGenerator(self):
    """Yield a GenomeElement for every track element in every bounding region.

    The track is fetched once; a track view is then created per bounding
    region, and each of its track elements is converted with
    GenomeElement.createGeFromTrackEl using the view's track format and
    self._globalCoords.
    """
    track = self._getTrack()
    for boundingRegion in self._boundingRegions:
        trackView = self._getTrackView(track, boundingRegion)
        for trackEl in trackView:
            yield GenomeElement.createGeFromTrackEl(trackEl, trackView.trackFormat,
                                                    globalCoords=self._globalCoords)
def testExclude(self):
    """Check GenomeElement.exclude for chr21:100-200 against various regions.

    Each case lists the region to exclude as (chr, start, end) and the
    (start, end) pairs on chr21 that are expected to remain.
    """
    cases = [
        # excluded region          remaining pieces
        (('chr21', 90, 100),       [(100, 200)]),
        (('chr21', 200, 210),      [(100, 200)]),
        (('chrM', 100, 110),       [(100, 200)]),
        (('chr21', 100, 110),      [(110, 200)]),
        (('chr21', 90, 110),       [(110, 200)]),
        (('chr21', 190, 200),      [(100, 190)]),
        (('chr21', 190, 210),      [(100, 190)]),
        (('chr21', 90, 210),       []),
        (('chr21', 140, 160),      [(100, 140), (160, 200)]),
    ]

    for (exChr, exStart, exEnd), remaining in cases:
        expected = [GenomeElement('TestGenome', 'chr21', pieceStart, pieceEnd)
                    for pieceStart, pieceEnd in remaining]
        # A fresh base element is built for every case.
        base = GenomeElement('TestGenome', 'chr21', 100, 200)
        actual = base.exclude(GenomeElement('TestGenome', exChr, exStart, exEnd))
        self.assertEqual(expected, actual)
def testExtend(self):
    """Check GenomeElement.extend on chr21:100-200.

    Each case gives the expected (start, end), the extension amount and any
    keyword arguments passed to extend().
    """
    cases = [
        # expected start, expected end, amount, keyword arguments
        (100, 200, 0, {}),
        (0, 200, -100, {}),
        (-100, 200, -200, {'ensureValidity': False}),
        (0, 200, -200, {'ensureValidity': True}),
        (100, 300, 100, {}),
        (100, 50000200, 50000000, {'ensureValidity': False}),
        (100, 46944323, 50000000, {'ensureValidity': True}),
    ]

    for expStart, expEnd, amount, kwargs in cases:
        expected = GenomeElement('TestGenome', 'chr21', expStart, expEnd)
        # A fresh base element is built for every case.
        base = GenomeElement('TestGenome', 'chr21', 100, 200)
        self.assertEqual(expected, base.extend(amount, **kwargs))
def testEqual(self):
    """Check GenomeElement equality.

    Two elements built from identical arguments must compare equal;
    changing any single positional argument, or adding an entry to `extra`,
    must make them unequal.
    """
    def build(changedPos=None, newValue=None, extra=None):
        # Fresh argument objects are created on every call, so the lists and
        # dicts are never shared between the two elements being compared.
        args = ['TestGenome', 'chr21', 10, 100, 5, True, 'id', ['id2', 'id3'], [5, 6]]
        if changedPos is not None:
            args[changedPos] = newValue
        if extra is None:
            extra = {'source': 'source'}
        return GenomeElement(*args, extra=extra)

    self.assertEqual(build(), build())

    # (position of the changed positional argument, replacement value)
    positionalVariants = [
        (0, 'NCBI46'),
        (1, 'chrM'),
        (2, 20),
        (3, 110),
        (4, 6),
        (5, False),
        (6, 'id4'),
        (7, ['id2', 'id4']),
        (8, [5, 7]),
    ]
    for changedPos, newValue in positionalVariants:
        self.assertNotEqual(build(), build(changedPos, newValue))

    self.assertNotEqual(build(),
                        build(extra={'source': 'source', 'other': 'value'}))
def next(self):
    """Return the next track element converted to a GenomeElement.

    The element is taken from self._tvIter and converted with
    GenomeElement.createGeFromTrackEl using the track view's format. A
    StopIteration raised by the underlying iterator is not caught here.
    """
    nextTrackEl = self._tvIter.next()
    return GenomeElement.createGeFromTrackEl(nextTrackEl, self._tv.trackFormat)
def testContains(self):
    """Check GenomeElement.contains for chr21:10-100 against other regions."""
    cases = [
        # expected, chr, start, end of the other element
        (True, 'chr21', 10, 100),
        (True, 'chr21', 20, 80),
        (False, 'chr21', 10, 101),
        (False, 'chr21', 9, 100),
        (False, 'chr21', 9, 101),
        (False, 'chr21', 0, 10),
        (False, 'chrM', 20, 80),
    ]

    for expected, otherChr, otherStart, otherEnd in cases:
        check = self.assertTrue if expected else self.assertFalse
        base = GenomeElement('TestGenome', 'chr21', 10, 100)
        check(base.contains(GenomeElement('TestGenome', otherChr, otherStart, otherEnd)))
def testWriteElement(self):
    """Check that writeElement hands the element to every output file.

    After writing an empty GenomeElement, each entry of the `_files`
    mapping must hold an equal element in its `ge` attribute.
    """
    prefixes = ['start', 'end', 'val', 'strand', 'id', 'edges', 'weights', 'cat']
    setup = SetupDir(self.path, prefixes)

    written = GenomeElement()
    setup.od.writeElement(written)

    for outFile in setup.od._files.values():
        self.assertEqual(written, outFile.ge)
def testOverlaps(self):
    """Check GenomeElement.overlaps for chr21:10-100 against other regions."""
    cases = [
        # expected, chr, start, end of the other element
        (True, 'chr21', 10, 100),
        (True, 'chr21', 20, 80),
        (True, 'chr21', 10, 101),
        (True, 'chr21', 9, 100),
        (True, 'chr21', 9, 101),
        (False, 'chr21', 0, 10),
        (False, 'chr21', 100, 110),
        (False, 'chrM', 20, 80),
    ]

    for expected, otherChr, otherStart, otherEnd in cases:
        check = self.assertTrue if expected else self.assertFalse
        base = GenomeElement('TestGenome', 'chr21', 10, 100)
        check(base.overlaps(GenomeElement('TestGenome', otherChr, otherStart, otherEnd)))
def testTouches(self):
    """Check GenomeElement.touches for chr21:10-100 against other regions."""
    cases = [
        # expected, chr, start, end of the other element
        (False, 'chr21', 10, 100),
        (False, 'chr21', 20, 80),
        (False, 'chr21', 10, 101),
        (False, 'chr21', 9, 100),
        (False, 'chr21', 9, 101),
        (True, 'chr21', 0, 10),
        (True, 'chr21', 100, 110),
        (False, 'chr21', 0, 9),
        (False, 'chr21', 101, 110),
        (False, 'chrM', 20, 80),
    ]

    for expected, otherChr, otherStart, otherEnd in cases:
        check = self.assertTrue if expected else self.assertFalse
        base = GenomeElement('TestGenome', 'chr21', 10, 100)
        check(base.touches(GenomeElement('TestGenome', otherChr, otherStart, otherEnd)))
def _next(self, line):
    """Convert a single GFF line to a GenomeElement.

    Iteration stops at a '##FASTA' line, and any other line beginning with
    '#' yields None. A data line must consist of nine tab-separated columns;
    every column is URL-unquoted before use. The value of the start column
    is decremented by one before it is validated. If the attribute column
    holds 'ID=' or 'Name=' entries (keys compared case-insensitively), their
    unquoted values are stored as ge.id and ge.name.

    Raises:
        StopIteration: on a '##FASTA' line.
        InvalidFormatError: if the number of columns is not nine.
    """
    if line.startswith('##FASTA'):
        raise StopIteration
    if line.startswith('#'):
        return None

    origCols = line.split('\t')
    cols = [unquote(col) for col in origCols]
    if len(cols) != 9:
        raise InvalidFormatError(
            "Error: GFF files must contain 9 tab-separated columns")

    ge = GenomeElement(self._genome)
    ge.chr = self._checkValidChr(cols[0])
    ge.source = cols[1]
    self._parseThirdCol(ge, cols[2])

    zeroBasedStart = int(cols[3]) - 1
    ge.start = self._checkValidStart(ge.chr, zeroBasedStart)
    ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

    self._parseSixthCol(ge, cols[5])
    ge.strand = self._getStrandFromString(cols[6])
    ge.phase = cols[7]
    ge.attributes = cols[8]

    # Only well-formed 'key=value' pairs from the raw attribute column are
    # inspected; everything else is ignored.
    rawAttrs = origCols[8].split(';')
    pairs = [attr.split('=') for attr in rawAttrs]
    for pair in pairs:
        if len(pair) == 2:
            key, val = pair
            keyLower = key.lower()
            if keyLower == 'id':
                ge.id = unquote(val)
            elif keyLower == 'name':
                ge.name = unquote(val)

    return ge