def _checkValidStart(self, chr, start): if start < 0: raise InvalidFormatError('Error: start position is negative: %s' % start) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ start > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: start position is larger than chromosome size (%s) < %d' % \ (GenomeInfo.getChrLen(self.genome, chr), start)) return start
def _checkValidStart(self, chr, start): if start < 0: raise InvalidFormatError('Error: start position is negative: %s' % start) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ start > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, start, GenomeInfo.getChrLen(self.genome, chr))) return start
def _checkValidEnd(self, chr, end, start=None): if end < 0: raise InvalidFormatError('Error: end position is negative: %s' % end) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ end-1 > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: end position is larger than chromosome size (%s)' % \ GenomeInfo.getChrLen(self.genome, chr)) if start is not None and end <= start: raise InvalidFormatError('Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start)) return end
def _getBoundingRegionTupleList(self, case, sortedAssertElList):
    """Build the list of BoundingRegionTuple objects for a test case.

    Bounding regions of ``case`` that have a chromosome are used when there
    are any; a missing start becomes 0 and a missing end becomes the
    chromosome length. Otherwise one whole-chromosome region is created per
    chromosome found in ``sortedAssertElList`` (in sorted chromosome order),
    with the number of elements on that chromosome as its count.
    """
    explicitBrs = [br for br in sorted(case.boundingRegionsAssertList)
                   if br.region.chr is not None]

    if len(explicitBrs) == 0:
        allChrs = [ge.chr for ge in sortedAssertElList]
        countPerChr = OrderedDict([(c, allChrs.count(c)) for c in sorted(set(allChrs))])
        return [BoundingRegionTuple(
                    GenomeRegion(self.GENOME, chr=c, start=0,
                                 end=GenomeInfo.getChrLen(self.GENOME, c)),
                    numEls)
                for c, numEls in countPerChr.iteritems()]

    brTuples = []
    for br in explicitBrs:
        startPos = br.region.start if br.region.start is not None else 0
        if br.region.end is not None:
            endPos = br.region.end
        else:
            endPos = GenomeInfo.getChrLen(self.GENOME, br.region.chr)
        region = GenomeRegion(self.GENOME, chr=br.region.chr, start=startPos, end=endPos)
        brTuples.append(BoundingRegionTuple(region, br.elCount))
    return brTuples
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'"""
    # NOTE(review): the docstring above is kept verbatim; it looks like it is
    # used as command-line usage text - confirm before rewording it.
    #
    # Writes 'assemblyGaps.bed' under the genome's 'gaps' property track path.
    # For every chromosome in the extended chromosome list, each maximal run
    # of sequence characters that are not in assemblyChars is written as one
    # "chr<TAB>start<TAB>end" line (end-exclusive). If no gap is found at all,
    # a single placeholder line for the first chromosome is written instead.
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    # 'with' guarantees the file is closed even if reading the sequence fails.
    with open(outFn, 'w') as outFile:
        seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))
        chrList = GenomeInfo.getExtendedChrList(genome)

        anyGaps = False
        for chr in chrList:
            chrRegion = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr))
            seqTV = seqTrack.getTrackView(chrRegion)
            seq = seqTV.valsAsNumpyArray()

            # Indexes of all positions whose character is not an assembly character.
            isAssemblyChar = numpy.logical_or.reduce([seq == x for x in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssemblyChar)]

            # A difference of 1 between neighbouring indexes means both belong
            # to the same gap; drop the interior positions so that only the
            # first index (begin) and last index + 1 (end) of each run remain.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            gapBeginIndexes = numpy.delete(gapIndexes, (numpy.arange(len(gapIndexDiff)) + 1)[gapIndexDiff == 1])
            gapEndIndexes = numpy.delete(gapIndexes + 1, numpy.arange(len(gapIndexDiff))[gapIndexDiff == 1])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for gapBegin, gapEnd in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                outFile.write('\t'.join([chr, str(gapBegin), str(gapEnd)]) + os.linesep)

        if not anyGaps:
            # Placeholder element so that the file is never empty.
            outFile.write('\t'.join([chrList[0], '1', '1']))
def _compute(self):
    """Turn the segments of the child result into a step-function TrackView.

    Every segment start contributes +1 and every segment end -1 at its
    position; the running sum over the sorted positions gives the value of
    each step. The steps are padded so that they begin at position 0 and end
    at the last position of the chromosome (chromosome length - 1).
    """
    tv = self._children[0].getResult()
    starts, ends = tv.startsAsNumpyArray(), tv.endsAsNumpyArray()

    borderDict = defaultdict(int)
    for startPos, endPos in zip(starts, ends):
        borderDict[startPos] += 1
        borderDict[endPos] -= 1
    sortedPos = sorted(borderDict)

    # Handle start border issues: if no segment starts at 0, prepend a step
    # of value 0 from position 0. An empty track also ends up here.
    if sortedPos and sortedPos[0] == 0:
        startList, endList, valList = sortedPos, sortedPos[1:], []
    else:
        startList, endList, valList = [0] + sortedPos, sortedPos, [0]

    # Handle end border issues: close the last step at the chromosome end, or
    # drop the last start if the steps already reach it.
    chrEndPos = GenomeInfo.getChrLen(tv.genomeAnchor.genome, tv.genomeAnchor.chr) - 1
    if not endList or endList[-1] < chrEndPos:
        endList = endList + [chrEndPos]
    else:
        startList = startList[:-1]

    # Make step-function values as the running sum of the border counts.
    accVal = 0
    for pos in sortedPos:
        accVal += borderDict[pos]
        valList.append(accVal)
        if chrEndPos == pos:
            # No step starts at the chromosome end position.
            valList.pop()

    return TrackView(genomeAnchor=tv.genomeAnchor, startList=np.array(startList),
                     endList=np.array(endList), valList=np.array(valList),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=False)
def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize, func, username, chrList):
    """Preprocess the output track chromosome by chromosome, then finalize it.

    One PreProcessCustomTrackJob is run per whole-chromosome region with
    preProcess=True and finalize=False, followed by a single job over all
    regions with preProcess=False and finalize=True.
    """
    regionList = []
    for chrName in chrList:
        regionList.append(GenomeRegion(genome, chrName, 0, GenomeInfo.getChrLen(genome, chrName)))

    # Keyword arguments shared by every job.
    commonKwArgs = dict(username=username, inTrackName=inTrackName,
                        windowSize=windowSize, func=func)

    for region in regionList:
        preProcJob = PreProcessCustomTrackJob(genome, outTrackName, [region],
                                              cls._getGeSourceForRegion,
                                              preProcess=True, finalize=False,
                                              **commonKwArgs)
        preProcJob.process()

    finalizeJob = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                           cls._getGeSourceForRegion,
                                           preProcess=False, finalize=True,
                                           **commonKwArgs)
    finalizeJob.process()
def __iter__(self):
    """Yield the regions to analyse for the two tracks, chromosome by chromosome.

    Without a bounding region shelve for track 1, each chromosome of the
    extended chromosome list is yielded as a whole. Otherwise the bounding
    regions of track 1, of track 2, or their intersection are yielded,
    depending on which shelves exist and on whether all bounding regions of a
    track are whole chromosomes.
    """
    shelve1 = self._getBoundingRegionShelve(self._trackName1)
    shelve2 = self._getBoundingRegionShelve(self._trackName2)

    wholeChrs1 = False
    if shelve1 is not None:
        wholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(shelve1)

    wholeChrs2 = False
    if shelve2 is not None:
        wholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(shelve2)

    for chrom in GenomeInfo.getExtendedChrList(self.genome):
        if shelve1 is None:
            yield GenomeRegion(self.genome, chrom, 0, GenomeInfo.getChrLen(self.genome, chrom))
            continue

        brList1 = shelve1.getAllBoundingRegionsForChr(chrom)

        if shelve2 is None or (wholeChrs2 and not wholeChrs1):
            regions = brList1
        else:
            brList2 = shelve2.getAllBoundingRegionsForChr(chrom)
            if wholeChrs1 and not wholeChrs2:
                regions = brList2
            else:
                regions = self.getAllIntersectingRegions(self.genome, chrom, brList1, brList2)

        for reg in regions:
            yield reg
def _createPreProcFiles(self):
    """Write the preprocessed files for the current chromosome of the track.

    The collector metadata is first updated from the genome element source.
    Nothing more is done if the collector reports zero elements for the
    chromosome. If the mode is not 'Real', the source is only iterated
    through. Otherwise every element is written to an OutputDirectory and the
    chromosome is registered as preprocessed.
    """
    geSource = self._geSource
    collector = TrackInfoDataCollector(self._genome, self._trackName)

    collector.updateMetaDataForFinalization(
        geSource.getFileSuffix(), geSource.getPrefixList(),
        geSource.getValDataType(), geSource.getValDim(),
        geSource.getEdgeWeightDataType(), geSource.getEdgeWeightDim(),
        geSource.hasUndirectedEdges(), geSource.getVersion(),
        PreProcessUtils.constructId(geSource))

    if collector.getNumElements(self._chr, self._allowOverlaps) == 0:
        return

    if self._mode != 'Real':
        # Only run through the source; nothing is written.
        for _ in geSource:
            pass
        return

    outDir = OutputDirectory(
        createDirPath(self._trackName, self._genome, self._chr, self._allowOverlaps),
        collector.getPrefixList(self._allowOverlaps),
        collector.getNumElements(self._chr, self._allowOverlaps),
        GenomeInfo.getChrLen(self._genome, self._chr),
        collector.getValDataType(), collector.getValDim(),
        collector.getEgdeWeightDataType(), collector.getEgdeWeightDim(),
        collector.getMaxNumEdges(self._chr, self._allowOverlaps),
        collector.getMaxStrLens(self._chr, self._allowOverlaps))

    if geSource.isSliceSource():
        writeFunc = outDir.writeRawSlice
    else:
        writeFunc = outDir.writeElement

    for ge in geSource:
        writeFunc(ge)

    collector.appendPreProcessedChr(self._allowOverlaps, self._chr)
    outDir.close()
def __iter__(self):
    """Yield the regions to analyse for the two tracks, chromosome by chromosome.

    Without a bounding region shelve for track 1, each chromosome of the
    extended chromosome list is yielded as a whole. Otherwise the bounding
    regions of track 1, of track 2, or their intersection are yielded,
    depending on which shelves exist and on whether all bounding regions of a
    track are whole chromosomes.
    """
    shelve1 = self._getBoundingRegionShelve(self._trackName1)
    shelve2 = self._getBoundingRegionShelve(self._trackName2)

    for chrom in GenomeInfo.getExtendedChrList(self.genome):
        if shelve1 is None:
            yield GenomeRegion(self.genome, chrom, 0, GenomeInfo.getChrLen(self.genome, chrom))
            continue

        brList1 = shelve1.getAllBoundingRegions(chrom)

        wholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(shelve1)
        wholeChrs2 = False
        if shelve2 is not None:
            wholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(shelve2)

        if shelve2 is None or (wholeChrs2 and not wholeChrs1):
            regions = brList1
        else:
            brList2 = shelve2.getAllBoundingRegions(chrom)
            if wholeChrs1 and not wholeChrs2:
                regions = brList2
            else:
                regions = self.getAllIntersectingRegions(self.genome, chrom, brList1, brList2)

        for reg in regions:
            yield reg
def execute(cls, choices, galaxyFn=None, username=''):
    """Run the tool: sort the elements of the binning file and print the result.

    choices[0] is the genome, choices[1] tells whether choices[2] refers to a
    history element, and choices[3] is the Galaxy track name of the binning
    file, whose type must be a bed variant or gtrack.
    """
    from quick.application.ExternalTrackManager import ExternalTrackManager

    genome = choices[0]

    if choices[1] == 'history':
        preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
            genome, choices[2].split(':'))
    else:
        preProcTN1 = choices[2].split(':')

    # NOTE(review): chrSizeDict is not used further down in this method.
    chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                        for chrom in GenomeInfo.getChrList(genome)])

    binningTN = choices[3].split(':')
    trackType = binningTN[1]
    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[3].split(':'))

    if trackType in ['valued.bed', 'category.bed', 'bed']:
        unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
    elif trackType == 'gtrack':
        unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
    else:
        raise InvalidFormatError(
            'The Binning must be of the following formats: gtrack, valued.bed, category.bed ,bed ...'
        )

    geSource = GenomeElementSorter(unsortedSource).__iter__()
    cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome, username)
def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self): if self.getFixedGapSize() == 0 and not self._reprIsDense: # If only full chromosomes if all(brt.region.chr in GenomeInfo.getExtendedChrList(self._genome) and \ brt.region.start == 0 and \ brt.region.end == GenomeInfo.getChrLen(self._genome, brt.region.chr) \ for brt in self._boundingRegionTuples): self._boundingRegionTuples = []
def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
    """Assert that the track holds the expected number of elements per chromosome.

    For each chromosome in ``chrElCountDict`` the track view is fetched for
    the region given in ``customBins`` if present, else for the whole
    chromosome, and its element count is compared with the expected count.
    """
    for chrom in chrElCountDict.keys():
        region = customBins[chrom] if chrom in customBins else \
            GenomeRegion(self.GENOME, chrom, 0, GenomeInfo.getChrLen(self.GENOME, chrom))
        tv = self._getTrackView(trackName, region, allowOverlaps)
        numEls = sum(1 for _ in tv)
        self.assertEquals(chrElCountDict[chrom], numEls)
def _checkValidEnd(self, chr, end, start=None): if end < 0: raise InvalidFormatError('Error: end position is negative: %s' % end) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ end-1 > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, end-1, GenomeInfo.getChrLen(self.genome, chr))) if start is not None and end <= start: if not start == end == 1: raise InvalidFormatError( 'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start)) return end
def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize, func, username, chrList):
    """Run one PreProcessCustomTrackJob over the whole-chromosome regions of chrList."""
    regionList = []
    for chrName in chrList:
        chrLen = GenomeInfo.getChrLen(genome, chrName)
        regionList.append(GenomeRegion(genome, chrName, 0, chrLen))

    job = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                   cls._getGeSourceForRegion,
                                   username=username, inTrackName=inTrackName,
                                   windowSize=windowSize, func=func)
    job.process()
def validate_snp(cls, snps, genome):
    """Validate SNP specifications and return a list of error messages.

    Each item of ``snps`` must have five fields: (rsid, chromosome, position,
    reference allele, variant allele). At most one message is added per SNP,
    for the first check that fails. An empty list means all SNPs passed.
    """
    # list(...) so that the concatenation also works when keys() is a view.
    DNA = ['A', 'C', 'G', 'T'] + list(cls.AMBIGUOUS_DNA.keys())
    err = []

    for snp in snps:
        assert len(snp) == 5
        _rsid, _chr, _pos, _ref, _var = snp
        spec = repr(snp)

        if _rsid and not _chr:
            err.append('Invalid RefSNP: rs' + _rsid)
            continue

        if _chr not in GenomeInfo.getChrList(genome):
            err.append(spec + ' Chromosome ' + _chr + ' is not valid')
            continue

        if not _pos.isdigit():
            err.append(spec + ' Position must numeric')
            continue

        # NOTE(review): cannot trigger after the isdigit() check above, since
        # isdigit() is False for strings containing '-'.
        if int(_pos) < 0:
            err.append(spec + ' Position must be higher than 0')
            continue

        chrLen = GenomeInfo.getChrLen(genome, _chr)
        if int(_pos) > chrLen:
            err.append(spec + ' Position is higher than length of %s (%d)' % (_chr, chrLen))
            continue

        ref = VariantMeltingProfile.get_reference_allele(genome, _chr, _pos, len(_ref))
        if _ref != ref:
            err.append(spec + ' Reference allele does not match reference genome, should be: ' + ref)
            continue

        if _ref == 'N':
            err.append(spec + ' Reference allele can not be N')
            continue

        if not _var:
            err.append(spec + ' Variant allele not specified')
            continue

        if not all(v in DNA for v in _var):
            err.append(spec + ' Variant allele ' + _var + " is not valid")
            continue

        # 'in' replaces the deprecated dict.has_key().
        if _var in cls.AMBIGUOUS_DNA and _ref in cls.AMBIGUOUS_DNA[_var]:
            err.append(spec + ' Ambiguous variant allele includes reference')
            continue

    return err
def getNumberElements(genome, trackName):
    """Return the number of track elements per chromosome, in chromosome list order."""
    track = PlainTrack(trackName)
    counts = []
    for chrom in GenomeInfo.getChrList(genome):
        wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
        trackView = track.getTrackView(wholeChr)
        counts.append(len(trackView.startsAsNumpyArray()))
    return counts
def getAnchor(genome, trackName):
    """Return str(genomeAnchor) of the whole-chromosome track view of each chromosome."""
    track = PlainTrack(trackName)
    anchors = []
    for chrom in GenomeInfo.getChrList(genome):
        wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
        trackView = track.getTrackView(wholeChr)
        anchors.append(str(trackView.genomeAnchor))
    return anchors
def createNmerChains(self, n): for chr in GenomeInfo.getChrList(self._genome): print 'Creating chains of nmers of length ', n, ' for chromosome ', chr chrLen = GenomeInfo.getChrLen(self._genome,chr) chrReg = GenomeRegion( self._genome, chr, 0, chrLen ) seqTV = PlainTrack( GenomeInfo.getSequenceTrackName(self._genome) ).getTrackView(chrReg) #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV)) nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray()) SameValueIndexChainsFactory.generate( nmersAsInts, chrLen, 4**n, self._createPath(n), chr )
def isCompBin(region):
    """Return True if ``region`` coincides exactly with one computation bin.

    An iterable is never a computation bin. Otherwise the region must start
    at offset 0 of its bin, and its length must equal the computation bin
    size, or the remainder of the chromosome if that is shorter.
    """
    if isIter(region):
        return False

    binNumber = CompBinManager.getBinNumber(region.start)
    startsOnBinBorder = CompBinManager.getOffset(region.start, binNumber) == 0

    restOfChr = GenomeInfo.getChrLen(region.genome, region.chr) - region.start
    expectedLen = min(CompBinManager.getCompBinSize(), restOfChr)
    hasBinLength = len(region) == expectedLen

    return startsOnBinBorder and hasBinLength
def createNmerChains(self, n): for chr in GenomeInfo.getChrList(self._genome): print 'Creating chains of nmers of length ', n, ' for chromosome ', chr chrLen = GenomeInfo.getChrLen(self._genome, chr) chrReg = GenomeRegion(self._genome, chr, 0, chrLen) seqTV = PlainTrack(GenomeInfo.getSequenceTrackName( self._genome)).getTrackView(chrReg) #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV)) nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray()) SameValueIndexChainsFactory.generate(nmersAsInts, chrLen, 4**n, self._createPath(n), chr)
def extend(self, extensionSize, ensureValidity=True):
    """Extend the region in place and return it.

    A non-negative ``extensionSize`` is added to the end; a negative one is
    added to the start, moving it downwards. With ``ensureValidity`` the
    result is clipped to the range from 0 to the chromosome length.
    """
    if extensionSize < 0:
        self.start += extensionSize
    else:
        self.end += extensionSize

    if not ensureValidity:
        return self

    self.start = max(0, self.start)
    chrLen = GenomeInfo.getChrLen(self.genome, self.chr)
    self.end = min(self.end, chrLen)
    return self
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a history file or a track to galaxyFn.

    choices[0] is the genome, choices[1] selects the input kind ('history' or
    not), choices[2] is the history element and choices[3] the track name.
    For history input the file is first copied to a temporary file, which is
    removed again afterwards.
    """
    from gold.util.RandomUtil import random

    # Context managers make sure that all files are closed also on errors.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        # NOTE(review): histItem and chromRegsPath are not used further down.
        histItem = choices[2]
        trackItem = choices[3]
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)

        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = choices[2].split(':')[1]
            # Random six-letter prefix plus timestamp for the temporary file name.
            username = ''.join([chr(random.randint(97, 122)) for i in range(6)])
            tempFn = createCollectedPath(
                genome, [],
                username + '_'.join([str(v) for v in time.localtime()[:6]]) + '.' + trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))

            with open(fnSource, 'r') as srcFile:
                with open(tempFn, 'w') as tmpFile:
                    tmpFile.write(srcFile.read())

            try:
                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Remove the temporary copy also if processing failed.
                os.remove(tempFn)
        else:
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                plTrack = PlainTrack(trackItem.split(':'))
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                # The header is only written for the first chromosome.
                writeHeaderFlag = False
def getSegmentSizes(genome, trackName):
    """Return the summed segment length (end - start) of the track per chromosome.

    The result has one entry per chromosome, in chromosome list order.
    """
    track = PlainTrack(trackName)
    sumSegmentSize = []
    for chrom in GenomeInfo.getChrList(genome):
        chromLen = GenomeInfo.getChrLen(genome, chrom)
        region = GenomeRegion(genome, chrom, 0, chromLen)
        tv = track.getTrackView(region)
        sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
        # tolist() turns the numpy scalar into a plain Python number.
        sumSegmentSize.append(sizeSegments.sum().tolist())
    return sumSegmentSize
def smoothPoints(genome, inTrackName, windowSize, chr): from gold.extra.SlidingWindow import SlidingWindow from quick.util.GenomeInfo import GenomeInfo from gold.track.Track import PlainTrack from gold.track.GenomeRegion import GenomeRegion #func = lambda x: ( sum( [r.dnorm(i-len(x)/2.0,0,2000)*x[i].end for i in range(len(x)) if x[i]!=None] ) / sum( [r.dnorm(i-len(x)/2.0,0,2000)*1 for i in range(len(x)) if x[i]!=None] ) ) if len([y for y in x if y!=None])>0 else 0 chrReg = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome,chr) ) #chrReg = GenomeElement(genome, chr, 0, 3000) inTrackView = PlainTrack(inTrackName).getTrackView(chrReg) print [x.end() for x in inTrackView] slidingWindows = SlidingWindow(GenomeElementTvWrapper(inTrackView), windowSize) print [x for x in weightedValForWindowsYielder(slidingWindows, windowSize)]
def _createOutputDirectory(self, genome, chr, trackName, allowOverlaps, geSourceManager):
    """Create the OutputDirectory for one chromosome of a track.

    The directory path is built from the track name, genome, chromosome and
    overlap setting; the remaining arguments are taken from the
    geSourceManager and from the chromosome length.
    """
    dirPath = createDirPath(trackName, genome, chr, allowOverlaps)

    from quick.util.GenomeInfo import GenomeInfo

    prefixList = geSourceManager.getPrefixList()
    numElements = geSourceManager.getNumElementsForChr(chr)
    chrLen = GenomeInfo.getChrLen(genome, chr)
    valDataType = geSourceManager.getValDataType()
    valDim = geSourceManager.getValDim()
    edgeWeightDataType = geSourceManager.getEdgeWeightDataType()
    edgeWeightDim = geSourceManager.getEdgeWeightDim()
    maxNumEdges = geSourceManager.getMaxNumEdgesForChr(chr)
    maxStrLens = geSourceManager.getMaxStrLensForChr(chr)
    isSorted = geSourceManager.isSorted()

    return OutputDirectory(dirPath, prefixList, numElements, chrLen,
                           valDataType, valDim,
                           edgeWeightDataType, edgeWeightDim,
                           maxNumEdges, maxStrLens, isSorted)
def __iter__(self):
    """Yield w1 * value1 + w2 * value2 for each position of the chromosome.

    The values are read from the whole-chromosome track views of the two
    tracks; one result is yielded per value of the first track.
    """
    genome, chrom = self.genome, self.chr
    w1, w2 = self.w1, self.w2

    wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))

    vals1 = PlainTrack(self.trackName1).getTrackView(wholeChr).valsAsNumpyArray()
    vals2 = PlainTrack(self.trackName2).getTrackView(wholeChr).valsAsNumpyArray()

    for pos, val1 in enumerate(vals1):
        yield w1*val1 + w2*vals2[pos]
def __iter__(self):
    """Yield the weighted sum of the two tracks' values, position by position.

    Each yielded value is w1 * (value of track 1) + w2 * (value of track 2),
    read from the whole-chromosome track views; one result is yielded per
    value of the first track.
    """
    genome = self.genome
    chrom = self.chr
    weight1, weight2 = self.w1, self.w2

    chrLen = GenomeInfo.getChrLen(genome, chrom)
    wholeChr = GenomeRegion(genome, chrom, 0, chrLen)

    firstTrackView = PlainTrack(self.trackName1).getTrackView(wholeChr)
    vals1 = firstTrackView.valsAsNumpyArray()

    secondTrackView = PlainTrack(self.trackName2).getTrackView(wholeChr)
    vals2 = secondTrackView.valsAsNumpyArray()

    for pos, val1 in enumerate(vals1):
        yield weight1 * val1 + weight2 * vals2[pos]
def __iter__(self):
    """Yield random one-base intervals [pos, pos+1] guided by the track values.

    The values of the whole-chromosome track view are rescaled using their
    minimum and maximum. A position is yielded when a draw from r.runif(1) is
    smaller than factor times its rescaled value.
    """
    from gold.application.RSetup import r

    genome, chrom = self.genome, self.chr
    factor = self.factor

    wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
    vals = PlainTrack(self.trackName1).getTrackView(wholeChr).valsAsNumpyArray()

    # Scale between 0 and 1.
    # NOTE(review): the scale factor is undefined when all values are equal
    # (maxVal == minVal) - confirm that this cannot occur for the input tracks.
    minVal, maxVal = vals.min(), vals.max()
    scaledVals = (vals - minVal) * (1/(maxVal-minVal))

    for pos, scaledVal in enumerate(scaledVals):
        if r.runif(1) < factor*scaledVal:
            yield [pos, pos+1]
def getGenomicElements(genome, trackName):
    """Return [chrom, start, end] for every element of the track.

    The elements are collected chromosome by chromosome, in chromosome list
    order, from the whole-chromosome track views.
    """
    track = PlainTrack(trackName)
    genElements = []
    for chrom in GenomeInfo.getChrList(genome):
        chromLen = GenomeInfo.getChrLen(genome, chrom)
        region = GenomeRegion(genome, chrom, 0, chromLen)
        tv = track.getTrackView(region)
        # Extend in place: building a new list with '+' for each element
        # copies the whole result every time (quadratic in the element count).
        genElements.extend([chrom, el.start(), el.end()] for el in tv)
    return genElements

#print numpy.version.version # 1.7.1 !!
#unique, counts = numpy.unique(segmentSize, return_counts=True) # This is for numpy 1.9
#print numpy.asarray((unique, counts)).T

# NOTE(review): the line below opened a triple-quoted block of disabled code
# that is cut off here; it is kept as a comment - confirm against the full file.
#'''track.setFormatConverter('SegmentToMidPointFormatConverter')
def nextBin(self):
    """Yield the bins of each region of the user bin source.

    A region starts at its own start (0 if missing) and ends at the smaller
    of its own end and the chromosome length, ignoring whichever is not
    available. Without a bin length the clipped region is yielded as one bin;
    otherwise it is split into consecutive bins of at most that length.
    """
    for region in self._userBinSource:
        binStart = 0 if region.start is None else region.start

        chrLen = None
        if region.genome is not None:
            chrLen = GenomeInfo.getChrLen(region.genome, region.chr)

        endCandidates = [pos for pos in (region.end, chrLen) if pos is not None]
        regEnd = min(endCandidates)

        if self._binLen is None:
            yield GenomeRegion(region.genome, region.chr, binStart, regEnd)
            continue

        while binStart < regEnd:
            binEnd = min(binStart + self._binLen, regEnd)
            yield GenomeRegion(region.genome, region.chr, binStart, binEnd)
            binStart += self._binLen
def __iter__(self):
    """Yield random one-base intervals [pos, pos + 1] guided by the track values.

    The values of the whole-chromosome track view are rescaled using their
    minimum and maximum. A position is yielded when a draw from r.runif(1) is
    smaller than factor times its rescaled value.
    """
    from proto.RSetup import r

    genome = self.genome
    chrom = self.chr
    factor = self.factor

    chrLen = GenomeInfo.getChrLen(genome, chrom)
    wholeChr = GenomeRegion(genome, chrom, 0, chrLen)
    trackView = PlainTrack(self.trackName1).getTrackView(wholeChr)
    vals = trackView.valsAsNumpyArray()

    # Scale between 0 and 1.
    # NOTE(review): the scale factor is undefined when all values are equal
    # (maxVal == minVal) - confirm that this cannot occur for the input tracks.
    minVal, maxVal = vals.min(), vals.max()
    scaledVals = (vals - minVal) * (1 / (maxVal - minVal))

    for pos, scaledVal in enumerate(scaledVals):
        if r.runif(1) < factor * scaledVal:
            yield [pos, pos + 1]
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """Store the bounding regions of a track in a BoundingRegionShelve.

    Bounding regions without a chromosome are ignored. If none remain, one
    whole-chromosome region is created per preprocessed chromosome. After
    storing, the total element count of the shelve is checked against the
    collector for every chromosome involved; a mismatch raises
    ShouldNotOccurError.
    """
    collector = TrackInfoDataCollector(genome, trackName)
    geChrList = collector.getPreProcessedChrs(allowOverlaps)

    boundingRegionTuples = []
    for brTuple in collector.getBoundingRegionTuples(allowOverlaps):
        if brTuple.region.chr is not None:
            boundingRegionTuples.append(brTuple)

    if len(boundingRegionTuples) == 0:
        for chrom in geChrList:
            wholeChr = GenomeRegion(chr=chrom, start=0, end=GenomeInfo.getChrLen(genome, chrom))
            boundingRegionTuples.append(
                BoundingRegionTuple(wholeChr, collector.getNumElements(chrom, allowOverlaps)))

    brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    brShelve.storeBoundingRegions(boundingRegionTuples, geChrList,
                                  not collector.getTrackFormat().reprIsDense())

    chrsToCheck = set([br.region.chr for br in boundingRegionTuples]) | set(geChrList)
    for chrom in chrsToCheck:
        if brShelve.getTotalElementCount(chrom) != collector.getNumElements(chrom, allowOverlaps):
            raise ShouldNotOccurError("Error: The total element count for all bounding regions of chromosome '%s' is not equal to the number of genome elements of that chromosome. %s != %s" % \
                                      (chrom, brShelve.getTotalElementCount(chrom), collector.getNumElements(chrom, allowOverlaps)) )
def getBoundingRegionTuples(self):
    """Return the bounding region tuples that have a chromosome.

    If there are none, one whole-chromosome tuple is created per chromosome
    of the source, and ``_boundingRegionsAndGEsCorrespond`` is set to False.
    Otherwise it is set to True.
    """
    explicitTuples = []
    for brTuple in self._getBoundingRegionTuples():
        if brTuple.region.chr is not None:
            explicitTuples.append(brTuple)

    if len(explicitTuples) > 0:
        self._boundingRegionsAndGEsCorrespond = True
        return explicitTuples

    from gold.origdata.GenomeElementSource import BoundingRegionTuple
    from gold.track.GenomeRegion import GenomeRegion
    from quick.util.GenomeInfo import GenomeInfo

    geChrList = self.getAllChrs()
    wholeChrTuples = []
    for chrom in geChrList:
        wholeChr = GenomeRegion(chr=chrom, start=0,
                                end=GenomeInfo.getChrLen(self._geSource.genome, chrom))
        wholeChrTuples.append(BoundingRegionTuple(wholeChr, self.getNumElementsForChr(chrom)))

    self._boundingRegionsAndGEsCorrespond = False
    return wholeChrTuples
def _createNmerTrack(self, nmerList, lowerOrder=None): nmerLengths = list(set([len(nmer) for nmer in nmerList])) assert len(nmerLengths)==1 chainOrder = lowerOrder if lowerOrder is not None else nmerLengths[0] regionList = [GenomeRegion(self._genome, chr, 0, GenomeInfo.getChrLen(self._genome, chr) ) for chr in GenomeInfo.getChrList(self._genome)] for region in regionList: print '|', chains = SameValueIndexChainsFactory.load(self._createPath(chainOrder), region.chr) for nmer in nmerList: if len(nmerList) > 1: print '.', if lowerOrder is not None: nmerPrefix = nmer[0:chainOrder] rawIndexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmerPrefix)) indexGenerator = LowerOrderChainWrapper(rawIndexGenerator, nmerPrefix, nmer, self._genome, region.chr) else: indexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmer)) #print 'Length of lower order chain: %i and %i' % (sum(1 for x in indexGenerator), sum(1 for x in indexGenerator)) #print 'Length of wrapped chain: %i and %i' % (sum(1 for x in wrappedIndexGenerator), sum(1 for x in wrappedIndexGenerator)) PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [region], \ self._getNmerGeSourceForChr, finalize=False, preProcess=True, \ indexGenerator=indexGenerator).process() for nmer in nmerList: try: PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), regionList, \ self._getNmerGeSourceForChr, preProcess=False, finalize=True, \ indexGenerator=[0]).process() except EmptyGESourceError: PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [GenomeRegion(self._genome, regionList[0].chr, -1, 0)], \ self._getNmerGeSourceForChr, preProcess=True, finalize=True, \ indexGenerator=[-1]).process() return
def nextBin(self):
    """Yield the bins of each region of the user bin source.

    A region starts at its own start (0 if missing) and ends at the smaller
    of its own end and the chromosome length, ignoring whichever is not
    available. Without a bin length the clipped region is yielded as one bin;
    otherwise it is split into consecutive bins of at most that length.
    """
    binLen = self._binLen
    for region in self._userBinSource:
        if region.start is not None:
            curStart = region.start
        else:
            curStart = 0

        if region.genome is not None:
            chrLen = GenomeInfo.getChrLen(region.genome, region.chr)
        else:
            chrLen = None

        regEnd = min([bound for bound in [region.end, chrLen] if bound is not None])

        if binLen is None:
            yield GenomeRegion(region.genome, region.chr, curStart, regEnd)
        else:
            while curStart < regEnd:
                yield GenomeRegion(region.genome, region.chr, curStart,
                                   min(curStart + binLen, regEnd))
                curStart += binLen
def createChromosomeFile(genome, chromNames, referToCollected=False): """genome chromNames""" # python quick/extra/CustomFuncCatalog.py CreateChromosomeFile mm9 'chr1, chr2, ...'" chrList = chromNames.replace(' ','').split(',') if referToCollected: from gold.util.CommonFunctions import createCollectedPath basePath = createCollectedPath(genome, GenomeInfo.getChrTrackName(genome)) else: basePath = gcf.createOrigPath(genome, GenomeInfo.getChrTrackName(genome)) # Why is this file a category.bed file? outFn = basePath + os.sep + 'chromosomes.category.bed' qcf.ensurePathExists(outFn) print 'Creating: ' + outFn outFile = open(outFn, 'w') for chr in chrList: outFile.write('\t'.join([chr, '0', str(GenomeInfo.getChrLen(genome, chr)), chr]) + os.linesep) outFile.close()
def execute(cls, choices, galaxyFn=None, username=''):
    """Run the tool: sort the elements of the binning file and print the result.

    choices[0] is the genome, choices[1] tells whether choices[2] refers to a
    history element, and choices[3] is the Galaxy track name of the binning
    file, whose type must be a bed variant or gtrack.
    """
    from quick.application.ExternalTrackManager import ExternalTrackManager

    genome = choices[0]

    if choices[1] == 'History':
        preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
            genome, choices[2].split(':'))
    else:
        preProcTN1 = choices[2].split(':')

    # NOTE(review): chrSizeDict is not used further down in this method.
    chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                        for chrom in GenomeInfo.getChrList(genome)])

    binningTN = choices[3].split(':')
    trackType = binningTN[1]
    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[3].split(':'))

    if trackType in ['marked.bed', 'category.bed', 'bed']:
        unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
    elif trackType == 'gtrack':
        unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
    else:
        raise InvalidFormatError('The Binning must be of the following formats: gtrack, marked.bed, category.bed ,bed ...')

    geSource = GenomeElementSorter(unsortedSource).__iter__()
    cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome, username)
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a history file or a track to galaxyFn.

    choices[0] is the genome, choices[1] selects the input kind ('history' or
    not), choices[2] is the history element and choices[3] the track name.
    For history input the file is first copied to a run-specific temporary
    file, which is removed again afterwards.
    """
    # Context managers make sure that all files are closed also on errors.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        # NOTE(review): histItem and chromRegsPath are not used further down.
        histItem = choices[2]
        trackItem = choices[3]
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)

        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = choices[2].split(':')[1]
            from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
            tempFn = GalaxyRunSpecificFile(['fromHistory.'+trackType], galaxyFn).getDiskPath(True)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))

            with open(fnSource, 'r') as srcFile:
                with open(tempFn, 'w') as tmpFile:
                    tmpFile.write(srcFile.read())

            try:
                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(
                        GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Remove the temporary copy also if processing failed.
                os.remove(tempFn)
        else:
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                plTrack = PlainTrack(trackItem.split(':'))
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                # The header is only written for the first chromosome.
                writeHeaderFlag = False
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a history file or a track to galaxyFn.

    choices[0] is the genome, choices[1] selects the input kind ('History' or
    not), choices[2] is the history element and choices[3] the track name.
    For history input the file is first copied to a temporary file, which is
    removed again afterwards.
    """
    # Context managers make sure that all files are closed also on errors.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        # NOTE(review): histItem and chromRegsPath are not used further down.
        histItem = choices[2]
        trackItem = choices[3]
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)

        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'History':
            trackType = choices[2].split(':')[1]
            # Random six-letter prefix plus timestamp for the temporary file name.
            username = ''.join([chr(random.randint(97,122)) for i in range(6)])
            tempFn = createCollectedPath(
                genome, [],
                username+'_'.join([str(v) for v in time.localtime()[:6]])+'.'+trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))

            with open(fnSource, 'r') as srcFile:
                with open(tempFn, 'w') as tmpFile:
                    tmpFile.write(srcFile.read())

            try:
                if trackType in ['marked.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(
                        GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Remove the temporary copy also if processing failed.
                os.remove(tempFn)
        else:
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                plTrack = PlainTrack(trackItem.split(':'))
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                # The header is only written for the first chromosome.
                writeHeaderFlag = False
def execute(cls, choices, galaxyFn=None, username=''):
    '''Is called when execute-button is pushed by web-user.

    Selects windows from the history item ``choices.css`` and writes them,
    merged into regions, as a GTrack segment file to ``galaxyFn``. Progress
    messages are printed to standard out.

    Attributes read from ``choices``:
        genome  -- genome used to look up chromosome lengths
        css     -- colon-separated history item holding the window data
        filter  -- "FDR p-value treshold" selects FDR filtering; any other
                   value selects the top-scoring windows
        fdr     -- FDR level (FDR filtering only)
        numtop  -- number of top-scoring windows (otherwise)
        wsize   -- window size

    Selection:
        * FDR filtering walks the p-values from largest to smallest with a
          rank k starting at n and keeps every window whose p-value is at
          most the first p-value found with p <= k/n * FDR. If none
          qualifies, "NONE found" is written and the method returns.
        * Otherwise every window scoring at least as high as the
          ``numtop``-th highest score is kept.

    Merging: a new region starts when the chromosome changes or when the
    window position minus the window size exceeds the previous window
    position. A region line is the address string of its first window
    followed by the last window position plus the window size, capped at
    the chromosome length minus one.
    '''
    genome = choices.genome
    infile = choices.css
    fdr_filtering = (choices.filter == "FDR p-value treshold")
    if fdr_filtering:
        FDR = float(choices.fdr)
    else:
        num_top = int(choices.numtop)
    windowSize = int(choices.wsize)

    inFn = ExternalTrackManager.extractFnFromGalaxyTN(infile.split(":"))
    with open(inFn, "r") as inputFile:
        data = inputFile.read()
    scores, p, addr, windows = cls.preProcessPvalues(data, 2, 3)

    with open(galaxyFn, "w") as outfile:
        addrs = numpy.array(addr)

        if fdr_filtering:
            # argsort is ascending; [::-1] makes the walk go from the
            # largest p-value to the smallest.
            psorted = numpy.argsort(p)[::-1]
            k = float(len(p))
            n = k
            testp = 0
            # Benjamini-Hochberg procedure
            for pi in psorted:
                if p[pi] <= k / n * FDR:
                    testp = p[pi]
                    break
                k -= 1
            # k reaches 0 only when no p-value passed its threshold.
            if k == 0:
                print("NONE FOUND")
                outfile.write("NONE found")
                return
            print("Pval found: %s" % (testp,))
            filteredaddrs = addrs[p <= testp]
        else:
            scoresorted = numpy.argsort(scores)[::-1]
            scorelimit = scores[scoresorted[num_top - 1]]
            filteredaddrs = addrs[scores >= scorelimit]

        headers = "##gtrack version: 1.0\n##track type: segments\n##uninterrupted data lines: true\n" + \
                  "##no overlapping elements: true\n###seqid\tstart\tend\n"
        outfile.write(headers)

        curchrom = ""
        start = ""
        end = sys.maxint
        prevAddr = -1000000.
        for addr in filteredaddrs:
            addrList = addr.split("\t")
            position = int(addrList[1])
            if addrList[0] != curchrom or position - windowSize > prevAddr:
                if curchrom != "":
                    # Close the region that was open until now.
                    newend = prevAddr + windowSize if prevAddr + windowSize < end else end
                    outfile.write(start + "\t" + str(newend) + "\n")
                start = addr
                curchrom = addrList[0]
                end = int(GenomeInfo.getChrLen(genome, curchrom)) - 1
            prevAddr = position

        # Close the last open region; with no selected windows there is none.
        if len(filteredaddrs) > 0:
            newend = prevAddr + windowSize if prevAddr + windowSize < end else end
            outfile.write(start + "\t" + str(newend) + "\n")

        print("Number of regions found %s" % (len(filteredaddrs),))
        if fdr_filtering:
            print("False discoveries %s" % (testp * windows,))
def storeShelve(genome, brTuples, sparse=True):
    """Validate bounding regions and store them in '/tmp/brshelve.shelve'.

    brTuples is iterated once; every item must offer ``region`` (with
    ``chr``, ``start``, ``end``) and ``elCount``. Regions must be sorted,
    non-overlapping and of positive length, otherwise InvalidFormatError is
    raised. One BoundingRegionInfo is stored per region, keyed by chromosome
    and then by region start.

    With sparse=False each region gets its own element index range and no
    bin indices. With sparse=True every region of a chromosome gets that
    chromosome's element index range and bin index range.
    """
    shelf = shelve.open('/tmp/brshelve.shelve', 'c', writeback=True)
    prevRegion = None
    firstElIdxOfChr = OrderedDict()
    endElIdxOfChr = OrderedDict()
    elCountSoFar = 0

    for br in brTuples:
        if prevRegion is not None:
            if br.region < prevRegion:
                raise InvalidFormatError("Error: bounding regions are unsorted: %s > %s. The Genomic HyperBrowser preprocessor requires sorted bounding regions." % (prevRegion, br.region))
            if prevRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (prevRegion, br.region))

        # NOTE(review): applied to every region, the first one included --
        # confirm against the unflattened source.
        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)

        entersNewChr = prevRegion is None or br.region.chr != prevRegion.chr
        if entersNewChr:
            shelf[br.region.chr] = sorteddict()
            if sparse:
                firstElIdxOfChr[br.region.chr] = elCountSoFar

        if sparse:
            startIdx, endIdx = None, None
        else:
            startIdx, endIdx = elCountSoFar, elCountSoFar + br.elCount
        elCountSoFar += br.elCount
        endElIdxOfChr[br.region.chr] = elCountSoFar

        shelf[br.region.chr][br.region.start] = BoundingRegionInfo(
            br.region.start, br.region.end, startIdx, endIdx, None, None)
        prevRegion = br.region

    if sparse:
        # Second pass: replace the per-region placeholders by
        # chromosome-wide element and bin index ranges.
        binCountSoFar = 0
        for chrom in firstElIdxOfChr:
            chrLen = GenomeInfo.getChrLen(genome, chrom)
            binsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for regionStart in shelf[chrom].keys():
                firstBinIdx = binCountSoFar
                endBinIdx = binCountSoFar + binsInChr
                stored = shelf[chrom][regionStart]
                shelf[chrom][regionStart] = BoundingRegionInfo(
                    stored.start, stored.end,
                    firstElIdxOfChr[chrom], endElIdxOfChr[chrom],
                    firstBinIdx, endBinIdx)
            binCountSoFar += binsInChr

    shelf.sync()
def dummygetChromosomlength(a, b):
    """Return GenomeInfo.getChrLen(a, b); a plain pass-through wrapper."""
    chrLen = GenomeInfo.getChrLen(a, b)
    return chrLen
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """Validate the bounding regions and store them in the shelve file self._fn.

    boundingRegionTuples -- iterable whose items offer ``region`` (with
        ``chr``, ``start``, ``end``) and ``elCount``.
    genomeElementChrList -- chromosomes for which elements exist.
    sparse -- True or False (asserted). If False, every region must be
        exactly as long as its element count and gets its own element
        index range. If True, all regions of a chromosome get that
        chromosome's element index range and bin index range.

    Raises InvalidFormatError if regions of one chromosome are not grouped
    together, are unsorted, overlap, adjoin or lack positive length; if a
    dense region's length differs from its element count; if (sparse) a
    chromosome absent from genomeElementChrList has a positive element
    count; or if a chromosome with elements has no bounding region.

    After writing, the method waits until self.fileExists() is true.
    """
    assert sparse in [False, True]
    # chr -> OrderedDict(region start -> BoundingRegionInfo), in input order.
    tempContents = OrderedDict()
    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    # Running element count at the start / after the end of each chromosome
    # (only filled in when sparse).
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0
    totBinCount = 0
    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First region of a chromosome; it must not have been seen before.
            if br.region.chr in tempContents:
                raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)
            lastRegion = None
            tempContents[br.region.chr] = OrderedDict() #sorteddict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            # Same chromosome as the previous region: check order and gaps.
            if br.region < lastRegion:
                raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))
        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)
        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))
        # Dense regions get their own element index range right away; sparse
        # ones are filled in by the second pass below.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount
        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)
        lastRegion = br.region
    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]
                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # No elements exist for this chromosome, so a positive
                    # element count is an error; store all-zero indices.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)
            # Only chromosomes that have elements advance the bin counter.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr
    # NOTE(review): placed at function level, i.e. checked for sparse and
    # dense tracks alike -- confirm against the unflattened source.
    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(genomeElementChrs - set(tempContents.keys())))
    ensurePathExists(self._fn)
    # Replace each per-chromosome dict by a BrInfoHolder of (starts, infos).
    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))
    brShelve = safeshelve.open(self._fn)
    brShelve.update(tempContents)
    brShelve.close()
    # Poll every 0.2 s, logging each time, until self.fileExists() is true.
    while not self.fileExists():
        from gold.application.LogSetup import logMessage
        logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """Validate the bounding regions and store them in the shelve file self._fn.

    boundingRegionTuples -- iterable whose items offer ``region`` (with
        ``chr``, ``start``, ``end``) and ``elCount``.
    genomeElementChrList -- chromosomes for which elements exist.
    sparse -- True or False (asserted). If False, every region must be
        exactly as long as its element count and gets its own element
        index range. If True, all regions of a chromosome get that
        chromosome's element index range and bin index range.

    Raises InvalidFormatError if regions of one chromosome are not grouped
    together, are unsorted, overlap, adjoin or lack positive length; if a
    dense region's length differs from its element count; if (sparse) a
    chromosome absent from genomeElementChrList has a positive element
    count; or if a chromosome with elements has no bounding region.

    The shelve is opened with flag 'c' and protocol self.PROTOCOL. After
    writing, the method waits until self.fileExists() is true.
    """
    assert sparse in [False, True]
    # chr -> OrderedDict(region start -> BoundingRegionInfo), in input order.
    tempContents = OrderedDict()
    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    # Running element count at the start / after the end of each chromosome
    # (only filled in when sparse).
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0
    totBinCount = 0
    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First region of a chromosome; it must not have been seen before.
            if br.region.chr in tempContents:
                raise InvalidFormatError(
                    "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)."
                    % br.region)
            lastRegion = None
            tempContents[br.region.chr] = OrderedDict() #sorteddict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            # Same chromosome as the previous region: check order and gaps.
            if br.region < lastRegion:
                raise InvalidFormatError(
                    "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s."
                    % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' overlap."
                    % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError(
                    "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
                    % (lastRegion, br.region))
        if len(br.region) < 1:
            raise InvalidFormatError(
                "Error: bounding region '%s' does not have positive length."
                % br.region)
        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError(
                "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s"
                % (br.region, len(br.region), br.elCount))
        # Dense regions get their own element index range right away; sparse
        # ones are filled in by the second pass below.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount
        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
            br.region.start, br.region.end, startIdx, endIdx, 0, 0)
        lastRegion = br.region
    if sparse:
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(
                GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]
                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # No elements exist for this chromosome, so a positive
                    # element count is an error; store all-zero indices.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError(
                            "Error: bounding region '%s' has incorrect element count: %s > 0"
                            % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(
                        brInfo.start, brInfo.end, 0, 0, 0, 0)
            # Only chromosomes that have elements advance the bin counter.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr
    # NOTE(review): placed at function level, i.e. checked for sparse and
    # dense tracks alike -- confirm against the unflattened source.
    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError(
            'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s'
            % ', '.join(genomeElementChrs - set(tempContents.keys())))
    ensurePathExists(self._fn)
    # Replace each per-chromosome dict by a BrInfoHolder of (starts, infos).
    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))
    brShelve = safeshelve.open(self._fn, 'c', protocol=self.PROTOCOL)
    brShelve.update(tempContents)
    brShelve.close()
    # Poll every 0.2 s, logging each time, until self.fileExists() is true.
    while not self.fileExists():
        from gold.application.LogSetup import logMessage
        logMessage(
            "Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def execute(cls, choices, galaxyFn=None, username=''):
    '''
    Is called when execute-button is pushed by web-user.

    For every chromosome and every category value found in the input track,
    writes one tab-separated line to galaxyFn: chromosome, the smallest
    start seen for that category, the largest end seen plus one (the start
    is used for elements without an end), and the category.

    choices is a list of selections made by web-user in each options box:
        choices[0]  genome
        choices[1]  'From Hyperbrowser repository' to read a repository
                    track; anything else reads a history item
        choices[2]  colon-separated track name / history item
        choices[3]  'Yes' to allow the resulting regions to overlap

    When overlaps are not allowed and two categories on a chromosome have
    partially overlapping regions, the output file is overwritten with an
    error message that lists the conflicting pairs.
    '''
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.origdata.BedGenomeElementSource import BedCategoryGenomeElementSource
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.TrackGenomeElementSource import TrackGenomeElementSource
    from gold.track.GenomeRegion import GenomeRegion
    from quick.util.GenomeInfo import GenomeInfo
    from collections import defaultdict

    genome = choices[0]
    track = choices[2].split(':')
    allowOverlaps = choices[3] == 'Yes'

    # One region per chromosome, spanning the whole chromosome.
    regionList = []
    for chrom in GenomeInfo.getChrList(genome):
        chromSize = GenomeInfo.getChrLen(genome, chrom)
        regionList.append(GenomeRegion(genome, chrom, 0, chromSize))

    if choices[1] == 'From Hyperbrowser repository':
        geSource = TrackGenomeElementSource(genome, track, regionList)
    else:
        fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(track)
        fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
        if fileType == 'category.bed':
            geSource = BedCategoryGenomeElementSource(fn)
        else:
            geSource = GtrackGenomeElementSource(fn)

    # chromosome -> category -> smallest start / largest end seen so far.
    resultMinDict = defaultdict(dict)
    resultMaxDict = defaultdict(dict)
    for ge in geSource:
        minOfChr = resultMinDict[ge.chr]
        maxOfChr = resultMaxDict[ge.chr]
        # Elements without an end contribute their start as upper bound.
        upperCandidate = ge.end if ge.end else ge.start
        if ge.val in maxOfChr:
            if maxOfChr[ge.val] < upperCandidate:
                maxOfChr[ge.val] = upperCandidate
            if minOfChr[ge.val] > ge.start:
                minOfChr[ge.val] = ge.start
        else:
            maxOfChr[ge.val] = upperCandidate
            minOfChr[ge.val] = ge.start

    catsConflicting = []
    with open(galaxyFn, 'w') as utfil:
        for chrom in sorted(resultMinDict.keys()):
            minOfChr = resultMinDict[chrom]
            maxOfChr = resultMaxDict[chrom]
            for category in minOfChr.keys():
                lower, upper = minOfChr[category], maxOfChr[category]
                if not allowOverlaps:
                    for cat in minOfChr:
                        if cat == category:
                            continue
                        l, u = minOfChr[cat], maxOfChr[cat]
                        # Disjoint regions never conflict.
                        if l >= upper or u <= lower:
                            continue
                        # Reported unless the other region fully covers
                        # this one.
                        if l > lower or u < upper:
                            catsConflicting.append(
                                '(Category: %s, Region: %i - %i) vs. (Category: %s, Region: %i - %i)'
                                % (category, lower, upper, cat, l, u))
                utfil.write('\t'.join([chrom, str(lower), str(upper + 1), category]) + '\n')

    if catsConflicting:
        # Replace the regular output by the list of conflicts.
        with open(galaxyFn, 'w') as errorFile:
            errorFile.write(
                'Error: overlapping resulting regions are not allowed with selected preferences:\n'
                + '\n'.join(catsConflicting))
def execute(cls, choices, galaxyFn=None, username=""):
    """Is called when execute-button is pushed by web-user.

    Reads window data from the history item choices[1], keeps the windows
    whose value is at least a computed limit, merges them into regions and
    writes those as a GTrack segment file to galaxyFn. Diagnostic values
    are printed to standard out.

    choices is a list of selections made by web-user in each options box:
        choices[0]  genome, used to look up chromosome lengths
        choices[1]  colon-separated history item holding the window data
        choices[2]  window size
        choices[3]  quantile passed to stats.norm.ppf
        choices[4]  percentile passed to stats.scoreatpercentile

    The limit is cmedian(values from column 2) + norm.ppf(choices[3]) *
    scoreatpercentile(values from column 3, choices[4]).

    Merging: a new region starts when the chromosome changes or when the
    window position minus the window size exceeds the previous window
    position. A region line is the address string of its first window
    followed by the last window position plus the window size, capped at
    the chromosome length minus one. If no window reaches the limit, only
    the header is written.
    """
    print("Executing...")
    genome = choices[0]
    infile = choices[1]
    windowSize = int(choices[2])
    normquantile = float(choices[3])
    percentile = float(choices[4])

    inFn = ExternalTrackManager.extractFnFromGalaxyTN(infile.split(":"))
    with open(inFn, "r") as inputFile:
        data = inputFile.read()
    fetVals, addr = cls.preProcessPvalues(data, 2)
    stddevs, addr = cls.preProcessPvalues(data, 3)

    with open(galaxyFn, "w") as output:
        # Tuva changed sorted elms to FALSE
        output.write(
            "##gtrack version: 1.0\n" +
            "##track type: segments\n" +
            "##uninterrupted data lines: true\n" +
            "##sorted elements: false\n" +
            "##no overlapping elements: true\n" +
            "###seqid\tstart\tend\n"
        )

        # Calculate limit for FET:
        m = stats.cmedian(fetVals)
        upperquant = stats.scoreatpercentile(stddevs, percentile)
        qnorm = stats.norm.ppf(normquantile)
        limit = m + qnorm * upperquant

        print("Windows found %s" % (sum(fetVals >= limit),))
        print("percentile %s normquantile %s" % (percentile, normquantile))
        print("mean %s upperquant %s qnorm %s" % (m, upperquant, qnorm))
        print("Limit %s" % (limit,))

        addrs = numpy.array(addr)
        filteredaddrs = addrs[fetVals >= limit]
        print(GenomeInfo.getChrList(genome))

        curchrom = ""
        start = ""
        end = sys.maxint
        prevAddr = -1000000.0
        for addr in filteredaddrs:
            addrList = addr.split("\t")
            position = int(addrList[1])
            if addrList[0] != curchrom or position - windowSize > prevAddr:
                if curchrom != "":
                    # Close the region that was open until now.
                    newend = prevAddr + windowSize if prevAddr + windowSize < end else end
                    output.write(start + "\t" + str(newend) + "\n")
                start = addr
                curchrom = addrList[0]
                end = int(GenomeInfo.getChrLen(genome, curchrom)) - 1
            prevAddr = position

        # Close the last open region; with no selected windows there is
        # none, and writing one would emit a line with an empty seqid.
        if len(filteredaddrs) > 0:
            newend = prevAddr + windowSize if prevAddr + windowSize < end else end
            output.write(start + "\t" + str(newend) + "\n")
def _getChrLen(self):
    """Return GenomeInfo.getChrLen for this object's genome and chr."""
    genome, chrom = self.genome, self.chr
    return GenomeInfo.getChrLen(genome, chrom)
def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize, func, username, chrList):
    """Build one whole-chromosome region per entry of chrList and run a
    PreProcessCustomTrackJob over them.

    The job is given cls._getGeSourceForRegion as its element source
    factory; inTrackName, windowSize, func and username are passed on as
    keyword arguments.
    """
    regionList = []
    for chrom in chrList:
        chrLen = GenomeInfo.getChrLen(genome, chrom)
        regionList.append(GenomeRegion(genome, chrom, 0, chrLen))

    job = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                   cls._getGeSourceForRegion,
                                   username=username, inTrackName=inTrackName,
                                   windowSize=windowSize, func=func)
    job.process()