def _checkValidStart(self, chr, start):
     if start < 0:
         raise InvalidFormatError('Error: start position is negative: %s' % start)
 
     if self.genome and \
         GenomeInfo.isValidChr(self.genome, chr) and \
             start > GenomeInfo.getChrLen(self.genome, chr):
                 raise InvalidFormatError('Error: start position is larger than chromosome size (%s) < %d' % \
                                          (GenomeInfo.getChrLen(self.genome, chr), start))
     return start
    def _checkValidStart(self, chr, start):
        if start < 0:
            raise InvalidFormatError('Error: start position is negative: %s' %
                                     start)

        if self.genome and \
            GenomeInfo.isValidChr(self.genome, chr) and \
                start > GenomeInfo.getChrLen(self.genome, chr):
            raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \
                                     (chr, start, GenomeInfo.getChrLen(self.genome, chr)))
        return start
 def _checkValidEnd(self, chr, end, start=None):
     if end < 0:
         raise InvalidFormatError('Error: end position is negative: %s' % end)
     
     if self.genome and \
         GenomeInfo.isValidChr(self.genome, chr) and \
             end-1 > GenomeInfo.getChrLen(self.genome, chr):
                 raise InvalidFormatError('Error: end position is larger than chromosome size (%s)' % \
                                          GenomeInfo.getChrLen(self.genome, chr))
     if start is not None and end <= start:
             raise InvalidFormatError('Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start))
     
     return end
 def _getBoundingRegionTupleList(self, case, sortedAssertElList):
     """Build the expected list of BoundingRegionTuple objects for a test case.

     If ``case.boundingRegionsAssertList`` holds entries with a chromosome,
     one tuple is built per entry (in sorted order); a missing start becomes
     0 and a missing end becomes the chromosome length. Otherwise one
     whole-chromosome tuple is built per distinct chromosome found in
     ``sortedAssertElList``, with the number of elements on that chromosome
     as its element count.
     """
     explicitBrs = [br for br in sorted(case.boundingRegionsAssertList) if br.region.chr is not None]

     if len(explicitBrs) > 0:
         brTuples = []
         for br in explicitBrs:
             regStart = br.region.start if br.region.start is not None else 0
             if br.region.end is not None:
                 regEnd = br.region.end
             else:
                 regEnd = GenomeInfo.getChrLen(self.GENOME, br.region.chr)
             region = GenomeRegion(self.GENOME, chr=br.region.chr, start=regStart, end=regEnd)
             brTuples.append(BoundingRegionTuple(region, br.elCount))
         return brTuples

     # No explicit bounding regions: fall back to one full-chromosome region
     # per chromosome, in sorted chromosome order.
     elChrs = [ge.chr for ge in sortedAssertElList]
     elCountPerChr = OrderedDict()
     for chrName in sorted(set(elChrs)):
         elCountPerChr[chrName] = elChrs.count(chrName)

     brTuples = []
     for chrName, elCount in elCountPerChr.iteritems():
         region = GenomeRegion(self.GENOME, chr=chrName, start=0,
                               end=GenomeInfo.getChrLen(self.GENOME, chrName))
         brTuples.append(BoundingRegionTuple(region, elCount))
     return brTuples
Exemple #5
0
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'"""
    # NOTE(review): the docstring above is kept verbatim; it reads like a
    # usage string and may be consumed by tooling - confirm before rewording.
    #
    # Writes 'assemblyGaps.bed' under the original-data path of the genome's
    # 'gaps' property track. Every maximal run of sequence positions whose
    # character is not in assemblyChars becomes one tab-separated line:
    # chromosome, run start, run end (end-exclusive).
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))

    anyGaps = False
    # The with-block guarantees the file is closed even if reading a
    # chromosome fails half-way through.
    with open(outFn, 'w') as outFile:
        for chr in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr))
            seqTV = seqTrack.getTrackView(chrRegion)
            seq = seqTV.valsAsNumpyArray()

            # Indexes of all positions that hold none of the assembly characters.
            isAssemblyChar = numpy.logical_or.reduce([seq == x for x in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssemblyChar)]

            # A difference of 1 between neighbouring gap indexes means both
            # belong to the same run. Dropping the second index of each such
            # pair leaves the run starts; dropping the first (after adding 1)
            # leaves the end-exclusive run ends.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            gapBeginIndexes = numpy.delete(gapIndexes, (numpy.arange(len(gapIndexDiff)) + 1)[gapIndexDiff == 1])
            gapEndIndexes = numpy.delete(gapIndexes + 1, numpy.arange(len(gapIndexDiff))[gapIndexDiff == 1])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for gapBegin, gapEnd in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                outFile.write('\t'.join([chr, str(gapBegin), str(gapEnd)]) + os.linesep)

        if not anyGaps:
            # No gap was found anywhere: write a single placeholder line on
            # the first chromosome (no trailing line separator, as before).
            outFile.write('\t'.join([GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
 def _compute(self):
     """Turn the child's segments into a step function of coverage depth.

     Reads starts and ends from the first child's result and counts +1 at
     every start and -1 at every end. Returns a TrackView whose consecutive
     segments run between the sorted border positions (padded to position 0
     and to the last chromosome position), each valued with the accumulated
     count at its left border.
     """
     tv = self._children[0].getResult()
     starts, ends = tv.startsAsNumpyArray(), tv.endsAsNumpyArray()

     # Sweep-line bookkeeping: net change in coverage at each border position.
     borderDict = defaultdict(int)
     listLen = len(starts)

     for index in xrange(listLen):
         borderDict[starts[index]] += 1
         borderDict[ends[index]] -= 1

     sortedPos = sorted(borderDict)
     # NOTE(review): an input without elements leaves sortedPos empty, and the
     # indexing below then raises IndexError - confirm callers never pass one.

     #handle start border issues
     # If nothing starts at position 0, prepend a zero-valued segment from 0.
     if sortedPos[0] == 0:
         startList, endList, valList = sortedPos, sortedPos[1:], []
     else:
         startList, endList, valList = [0] + sortedPos, sortedPos, [0]

     #Handle end border issues
     # Either close the last segment at the final chromosome position, or drop
     # the last start because no segment can follow it.
     chrEndPos = GenomeInfo.getChrLen(tv.genomeAnchor.genome, tv.genomeAnchor.chr) - 1
     if endList[-1] < chrEndPos:
         endList = endList + [chrEndPos]
     else:
         startList = startList[:-1]

     #make step-function values
     accVal = 0
     for pos in sortedPos:
         accVal += borderDict[pos]
         valList.append(accVal)

     # 'pos' is the last border here; a border exactly on the final position
     # has no segment after it, so its value is discarded.
     if chrEndPos == pos:
         valList.pop()

     return TrackView(genomeAnchor=tv.genomeAnchor, startList=np.array(startList), endList=np.array(endList), valList=np.array(valList), \
                      strandList=None, idList=None, edgesList=None, weightsList=None, borderHandling=tv.borderHandling, allowOverlaps=False)
Exemple #7
0
 def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                        func, username, chrList):
     """Create ``outTrackName`` by running PreProcessCustomTrackJob in two passes.

     One whole-chromosome region is built per entry of ``chrList``. Each
     region is first processed on its own with ``preProcess=True`` and
     ``finalize=False``; afterwards a single job over all regions runs with
     ``preProcess=False`` and ``finalize=True``.
     """
     # Keyword arguments shared by every job.
     sharedKwArgs = dict(username=username,
                         inTrackName=inTrackName,
                         windowSize=windowSize,
                         func=func)

     regionList = []
     for chr in chrList:
         regionList.append(GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr)))

     # Pass 1: one preprocessing job per chromosome region.
     for region in regionList:
         chrJob = PreProcessCustomTrackJob(genome, outTrackName, [region],
                                           cls._getGeSourceForRegion,
                                           preProcess=True, finalize=False,
                                           **sharedKwArgs)
         chrJob.process()

     # Pass 2: one finalizing job covering all regions.
     finalizeJob = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                            cls._getGeSourceForRegion,
                                            preProcess=False, finalize=True,
                                            **sharedKwArgs)
     finalizeJob.process()
    def __iter__(self):
        """Yield the regions to analyse, chromosome by chromosome.

        For every chromosome in the extended chromosome list:
        - without a bounding-region shelve for track 1, the whole chromosome
          is yielded;
        - if track 2 has no shelve, or only track 2 consists solely of
          whole-chromosome regions, the bounding regions of track 1 are used;
        - if only track 1 consists solely of whole-chromosome regions, the
          bounding regions of track 2 are used;
        - otherwise the intersection of both tracks' bounding regions is used.
        """
        brShelve1 = self._getBoundingRegionShelve(self._trackName1)
        brShelve2 = self._getBoundingRegionShelve(self._trackName2)

        allBrsAreWholeChrs1 = False
        if brShelve1 is not None:
            allBrsAreWholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(brShelve1)

        allBrsAreWholeChrs2 = False
        if brShelve2 is not None:
            allBrsAreWholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(brShelve2)

        for chr in GenomeInfo.getExtendedChrList(self.genome):
            if brShelve1 is None:
                yield GenomeRegion(self.genome, chr, 0,
                                   GenomeInfo.getChrLen(self.genome, chr))
                continue

            brList1 = brShelve1.getAllBoundingRegionsForChr(chr)

            if brShelve2 is None or \
                    (allBrsAreWholeChrs2 and not allBrsAreWholeChrs1):
                regions = brList1
            else:
                brList2 = brShelve2.getAllBoundingRegionsForChr(chr)
                if allBrsAreWholeChrs1 and not allBrsAreWholeChrs2:
                    regions = brList2
                else:
                    regions = self.getAllIntersectingRegions(
                        self.genome, chr, brList1, brList2)

            for reg in regions:
                yield reg
    def _createPreProcFiles(self):
        """Write the preprocessed files for ``self._chr`` from ``self._geSource``.

        The track metadata is always updated first. Nothing more happens when
        the collector reports zero elements for the chromosome. In any mode
        other than 'Real' the source is only iterated through, without
        writing. Otherwise every element (or raw slice, for slice sources) is
        written to an OutputDirectory and the chromosome is registered as
        preprocessed.
        """
        geSource = self._geSource
        collector = TrackInfoDataCollector(self._genome, self._trackName)
        collector.updateMetaDataForFinalization(
            geSource.getFileSuffix(), geSource.getPrefixList(),
            geSource.getValDataType(), geSource.getValDim(),
            geSource.getEdgeWeightDataType(), geSource.getEdgeWeightDim(),
            geSource.hasUndirectedEdges(),
            geSource.getVersion(), PreProcessUtils.constructId(geSource))

        if collector.getNumElements(self._chr, self._allowOverlaps) == 0:
            return

        if self._mode != 'Real':
            # Exhaust the source without writing anything.
            for _ in self._geSource:
                pass
            return

        outputDir = OutputDirectory(
            createDirPath(self._trackName, self._genome, self._chr, self._allowOverlaps),
            collector.getPrefixList(self._allowOverlaps),
            collector.getNumElements(self._chr, self._allowOverlaps),
            GenomeInfo.getChrLen(self._genome, self._chr),
            collector.getValDataType(), collector.getValDim(),
            collector.getEgdeWeightDataType(), collector.getEgdeWeightDim(),
            collector.getMaxNumEdges(self._chr, self._allowOverlaps),
            collector.getMaxStrLens(self._chr, self._allowOverlaps))

        if self._geSource.isSliceSource():
            writeFunc = outputDir.writeRawSlice
        else:
            writeFunc = outputDir.writeElement

        for ge in self._geSource:
            writeFunc(ge)

        collector.appendPreProcessedChr(self._allowOverlaps, self._chr)

        outputDir.close()
    def __iter__(self):
        """Yield the regions to analyse, chromosome by chromosome.

        For every chromosome in the extended chromosome list:
        - without a bounding-region shelve for track 1, the whole chromosome
          is yielded;
        - if track 2 has no shelve, or only track 2 consists solely of
          whole-chromosome regions, the bounding regions of track 1 are used;
        - if only track 1 consists solely of whole-chromosome regions, the
          bounding regions of track 2 are used;
        - otherwise the intersection of both tracks' bounding regions is used.
        """
        brShelve1 = self._getBoundingRegionShelve(self._trackName1)
        brShelve2 = self._getBoundingRegionShelve(self._trackName2)

        # These flags are computed from the shelve alone, not from the
        # chromosome being visited, so they are determined once up front.
        allBrsAreWholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(brShelve1) \
            if brShelve1 is not None else False
        allBrsAreWholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(brShelve2) \
            if brShelve2 is not None else False

        for chr in GenomeInfo.getExtendedChrList(self.genome):
            if brShelve1 is None:
                yield GenomeRegion(self.genome, chr, 0, GenomeInfo.getChrLen(self.genome, chr))
            else:
                brList1 = brShelve1.getAllBoundingRegions(chr)

                if brShelve2 is None or \
                    (allBrsAreWholeChrs2 and not allBrsAreWholeChrs1):
                    for reg in brList1:
                        yield reg
                else:
                    brList2 = brShelve2.getAllBoundingRegions(chr)
                    if allBrsAreWholeChrs1 and not allBrsAreWholeChrs2:
                        for reg in brList2:
                            yield reg
                    else:
                        for reg in self.getAllIntersectingRegions(self.genome, chr, brList1, brList2):
                            yield reg
Exemple #11
0
    def execute(cls, choices, galaxyFn=None, username=''):
        """Run the tool for the given GUI choices.

        ``choices[0]`` is the genome, ``choices[1]``/``choices[2]`` select the
        first track (a history item when ``choices[1] == 'history'``) and
        ``choices[3]`` is the history item holding the binning. The binning
        must be a bed-type or gtrack file; any other type raises
        InvalidFormatError. The sorted binning elements are passed on to
        ``cls.PrintResultToHistItem``.
        """
        from quick.application.ExternalTrackManager import ExternalTrackManager

        genome = choices[0]

        trackNameParts = choices[2].split(':')
        if choices[1] == 'history':
            preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
                genome, trackNameParts)
        else:
            preProcTN1 = trackNameParts

        # NOTE(review): chrSizeDict is built but never used in this method.
        chrSizeDict = {}
        for chrom in GenomeInfo.getChrList(genome):
            chrSizeDict[chrom] = GenomeInfo.getChrLen(genome, chrom)

        trackType = choices[3].split(':')[1]
        fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
            choices[3].split(':'))

        if trackType == 'gtrack':
            unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
        elif trackType in ['valued.bed', 'category.bed', 'bed']:
            unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
        else:
            raise InvalidFormatError(
                'The Binning must be of the following formats: gtrack, valued.bed, category.bed ,bed ...'
            )

        geSource = GenomeElementSorter(unsortedSource).__iter__()

        cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome,
                                  username)
Exemple #12
0
 def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
     if self.getFixedGapSize() == 0 and not self._reprIsDense:
         # If only full chromosomes
         if all(brt.region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
                 brt.region.start == 0 and \
                  brt.region.end == GenomeInfo.getChrLen(self._genome, brt.region.chr) \
                   for brt in self._boundingRegionTuples):
             self._boundingRegionTuples = []
 def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
     """Assert that each chromosome's track view holds the expected element count.

     ``chrElCountDict`` maps chromosome to expected count. The view is read
     from ``customBins[chr]`` when present, otherwise from the whole
     chromosome.
     """
     for chr, expectedCount in chrElCountDict.items():
         if chr in customBins:
             region = customBins[chr]
         else:
             chrLen = GenomeInfo.getChrLen(self.GENOME, chr)
             region = GenomeRegion(self.GENOME, chr, 0, chrLen)

         tv = self._getTrackView(trackName, region, allowOverlaps)
         actualCount = sum(1 for _ in tv)
         self.assertEquals(expectedCount, actualCount)
    def _checkValidEnd(self, chr, end, start=None):
        if end < 0:
            raise InvalidFormatError('Error: end position is negative: %s' %
                                     end)

        if self.genome and \
            GenomeInfo.isValidChr(self.genome, chr) and \
                end-1 > GenomeInfo.getChrLen(self.genome, chr):
            raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \
                                     (chr, end-1, GenomeInfo.getChrLen(self.genome, chr)))
        if start is not None and end <= start:
            if not start == end == 1:
                raise InvalidFormatError(
                    'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d'
                    % (end, start))

        return end
 def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
     if self.getFixedGapSize() == 0 and not self._reprIsDense:
         # If only full chromosomes
         if all(brt.region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
                 brt.region.start == 0 and \
                  brt.region.end == GenomeInfo.getChrLen(self._genome, brt.region.chr) \
                   for brt in self._boundingRegionTuples):
             self._boundingRegionTuples = []
Exemple #16
0
 def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                        func, username, chrList):
     """Create ``outTrackName`` with a single PreProcessCustomTrackJob.

     One whole-chromosome region is built per entry of ``chrList`` and all
     regions are handed to one job, which is then processed.
     """
     regionList = []
     for chr in chrList:
         chrLen = GenomeInfo.getChrLen(genome, chr)
         regionList.append(GenomeRegion(genome, chr, 0, chrLen))

     job = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                    cls._getGeSourceForRegion,
                                    username=username,
                                    inTrackName=inTrackName,
                                    windowSize=windowSize,
                                    func=func)
     job.process()
    def validate_snp(cls, snps, genome):
        """Validate SNP specifications and return a list of error messages.

        Each entry of ``snps`` must be a 5-item sequence
        ``(rsid, chr, pos, ref, var)``. The checks run in a fixed order and
        stop at the first failure, so each SNP adds at most one message.
        An empty list means every SNP passed.
        """
        # list(...) keeps the concatenation valid whether keys are returned as
        # a list or as a view object.
        DNA = ['A', 'C', 'G', 'T'] + list(cls.AMBIGUOUS_DNA)

        err = []
        for snp in snps:
            assert len(snp) == 5
            _rsid, _chr, _pos, _ref, _var = snp
            #spec = ':'.join(snp)
            spec = repr(snp)

            # An rsid without a chromosome is reported as an invalid RefSNP.
            if _rsid and not _chr:
                err.append('Invalid RefSNP: rs' + _rsid)
                continue
            if _chr not in GenomeInfo.getChrList(genome):
                err.append(spec + ' Chromosome ' + _chr + ' is not valid')
                continue
            if not _pos.isdigit():
                err.append(spec + ' Position must numeric')
                continue
            # NOTE(review): unreachable after the isdigit() check above, which
            # already rejects a leading minus sign; kept as a safety net.
            if int(_pos) < 0:
                err.append(spec + ' Position must be higher than 0')
                continue

            chrLen = GenomeInfo.getChrLen(genome, _chr)
            if int(_pos) > chrLen:
                err.append(spec +
                           ' Position is higher than length of %s (%d)' %
                           (_chr, chrLen))
                continue

            ref = VariantMeltingProfile.get_reference_allele(
                genome, _chr, _pos, len(_ref))
            if _ref != ref:
                err.append(
                    spec +
                    ' Reference allele does not match reference genome, should be: '
                    + ref)
                continue

            if _ref == 'N':
                err.append(spec + ' Reference allele can not be N')
                continue

            if not _var:
                err.append(spec + ' Variant allele not specified')
                continue

            if not all(v in DNA for v in _var):
                err.append(spec + ' Variant allele ' + _var + " is not valid")
                continue

            # 'in' replaces the deprecated dict.has_key().
            if _var in cls.AMBIGUOUS_DNA and _ref in cls.AMBIGUOUS_DNA[_var]:
                err.append(spec +
                           ' Ambiguous variant allele includes reference')
                continue

        return err
 def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps,
                       customBins):
     """Check the number of elements found per chromosome in a track.

     For every chromosome key of ``chrElCountDict`` a track view is fetched,
     over ``customBins[chr]`` if given and over the full chromosome
     otherwise, and its element count is asserted to equal the dict value.
     """
     for chr in list(chrElCountDict.keys()):
         region = customBins[chr] if chr in customBins else \
             GenomeRegion(self.GENOME, chr, 0,
                          GenomeInfo.getChrLen(self.GENOME, chr))
         tv = self._getTrackView(trackName, region, allowOverlaps)

         numElements = 0
         for _ in tv:
             numElements += 1
         self.assertEquals(chrElCountDict[chr], numElements)
 def getNumberElements(genome, trackName):
     """Return the number of track elements per chromosome of ``genome``.

     The result has one entry per chromosome, in the order of
     ``GenomeInfo.getChrList(genome)``; each entry is the length of the
     start array of the track view over the whole chromosome.
     """
     track = PlainTrack(trackName)
     numElements = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
         chromStarts = track.getTrackView(wholeChr).startsAsNumpyArray()
         numElements.append(len(chromStarts))

     return numElements
 def getAnchor(genome, trackName):
     """Return the string form of the genome anchor of each chromosome's track view.

     One entry per chromosome, in the order of
     ``GenomeInfo.getChrList(genome)``.
     """
     track = PlainTrack(trackName)
     anchor = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
         chromView = track.getTrackView(wholeChr)
         anchor.append(str(chromView.genomeAnchor))

     return anchor
 def createNmerChains(self, n):
     for chr in GenomeInfo.getChrList(self._genome):
         print 'Creating chains of nmers of length ', n, ' for chromosome ', chr
         chrLen = GenomeInfo.getChrLen(self._genome,chr)
         chrReg = GenomeRegion( self._genome, chr, 0, chrLen )
         seqTV = PlainTrack( GenomeInfo.getSequenceTrackName(self._genome) ).getTrackView(chrReg)
         
         #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV))
         nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray())
         SameValueIndexChainsFactory.generate( nmersAsInts, chrLen, 4**n, self._createPath(n), chr )
Exemple #22
0
    def isCompBin(region):
        """Tell whether ``region`` coincides exactly with one computation bin.

        Iterable inputs are never computation bins. Otherwise the region must
        start at offset 0 of its bin, and its length must equal the bin size,
        or the distance from its start to the chromosome end if that is
        shorter.
        """
        if isIter(region):
            return False

        binNumber = CompBinManager.getBinNumber(region.start)
        offsetOK = CompBinManager.getOffset(region.start, binNumber) == 0

        regionLen = len(region)
        compBinSize = CompBinManager.getCompBinSize()
        basesLeftInChr = GenomeInfo.getChrLen(region.genome, region.chr) - region.start
        lengthOK = regionLen == min(compBinSize, basesLeftInChr)

        return offsetOK and lengthOK
    def createNmerChains(self, n):
        for chr in GenomeInfo.getChrList(self._genome):
            print 'Creating chains of nmers of length ', n, ' for chromosome ', chr
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            chrReg = GenomeRegion(self._genome, chr, 0, chrLen)
            seqTV = PlainTrack(GenomeInfo.getSequenceTrackName(
                self._genome)).getTrackView(chrReg)

            #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV))
            nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray())
            SameValueIndexChainsFactory.generate(nmersAsInts, chrLen, 4**n,
                                                 self._createPath(n), chr)
Exemple #24
0
    def extend(self, extensionSize, ensureValidity=True):
        """Grow (or shrink) this region in place and return ``self``.

        A non-negative ``extensionSize`` is added to ``end``; a negative one
        is added to ``start``, moving the start to the left. With
        ``ensureValidity`` the result is clipped to the range from 0 to the
        chromosome length.
        """
        if extensionSize >= 0:
            self.end = self.end + extensionSize
        else:
            self.start = self.start + extensionSize

        if not ensureValidity:
            return self

        # Clip to the chromosome boundaries.
        chrLen = GenomeInfo.getChrLen(self.genome, self.chr)
        self.start = max(0, self.start)
        self.end = min(self.end, chrLen)
        return self
Exemple #25
0
    def execute(cls, choices, galaxyFn=None, username=''):
        """Write the expanded elements of the selected track to ``galaxyFn``.

        ``choices[0]`` is the genome. When ``choices[1] == 'history'`` the
        history item in ``choices[2]`` is copied to a temporary file, read as
        a sorted bed source and written in one go; the temporary file is
        removed afterwards, also on failure. Otherwise the stored track named
        by ``choices[3]`` is written chromosome by chromosome, with the
        header flag set only for the first chromosome.
        """
        from gold.util.RandomUtil import random

        genome = choices[0]
        trackItem = choices[3]
        # NOTE(review): the returned file name is unused; the call is retained
        # from the original in case it has side effects - confirm and remove.
        GenomeInfo.getChrRegsFn(genome)

        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        headLinesStr = None

        # The with-block closes the output file on every exit path.
        with open(galaxyFn, 'w') as outputFile:
            if choices[1] == 'history':
                trackType = choices[2].split(':')[1]
                # The incoming username is replaced by six random lowercase
                # letters, used only to build the temporary file name.
                username = ''.join(
                    [chr(random.randint(97, 122)) for i in range(6)])
                tempFn = createCollectedPath(
                    genome, [],
                    username + '_'.join([str(v) for v in time.localtime()[:6]]) +
                    '.' + trackType)
                fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
                    choices[2].split(':'))

                try:
                    with open(fnSource, 'r') as sourceFile:
                        with open(tempFn, 'w') as tempFile:
                            tempFile.write(sourceFile.read())

                    # geSource stays None for any non-bed track type, as in
                    # the original (gtrack support is disabled).
                    geSource = None
                    if trackType in ['valued.bed', 'category.bed', 'bed']:
                        geSource = GenomeElementSorter(
                            BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                    #elif trackType == 'gtrack':
                    #    geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    #    headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                    cls.WriteExpandedElementsToFile(geSource,
                                                    chrSizeDict,
                                                    outputFile,
                                                    headLinesStr,
                                                    writeHeaderFlag=True)
                finally:
                    # Never leave the temporary copy behind.
                    if os.path.exists(tempFn):
                        os.remove(tempFn)

            else:
                writeHeaderFlag = True
                for chrom in GenomeInfo.getChrList(genome):
                    gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                    plTrack = PlainTrack(trackItem.split(':'))
                    geSource = GenomeElementTvWrapper(
                        plTrack.getTrackView(gRegion)).__iter__()
                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict,
                                                    outputFile, headLinesStr,
                                                    writeHeaderFlag)
                    writeHeaderFlag = False
 def getSegmentSizes(genome, trackName):
     """Return the summed element length per chromosome of ``genome``.

     For each chromosome, in the order of ``GenomeInfo.getChrList(genome)``,
     the element lengths (end - start) of the track view over the whole
     chromosome are summed; the list of these sums is returned.
     """
     track = PlainTrack(trackName)
     sumSegmentSize = []
     for chrom in GenomeInfo.getChrList(genome):
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         region = GenomeRegion(genome, chrom, 0, chromLen)
         tv = track.getTrackView(region)
         sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
         # tolist() turns the numpy scalar sum into a plain Python number.
         sumSegmentSize.append(sizeSegments.sum().tolist())

     return sumSegmentSize
def smoothPoints(genome, inTrackName, windowSize, chr):
    from gold.extra.SlidingWindow import SlidingWindow
    from quick.util.GenomeInfo import GenomeInfo
    from gold.track.Track import PlainTrack
    from gold.track.GenomeRegion import GenomeRegion
    
    #func = lambda x: ( sum( [r.dnorm(i-len(x)/2.0,0,2000)*x[i].end for i in range(len(x)) if x[i]!=None] ) / sum( [r.dnorm(i-len(x)/2.0,0,2000)*1 for i in range(len(x)) if x[i]!=None] ) ) if len([y for y in x if y!=None])>0 else 0    
    
    chrReg = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome,chr) )
            #chrReg = GenomeElement(genome, chr, 0, 3000)
    inTrackView = PlainTrack(inTrackName).getTrackView(chrReg)
    print [x.end() for x in inTrackView]
    slidingWindows = SlidingWindow(GenomeElementTvWrapper(inTrackView), windowSize)
    print [x for x in weightedValForWindowsYielder(slidingWindows, windowSize)]
Exemple #28
0
    def _createOutputDirectory(self, genome, chr, trackName, allowOverlaps,
                               geSourceManager):
        """Create the OutputDirectory for one chromosome of a track.

        The directory path comes from createDirPath; all remaining
        constructor arguments are read from ``geSourceManager``, plus the
        chromosome length from GenomeInfo.
        """
        dirPath = createDirPath(trackName, genome, chr, allowOverlaps)

        from quick.util.GenomeInfo import GenomeInfo

        # Positional arguments, in the order OutputDirectory is called with.
        outputDirArgs = [
            dirPath,
            geSourceManager.getPrefixList(),
            geSourceManager.getNumElementsForChr(chr),
            GenomeInfo.getChrLen(genome, chr),
            geSourceManager.getValDataType(),
            geSourceManager.getValDim(),
            geSourceManager.getEdgeWeightDataType(),
            geSourceManager.getEdgeWeightDim(),
            geSourceManager.getMaxNumEdgesForChr(chr),
            geSourceManager.getMaxStrLensForChr(chr),
            geSourceManager.isSorted(),
        ]
        return OutputDirectory(*outputDirArgs)
    def __iter__(self):
        """Yield the weighted sum ``w1*v1 + w2*v2`` for each position of the chromosome.

        The values of both tracks are read over the whole chromosome
        ``self.chr``; one result is yielded per value of the first track.
        """
        chr, genome = self.chr, self.genome
        trackName1, trackName2 = self.trackName1, self.trackName2
        w1, w2 = self.w1, self.w2

        region = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr))

        vals1 = PlainTrack(trackName1).getTrackView(region).valsAsNumpyArray()
        vals2 = PlainTrack(trackName2).getTrackView(region).valsAsNumpyArray()

        numVals = len(vals1)
        i = 0
        while i < numVals:
            yield w1*vals1[i] + w2*vals2[i]
            i += 1
    def __iter__(self):
        """Iterate over the weighted combination of two tracks' values.

        Both tracks are read over the whole chromosome ``self.chr``. For
        every index of the first track's value array,
        ``w1 * vals1[i] + w2 * vals2[i]`` is yielded.
        """
        genome = self.genome
        chr = self.chr
        w1 = self.w1
        w2 = self.w2

        chrLen = GenomeInfo.getChrLen(genome, chr)
        region = GenomeRegion(genome, chr, 0, chrLen)

        tv1 = PlainTrack(self.trackName1).getTrackView(region)
        vals1 = tv1.valsAsNumpyArray()

        tv2 = PlainTrack(self.trackName2).getTrackView(region)
        vals2 = tv2.valsAsNumpyArray()

        for i in xrange(len(vals1)):
            weighted1 = w1 * vals1[i]
            yield weighted1 + w2 * vals2[i]
    def __iter__(self):
        """Yield random one-base segments ``[pos, pos+1]`` driven by a track's values.

        The values of ``self.trackName1`` over the whole chromosome are
        rescaled linearly using their minimum and maximum. A position is
        yielded when a fresh ``r.runif(1)`` draw is smaller than
        ``self.factor`` times the rescaled value at that position.
        """
        from gold.application.RSetup import r

        genome = self.genome
        chr = self.chr
        factor = self.factor

        wholeChr = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr))
        vals1 = PlainTrack(self.trackName1).getTrackView(wholeChr).valsAsNumpyArray()

        #scale between 0 and 1..:
        # NOTE(review): when all values are equal, maxVal - minVal is 0 and the
        # division below is not guarded - confirm inputs are never constant.
        minVal, maxVal = vals1.min(), vals1.max()
        scale = (1/(maxVal-minVal))
        vals1 = (vals1 - minVal) * scale

        # One random draw per position, in position order.
        for pos in xrange(len(vals1)):
            if r.runif(1) < factor*vals1[pos]:
                yield [pos, pos+1]
 def getGenomicElements(genome, trackName):
     """Return all track elements of ``genome`` as ``[chrom, start, end]`` lists.

     Chromosomes are visited in the order of
     ``GenomeInfo.getChrList(genome)``; within a chromosome the elements
     follow the iteration order of the track view.
     """
     track = PlainTrack(trackName)
     genElements = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
         for el in track.getTrackView(wholeChr):
             genElements.append([chrom, el.start(), el.end()])

     return genElements
 
     #print numpy.version.version # 1.7.1 !!
     #unique, counts = numpy.unique(segmentSize, return_counts=True) # This is for numpy 1.9
     #print numpy.asarray((unique, counts)).T
     
     '''track.setFormatConverter('SegmentToMidPointFormatConverter')
    def nextBin(self):
        """Yield the bins obtained by splitting each user region into pieces.

        Each region of ``self._userBinSource`` starts at its own start (0 if
        missing) and ends at the smaller of its end and the chromosome
        length; the chromosome length is only known when the region has a
        genome. Without ``self._binLen`` the region is yielded as one bin;
        otherwise it is cut into consecutive bins of at most ``_binLen``
        bases.
        """
        for region in self._userBinSource:
            start = 0 if region.start is None else region.start

            if region.genome is None:
                chrLen = None
            else:
                chrLen = GenomeInfo.getChrLen(region.genome, region.chr)

            # The region ends at whichever known limit comes first.
            knownLimits = [x for x in [region.end, chrLen] if x is not None]
            regEnd = min(knownLimits)

            if self._binLen is None:
                yield GenomeRegion(region.genome, region.chr, start, regEnd)
                continue

            while start < regEnd:
                binEnd = min(start + self._binLen, regEnd)
                yield GenomeRegion(region.genome, region.chr, start, binEnd)
                start += self._binLen
    def __iter__(self):
        """Yield random one-base segments ``[pos, pos + 1]`` driven by a track's values.

        The values of ``self.trackName1`` over the whole chromosome are
        rescaled linearly using their minimum and maximum. A position is
        yielded when a fresh ``r.runif(1)`` draw is smaller than
        ``self.factor`` times the rescaled value at that position.
        """
        from proto.RSetup import r

        chr, genome = self.chr, self.genome
        trackName1 = self.trackName1
        factor = self.factor

        chrLen = GenomeInfo.getChrLen(genome, chr)
        region = GenomeRegion(genome, chr, 0, chrLen)

        tv1 = PlainTrack(trackName1).getTrackView(region)
        vals1 = tv1.valsAsNumpyArray()

        #scale between 0 and 1..:
        # NOTE(review): when all values are equal, maxVal - minVal is 0 and the
        # division below is not guarded - confirm inputs are never constant.
        minVal, maxVal = vals1.min(), vals1.max()
        vals1 = (vals1 - minVal) * (1 / (maxVal - minVal))

        numPositions = len(vals1)
        for pos in xrange(numPositions):
            threshold_draw = r.runif(1)
            if threshold_draw < factor * vals1[pos]:
                yield [pos, pos + 1]
    def createBoundingRegionShelve(genome, trackName, allowOverlaps):
        """Store the bounding regions of a preprocessed track in a BoundingRegionShelve.

        Bounding region tuples with a chromosome are taken from the track's
        TrackInfoDataCollector. If there are none, one whole-chromosome tuple
        is created per preprocessed chromosome. After storing, the element
        count per chromosome in the shelve is compared with the collector's
        count; a mismatch raises ShouldNotOccurError.
        """
        collector = TrackInfoDataCollector(genome, trackName)
        geChrList = collector.getPreProcessedChrs(allowOverlaps)

        boundingRegionTuples = []
        for brt in collector.getBoundingRegionTuples(allowOverlaps):
            if brt.region.chr is not None:
                boundingRegionTuples.append(brt)

        if len(boundingRegionTuples) == 0:
            # Fall back to one full-chromosome bounding region per chromosome.
            for chr in geChrList:
                wholeChr = GenomeRegion(chr=chr, start=0, end=GenomeInfo.getChrLen(genome, chr))
                boundingRegionTuples.append(
                    BoundingRegionTuple(wholeChr, collector.getNumElements(chr, allowOverlaps)))

        brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
        isSparse = not collector.getTrackFormat().reprIsDense()
        brShelve.storeBoundingRegions(boundingRegionTuples, geChrList, isSparse)

        # Consistency check over every chromosome seen in either list.
        boundingRegionChrs = set([br.region.chr for br in boundingRegionTuples])
        for chr in boundingRegionChrs | set(geChrList):
            if brShelve.getTotalElementCount(chr) == collector.getNumElements(chr, allowOverlaps):
                continue
            raise ShouldNotOccurError("Error: The total element count for all bounding regions of chromosome '%s' is not equal to the number of genome elements of that chromosome. %s != %s" % \
                                      (chr, brShelve.getTotalElementCount(chr), collector.getNumElements(chr, allowOverlaps)) )
Exemple #36
0
    def getBoundingRegionTuples(self):
        """
        Return the bounding region tuples to use for this source.

        Tuples from self._getBoundingRegionTuples() whose region has a
        chromosome are returned as they are, and
        self._boundingRegionsAndGEsCorrespond is set to True. If there are no
        such tuples, one tuple spanning each chromosome from self.getAllChrs()
        is built instead and the flag is set to False.
        """
        explicitTuples = [brTuple for brTuple in self._getBoundingRegionTuples()
                          if brTuple.region.chr is not None]

        if len(explicitTuples) != 0:
            self._boundingRegionsAndGEsCorrespond = True
            return explicitTuples

        from gold.origdata.GenomeElementSource import BoundingRegionTuple
        from gold.track.GenomeRegion import GenomeRegion
        from quick.util.GenomeInfo import GenomeInfo

        wholeChrTuples = []
        for chrName in self.getAllChrs():
            chrLen = GenomeInfo.getChrLen(self._geSource.genome, chrName)
            wholeChr = GenomeRegion(chr=chrName, start=0, end=chrLen)
            wholeChrTuples.append(
                BoundingRegionTuple(wholeChr, self.getNumElementsForChr(chrName)))

        self._boundingRegionsAndGEsCorrespond = False
        return wholeChrTuples
 def _createNmerTrack(self, nmerList, lowerOrder=None):
     nmerLengths = list(set([len(nmer) for nmer in nmerList]))
     assert len(nmerLengths)==1
     
     chainOrder = lowerOrder if lowerOrder is not None else nmerLengths[0]
     
     regionList = [GenomeRegion(self._genome, chr, 0, GenomeInfo.getChrLen(self._genome, chr) ) for chr in GenomeInfo.getChrList(self._genome)]
     
     for region in regionList:
         print '|',
         
         chains = SameValueIndexChainsFactory.load(self._createPath(chainOrder), region.chr)
         
         for nmer in nmerList:
             if len(nmerList) > 1:
                 print '.',
             
             if lowerOrder is not None:
                 nmerPrefix = nmer[0:chainOrder]
                 rawIndexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmerPrefix))             
                 indexGenerator = LowerOrderChainWrapper(rawIndexGenerator, nmerPrefix, nmer, self._genome, region.chr)
             else:
                 indexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmer)) 
     
             #print 'Length of lower order chain: %i and %i' % (sum(1 for x in indexGenerator), sum(1 for x in indexGenerator))
             #print 'Length of wrapped chain: %i and %i' % (sum(1 for x in wrappedIndexGenerator), sum(1 for x in wrappedIndexGenerator))            
             
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [region], \
                                      self._getNmerGeSourceForChr, finalize=False, preProcess=True, \
                                      indexGenerator=indexGenerator).process()
                 
     for nmer in nmerList:
         try:
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), regionList, \
                                      self._getNmerGeSourceForChr, preProcess=False, finalize=True, \
                                      indexGenerator=[0]).process()
         except EmptyGESourceError:
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [GenomeRegion(self._genome, regionList[0].chr, -1, 0)], \
                                      self._getNmerGeSourceForChr, preProcess=True, finalize=True, \
                                      indexGenerator=[-1]).process()
     
     return
    def nextBin(self):
        """
        Generate GenomeRegion bins from the regions in self._userBinSource.

        A missing region start is taken as 0. The region end is the smaller of
        the region's own end and the chromosome length (the latter only looked
        up when the region has a genome). When self._binLen is None the whole
        region is yielded as one bin; otherwise it is cut into consecutive bins
        of self._binLen positions, the last one truncated at the region end.
        """
        for userRegion in self._userBinSource:
            binStart = 0 if userRegion.start is None else userRegion.start

            chrLen = None
            if userRegion.genome is not None:
                chrLen = GenomeInfo.getChrLen(userRegion.genome, userRegion.chr)
            regionEnd = min([pos for pos in [userRegion.end, chrLen] if pos is not None])

            if self._binLen is None:
                yield GenomeRegion(userRegion.genome, userRegion.chr, binStart, regionEnd)
                continue

            while binStart < regionEnd:
                binEnd = min(binStart + self._binLen, regionEnd)
                yield GenomeRegion(userRegion.genome, userRegion.chr, binStart, binEnd)
                binStart += self._binLen
Exemple #39
0
def createChromosomeFile(genome, chromNames, referToCollected=False):
    """genome chromNames"""
    # python quick/extra/CustomFuncCatalog.py CreateChromosomeFile mm9 'chr1, chr2, ...'"
    #
    # NOTE(review): the docstring above is kept verbatim in case it doubles as
    # the command-line usage text -- confirm before rewording it.
    #
    # Writes 'chromosomes.category.bed' with one tab-separated line per
    # chromosome: <chr> 0 <chromosome length> <chr>.
    #   genome           -- genome passed to the GenomeInfo lookups
    #   chromNames       -- comma-separated chromosome names; spaces are ignored
    #   referToCollected -- if True, write below the collected path instead of
    #                       the orig path of the chromosome track

    chrList = chromNames.replace(' ','').split(',')
    if referToCollected:
        from gold.util.CommonFunctions import createCollectedPath
        basePath = createCollectedPath(genome, GenomeInfo.getChrTrackName(genome))
    else:
        basePath = gcf.createOrigPath(genome, GenomeInfo.getChrTrackName(genome))

    # Why is this file a category.bed file?
    outFn = basePath + os.sep + 'chromosomes.category.bed'
    qcf.ensurePathExists(outFn)
    print('Creating: ' + outFn)

    # The file is opened in text mode, so lines end in '\n' and the I/O layer
    # translates it per platform; writing os.linesep here would produce
    # '\r\r\n' on Windows. 'with' closes the file even if a lookup fails.
    with open(outFn, 'w') as outFile:
        for chrom in chrList:
            outFile.write('\t'.join([chrom, '0', str(GenomeInfo.getChrLen(genome, chrom)), chrom]) + '\n')
 def execute(cls, choices, galaxyFn=None, username=''):
     """
     Build a sorted genome element source from the binning file and pass it on.

     choices[0] is the genome, choices[2] the colon-separated track name
     (resolved to a preprocessed track when choices[1] == 'History') and
     choices[3] the colon-separated Galaxy track name of the binning file,
     whose element at index 1 is the file type. The result is handed to
     cls.PrintResultToHistItem.

     Raises InvalidFormatError if the binning file type is not supported.
     """
     from quick.application.ExternalTrackManager import ExternalTrackManager

     genome = choices[0]
     if choices[1] == 'History':
         preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, choices[2].split(':'))
     else:
         preProcTN1 = choices[2].split(':')

     # Not referenced again in this method; kept so the GenomeInfo lookups
     # are still performed.
     chrSizeDict = {}
     for chrom in GenomeInfo.getChrList(genome):
         chrSizeDict[chrom] = GenomeInfo.getChrLen(genome, chrom)

     trackType = choices[3].split(':')[1]
     fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[3].split(':'))

     if trackType in ['marked.bed', 'category.bed', 'bed']:
         unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
     elif trackType == 'gtrack':
         unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
     else:
         raise InvalidFormatError('The Binning must be of the following formats: gtrack, marked.bed, category.bed ,bed ...')

     geSource = GenomeElementSorter(unsortedSource).__iter__()

     cls.PrintResultToHistItem( galaxyFn, geSource, preProcTN1, genome, username)
    def execute(cls, choices, galaxyFn=None, username=''):
        '''
        Write the expanded elements of the selected track to galaxyFn.

        choices[0] is the genome. If choices[1] == 'history', choices[2] is a
        colon-separated Galaxy track name whose element at index 1 is the file
        type; the file is copied to a run-specific temporary file, read as a
        sorted bed or gtrack source and written out in one go. Otherwise
        choices[3] is a colon-separated track name that is read chromosome by
        chromosome through PlainTrack, with the header only written for the
        first chromosome.

        The output file and the temporary copy are released even when an
        error occurs.
        '''
        outputFile = open(galaxyFn, 'w')
        try:
            genome = choices[0]
            trackItem = choices[3]
            # NOTE(review): the returned path is not used in this method; the
            # call is kept in case it is relied on as a genome check -- confirm.
            GenomeInfo.getChrRegsFn(genome)

            chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                                for chrom in GenomeInfo.getChrList(genome)])
            geSource = headLinesStr = None
            if choices[1] == 'history':
                trackType = choices[2].split(':')[1]

                from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
                tempFn = GalaxyRunSpecificFile(['fromHistory.'+trackType],galaxyFn).getDiskPath(True)

                fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))
                try:
                    # Copy with explicitly closed handles so the data is
                    # flushed to disk before the element source opens it.
                    with open(fnSource, 'r') as sourceFile:
                        with open(tempFn, 'w') as tempFile:
                            tempFile.write(sourceFile.read())

                    if trackType in ['valued.bed', 'category.bed', 'bed']:
                        geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                    elif trackType == 'gtrack':
                        geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                        headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
                finally:
                    # Do not leave the temporary copy behind on failure.
                    if os.path.exists(tempFn):
                        os.remove(tempFn)

            else:
                writeHeaderFlag = True
                for chrom in GenomeInfo.getChrList(genome):
                    gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                    plTrack = PlainTrack(trackItem.split(':'))
                    geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                    writeHeaderFlag = False
        finally:
            outputFile.close()
 def execute(cls, choices, galaxyFn=None, username=''):
     """
     Write the expanded elements of the selected track to galaxyFn.

     choices[0] is the genome. If choices[1] == 'History', choices[2] is a
     colon-separated Galaxy track name whose element at index 1 is the file
     type; the file is copied to a temporary file (random six-letter prefix
     plus a timestamp) under the collected path, read as a sorted bed or
     gtrack source and written out in one go. Otherwise choices[3] is a
     colon-separated track name that is read chromosome by chromosome through
     PlainTrack, with the header only written for the first chromosome.

     The output file and the temporary copy are released even when an error
     occurs.
     """
     outputFile = open(galaxyFn, 'w')
     try:
         genome = choices[0]
         trackItem = choices[3]
         # NOTE(review): the returned path is not used in this method; the
         # call is kept in case it is relied on as a genome check -- confirm.
         GenomeInfo.getChrRegsFn(genome)

         chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom)) for chrom in GenomeInfo.getChrList(genome)])
         geSource = headLinesStr = None
         if choices[1] == 'History':

             trackType = choices[2].split(':')[1]
             # Random prefix for the temp file name; held in its own local so
             # the username parameter is not overwritten.
             randomPrefix = ''.join([chr(random.randint(97,122)) for i in range(6)])
             tempFn = createCollectedPath(genome, [], randomPrefix+'_'.join([str(v) for v in time.localtime()[:6]])+'.'+trackType)
             fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))
             try:
                 # Copy with explicitly closed handles so the data is flushed
                 # to disk before the element source opens it.
                 with open(fnSource, 'r') as sourceFile:
                     with open(tempFn, 'w') as tempFile:
                         tempFile.write(sourceFile.read())

                 if trackType in ['marked.bed', 'category.bed', 'bed']:
                     geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                 elif trackType == 'gtrack':
                     geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                     headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
             finally:
                 # Do not leave the temporary copy behind on failure.
                 if os.path.exists(tempFn):
                     os.remove(tempFn)

         else:
             writeHeaderFlag = True
             for chrom in GenomeInfo.getChrList(genome):
                 gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                 plTrack = PlainTrack(trackItem.split(':'))
                 geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                 writeHeaderFlag = False
     finally:
         outputFile.close()
    def execute(cls, choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.

        This implementation reads the attributes genome, css, filter, fdr,
        numtop and wsize from choices. It selects window addresses either by a
        Benjamini-Hochberg threshold on the p-values or by taking the
        highest-scoring windows, merges selected addresses that lie within
        wsize of each other on the same chromosome, and writes the merged
        regions to galaxyFn as a gtrack segments file. If the FDR filter
        selects nothing, "NONE found" is written instead.
        '''

        genome = choices.genome;
        infile = choices.css;
        # NOTE(review): the literal (including its spelling) presumably has to
        # match the option label offered to the user -- confirm before changing.
        fdr_filtering = (choices.filter == "FDR p-value treshold");
        if fdr_filtering:
            FDR = float(choices.fdr)
        else:
            num_top = int(choices.numtop);
        windowSize = int(choices.wsize)

        inFn = ExternalTrackManager.extractFnFromGalaxyTN(infile.split(":"))
        data = open(inFn, "r").read();
        # addr entries are split on tabs further down: field 0 is used as the
        # chromosome and field 1 as an integer position.
        scores, p, addr, windows = cls.preProcessPvalues(data, 2, 3)
        outfile = open(galaxyFn, "w") 
        
        addrs = numpy.array(addr)

        if fdr_filtering:
            # argsort is ascending, so [::-1] gives the indices ordered from
            # the largest to the smallest p-value.
            psorted = numpy.argsort(p)[::-1]
            k = float(len(p))
            n = k
            testp = 0

            # Benjamini-Hochberg procedure: walk down from the largest p-value
            # (rank k = n) and stop at the first one with p <= k/n * FDR.
            for pi in psorted:
                if p[pi] <= k/n * FDR:
                    testp = p[pi]
                    break
                k -=1
            # Tuva changed from 1 to 0:
            # k reaches 0 only when no p-value passed the test above.
            if k == 0:
                print "NONE FOUND";
                outfile.write("NONE found")
                outfile.close()
                return
        
            print "Pval found:", testp
            filteredaddrs = addrs[p<=testp]
        else:
            # Indices ordered by descending score; every address whose score is
            # at least the num_top-th highest score is kept.
            scoresorted = numpy.argsort(scores)[::-1];
            scorelimit = scores[scoresorted[num_top-1]];
            filteredaddrs = addrs[scores>=scorelimit];
        
        # This value is overwritten a few lines below, before it is read.
        prevAddr = -10000.
        headers = "##gtrack version: 1.0\n##track type: segments\n##uninterrupted data lines: true\n"+\
                "##no overlapping elements: true\n###seqid\tstart\tend\n"
        outfile.write(headers)
        curchrom = ""
        start = ""
        end = sys.maxint
        prevAddr = -1000000.
        for addr in filteredaddrs:
            addrList = addr.split("\t")
            # A new region begins on a chromosome change, or when this address
            # lies more than windowSize beyond the previous one.
            if addrList[0] != curchrom or int(addrList[1])-windowSize > prevAddr:
                if curchrom != "":
                    # Close the previous region, capped at 'end' (the previous
                    # chromosome's length minus one).
                    newend = prevAddr+windowSize if prevAddr+windowSize < end else end
                    outfile.write(start+"\t"+str(newend)+"\n")
                # 'start' is the whole tab-separated address string, so it forms
                # the leading columns of the output line.
                start = addr
                curchrom = addrList[0]
                end = int(GenomeInfo.getChrLen(genome, curchrom))-1

            prevAddr = int(addr.split("\t")[1])

        # Flush the last open region.
        newend = prevAddr+windowSize if prevAddr+windowSize < end else end
        outfile.write(start+"\t"+str(newend)+"\n")
        print "Number of regions found", len(filteredaddrs)                                                                                                                 
        if fdr_filtering:
            print "False discoveries", testp*windows
        outfile.close()
def storeShelve(genome, brTuples, sparse=True):
    """
    Validate bounding region tuples and store them in '/tmp/brshelve.shelve'.

    The regions must be sorted, non-overlapping and of positive length;
    otherwise InvalidFormatError is raised. For each chromosome a sorteddict
    mapping region start to BoundingRegionInfo is stored. When sparse is
    false, every region gets its own element index range and no bin indices.
    When sparse is true, every region of a chromosome gets the element index
    range of the whole chromosome and a bin index range covering the bins of
    the whole chromosome.
    """
    brShelve = shelve.open('/tmp/brshelve.shelve', 'c', writeback=True)

    prevRegion = None
    firstElIdxOfChr = OrderedDict()
    endElIdxOfChr = OrderedDict()
    elCountSoFar = 0

    for br in brTuples:
        curRegion = br.region

        if prevRegion is not None:
            if curRegion < prevRegion:
                raise InvalidFormatError("Error: bounding regions are unsorted: %s > %s. The Genomic HyperBrowser preprocessor requires sorted bounding regions." % (prevRegion, curRegion))
            if prevRegion.overlaps(curRegion):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (prevRegion, curRegion))

        if len(curRegion) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % curRegion)

        startsNewChr = prevRegion is None or curRegion.chr != prevRegion.chr
        if startsNewChr:
            brShelve[curRegion.chr] = sorteddict()
            if sparse:
                firstElIdxOfChr[curRegion.chr] = elCountSoFar

        if sparse:
            # Filled in per chromosome in the second pass below.
            startIdx, endIdx = None, None
        else:
            startIdx, endIdx = elCountSoFar, elCountSoFar + br.elCount
        elCountSoFar += br.elCount
        endElIdxOfChr[curRegion.chr] = elCountSoFar

        brShelve[curRegion.chr][curRegion.start] = BoundingRegionInfo(
            curRegion.start, curRegion.end, startIdx, endIdx, None, None)

        prevRegion = curRegion

    if sparse:
        binCountSoFar = 0
        for chrName in firstElIdxOfChr:
            chrLen = GenomeInfo.getChrLen(genome, chrName)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for key in brShelve[chrName].keys():
                oldInfo = brShelve[chrName][key]
                brShelve[chrName][key] = BoundingRegionInfo(
                    oldInfo.start, oldInfo.end,
                    firstElIdxOfChr[chrName], endElIdxOfChr[chrName],
                    binCountSoFar, binCountSoFar + numBinsInChr)
            binCountSoFar += numBinsInChr

    brShelve.sync()
 def dummygetChromosomlength(a,b):
     """Return GenomeInfo.getChrLen(a, b); a plain pass-through wrapper."""
     chrLen = GenomeInfo.getChrLen(a, b)
     return chrLen
    def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
        """
        Validate the bounding regions and store them in the shelve at self._fn.

        Regions must be grouped per chromosome and, within a chromosome,
        sorted, non-overlapping, non-adjoining and of positive length. When
        sparse is False the length of each region must equal its element
        count. When sparse is True every region of a chromosome gets the
        element index range of the whole chromosome and a bin index range
        covering the bins of the whole chromosome; chromosomes not in
        genomeElementChrList must have no elements and get all-zero indices.

        Raises InvalidFormatError on any violation, including chromosomes in
        genomeElementChrList that have no bounding region. After writing, the
        method waits (logging every 0.2 s) until self.fileExists() is true.
        """
        assert sparse in [False, True]

        perChrInfos = OrderedDict()
        chrsWithElements = set(genomeElementChrList)
        firstElIdxOfChr = OrderedDict()
        endElIdxOfChr = OrderedDict()
        elCountSoFar = 0
        prevRegion = None

        for br in boundingRegionTuples:
            curRegion = br.region
            chrName = curRegion.chr

            if prevRegion is not None and chrName == prevRegion.chr:
                # Same chromosome as the previous region: check their relation.
                if curRegion < prevRegion:
                    raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (prevRegion, curRegion))
                if prevRegion.overlaps(curRegion):
                    raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (prevRegion, curRegion))
                if prevRegion.end == curRegion.start:
                    raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (prevRegion, curRegion))
            else:
                if chrName in perChrInfos:
                    raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % curRegion)

                prevRegion = None
                perChrInfos[chrName] = OrderedDict()
                if sparse:
                    firstElIdxOfChr[chrName] = elCountSoFar

            if len(curRegion) < 1:
                raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % curRegion)

            if not sparse and len(curRegion) != br.elCount:
                raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (curRegion, len(curRegion), br.elCount))

            if sparse:
                startIdx, endIdx = None, None
            else:
                startIdx, endIdx = elCountSoFar, elCountSoFar + br.elCount
            elCountSoFar += br.elCount
            if sparse:
                endElIdxOfChr[chrName] = elCountSoFar

            perChrInfos[chrName][curRegion.start] = BoundingRegionInfo(
                curRegion.start, curRegion.end, startIdx, endIdx, 0, 0)

            prevRegion = curRegion

        if sparse:
            binCountSoFar = 0
            for chrName in perChrInfos:
                chrLen = GenomeInfo.getChrLen(self._genome, chrName)
                numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
                hasElements = chrName in chrsWithElements
                chrInfos = perChrInfos[chrName]

                for key in chrInfos.keys():
                    oldInfo = chrInfos[key]

                    if hasElements:
                        chrInfos[key] = BoundingRegionInfo(
                            oldInfo.start, oldInfo.end,
                            firstElIdxOfChr[chrName], endElIdxOfChr[chrName],
                            binCountSoFar, binCountSoFar + numBinsInChr)
                    else:
                        if endElIdxOfChr[chrName] - firstElIdxOfChr[chrName] > 0:
                            raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chrName, start=oldInfo.start, end=oldInfo.end), endElIdxOfChr[chrName] - firstElIdxOfChr[chrName]))
                        chrInfos[key] = BoundingRegionInfo(oldInfo.start, oldInfo.end, 0, 0, 0, 0)

                if hasElements:
                    binCountSoFar += numBinsInChr

        if len(chrsWithElements - set(perChrInfos.keys())) > 0:
            raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(chrsWithElements - set(perChrInfos.keys())))

        ensurePathExists(self._fn)

        # Replace each per-chromosome mapping by a holder of its keys and values.
        for chrName in perChrInfos:
            chrInfos = perChrInfos[chrName]
            perChrInfos[chrName] = BrInfoHolder(tuple(chrInfos.keys()), tuple(chrInfos.values()))

        brShelve = safeshelve.open(self._fn)
        brShelve.update(perChrInfos)
        brShelve.close()

        while not self.fileExists():
            from gold.application.LogSetup import logMessage
            logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
            import time
            time.sleep(0.2)
    def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList,
                             sparse):
        """
        Validate the bounding regions and store them in the shelve at self._fn.

        Regions must be grouped per chromosome and, within a chromosome,
        sorted, non-overlapping, non-adjoining and of positive length. When
        sparse is False the length of each region must equal its element
        count. When sparse is True every region of a chromosome gets the
        element index range of the whole chromosome and a bin index range
        covering the bins of the whole chromosome; chromosomes not in
        genomeElementChrList must have no elements and get all-zero indices.

        Raises InvalidFormatError on any violation, including chromosomes in
        genomeElementChrList that have no bounding region. After writing, the
        method waits (logging every 0.2 s) until self.fileExists() is true.
        """
        assert sparse in [False, True]

        # chromosome -> OrderedDict(region start -> BoundingRegionInfo)
        tempContents = OrderedDict()

        genomeElementChrs = set(genomeElementChrList)
        lastRegion = None
        # Running element count at the start / after the end of each
        # chromosome; only filled in when sparse.
        chrStartIdxs = OrderedDict()
        chrEndIdxs = OrderedDict()
        totElCount = 0
        totBinCount = 0

        for br in boundingRegionTuples:
            if lastRegion is None or br.region.chr != lastRegion.chr:
                # First region of a chromosome: the chromosome must not have
                # been seen earlier in the input.
                if br.region.chr in tempContents:
                    raise InvalidFormatError(
                        "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)."
                        % br.region)

                lastRegion = None
                tempContents[br.region.chr] = OrderedDict()  #sorteddict()
                if sparse:
                    chrStartIdxs[br.region.chr] = totElCount
            else:
                # Same chromosome as the previous region: check their relation.
                if br.region < lastRegion:
                    raise InvalidFormatError(
                        "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s."
                        % (lastRegion, br.region))
                if lastRegion.overlaps(br.region):
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' overlap." %
                        (lastRegion, br.region))
                if lastRegion.end == br.region.start:
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
                        % (lastRegion, br.region))

            if len(br.region) < 1:
                raise InvalidFormatError(
                    "Error: bounding region '%s' does not have positive length."
                    % br.region)

            if not sparse and len(br.region) != br.elCount:
                raise InvalidFormatError(
                    "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s"
                    % (br.region, len(br.region), br.elCount))

            # Dense regions get their own element index range right away;
            # sparse regions get theirs in the second pass below.
            startIdx, endIdx = (totElCount, totElCount +
                                br.elCount) if not sparse else (None, None)
            totElCount += br.elCount
            if sparse:
                chrEndIdxs[br.region.chr] = totElCount

            tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
                br.region.start, br.region.end, startIdx, endIdx, 0, 0)

            lastRegion = br.region

        if sparse:
            # Second pass: give every region the chromosome-wide element and
            # bin index ranges.
            totBinCount = 0
            for chr in tempContents:
                chrLen = GenomeInfo.getChrLen(self._genome, chr)
                numBinsInChr = CompBinManager.getNumOfBins(
                    GenomeRegion(start=0, end=chrLen))
                for key in tempContents[chr].keys():
                    startBinIdx = totBinCount
                    endBinIdx = totBinCount + numBinsInChr
                    brInfo = tempContents[chr][key]

                    if chr in genomeElementChrs:
                        tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                    chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                    startBinIdx, endBinIdx)
                    else:
                        # A chromosome without genome elements must not claim
                        # any elements in its bounding regions.
                        if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                            raise InvalidFormatError(
                                "Error: bounding region '%s' has incorrect element count: %s > 0"
                                % (GenomeRegion(chr=chr,
                                                start=brInfo.start,
                                                end=brInfo.end),
                                   chrEndIdxs[chr] - chrStartIdxs[chr]))
                        tempContents[chr][key] = BoundingRegionInfo(
                            brInfo.start, brInfo.end, 0, 0, 0, 0)

                # Only chromosomes with genome elements advance the bin counter.
                if chr in genomeElementChrs:
                    totBinCount += numBinsInChr

        if len(genomeElementChrs - set(tempContents.keys())) > 0:
            raise InvalidFormatError(
                'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s'
                % ', '.join(genomeElementChrs - set(tempContents.keys())))

        ensurePathExists(self._fn)

        # Replace each per-chromosome mapping by a holder of its keys and values.
        for chr in tempContents:
            brInfoDict = tempContents[chr]
            tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()),
                                             tuple(brInfoDict.values()))

        brShelve = safeshelve.open(self._fn, 'c', protocol=self.PROTOCOL)
        brShelve.update(tempContents)
        brShelve.close()

        # Poll until the shelve file is reported as existing.
        while not self.fileExists():
            from gold.application.LogSetup import logMessage
            logMessage(
                "Bounding region shelve file '%s' has yet to be created" %
                self._fn)
            import time
            time.sleep(0.2)
    def execute(cls, choices, galaxyFn=None, username=''):
        '''
        Is called when execute-button is pushed by web-user. Should print
        output as HTML to standard out, which will be directed to a results page
        in Galaxy history. If getOutputFormat is anything else than HTML, the
        output should be written to the file with path galaxyFn. If needed,
        StaticFile can be used to get a path where additional files can be put
        (e.g. generated image files). choices is a list of selections made by
        web-user in each options box.

        This implementation writes, for every chromosome and category value of
        the selected track, one tab-separated line with the smallest start and
        the largest end (plus one) seen for that category. If overlaps are not
        allowed (choices[3] != 'Yes') and the resulting regions of two
        categories partially overlap, the file content is replaced by an error
        message listing the conflicts.
        '''
        from quick.application.ExternalTrackManager import ExternalTrackManager
        from gold.origdata.BedGenomeElementSource import BedCategoryGenomeElementSource
        from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
        from gold.origdata.TrackGenomeElementSource import TrackGenomeElementSource
        from gold.track.GenomeRegion import GenomeRegion
        from quick.util.GenomeInfo import GenomeInfo
        from collections import defaultdict

        genome = choices[0]
        track = choices[2].split(':')
        allowOverlaps = choices[3] == 'Yes'

        regionList = []
        for chrom in GenomeInfo.getChrList(genome):
            start = 0
            chromSize = GenomeInfo.getChrLen(genome, chrom)
            regionList.append(GenomeRegion(genome, chrom, start, chromSize))

        if choices[1] == 'From Hyperbrowser repository':
            geSource = TrackGenomeElementSource(genome, track, regionList)
        else:
            fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(
                track)
            fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
            if fileType == 'category.bed':
                geSource = BedCategoryGenomeElementSource(fn)
            else:
                geSource = GtrackGenomeElementSource(fn)

        # chromosome -> category value -> smallest start / largest end seen
        resultMinDict = defaultdict(dict)
        resultMaxDict = defaultdict(dict)
        for ge in geSource:
            # Elements without a (truthy) end contribute their start as end.
            geEnd = ge.end if ge.end else ge.start
            chrMax = resultMaxDict[ge.chr]
            chrMin = resultMinDict[ge.chr]
            if ge.val in chrMax:
                if chrMax[ge.val] < geEnd:
                    chrMax[ge.val] = geEnd
                if chrMin[ge.val] > ge.start:
                    chrMin[ge.val] = ge.start
            else:
                chrMax[ge.val] = geEnd
                chrMin[ge.val] = ge.start

        catsConflicting = []
        # 'with' closes the output file even if writing fails part-way.
        with open(galaxyFn, 'w') as utfil:
            for chrom in sorted(resultMinDict.keys()):
                for category in resultMinDict[chrom].keys():
                    lower = resultMinDict[chrom][category]
                    upper = resultMaxDict[chrom][category]
                    if not allowOverlaps:
                        for cat in resultMinDict[chrom]:
                            if cat == category:
                                continue
                            l, u = resultMinDict[chrom][cat], resultMaxDict[chrom][cat]
                            # Disjoint regions never conflict.
                            if l >= upper or u <= lower:
                                continue
                            # Overlapping, and 'cat' does not fully cover 'category'.
                            if l > lower or u < upper:
                                catsConflicting.append(
                                    '(Category: %s,  Region: %i - %i) vs. (Category: %s, Region: %i - %i)'
                                    % (category, lower, upper, cat, l, u))

                    # NOTE(review): the written end is upper + 1 even when upper
                    # came from ge.end -- confirm this is intended.
                    utfil.write('\t'.join(
                        [chrom, str(lower),
                         str(upper + 1), category]) + '\n')

        if catsConflicting:
            # Replace the regions written above by the list of conflicts.
            with open(galaxyFn, 'w') as utfil:
                utfil.write(
                    'Error: overlapping resulting regions are not allowed with selected preferences:\n'
                    + '\n'.join(catsConflicting))
    def execute(cls, choices, galaxyFn=None, username=""):
        """Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.
        """
        print "Executing..."
        genome = choices[0]
        infile = choices[1]
        windowSize = int(choices[2])
        normquantile = float(choices[3])
        percentile = float(choices[4])

        inFn = ExternalTrackManager.extractFnFromGalaxyTN(infile.split(":"))
        data = open(inFn, "r").read()
        fetVals, addr = cls.preProcessPvalues(data, 2)
        stddevs, addr = cls.preProcessPvalues(data, 3)
        output = open(galaxyFn, "w")
        # Tuva changed sorted elms to FALSE
        output.write(
            "##gtrack version: 1.0\n"
            + "##track type: segments\n"
            + "##uninterrupted data lines: true\n"
            + "##sorted elements: false\n"
            + "##no overlapping elements: true\n"
            + "###seqid\tstart\tend\n"
        )

        # Calculate limit for FET:
        m = stats.cmedian(fetVals)
        upperquant = stats.scoreatpercentile(stddevs, percentile)
        qnorm = stats.norm.ppf(normquantile)
        limit = m + qnorm * upperquant
        print "Windows found", sum(fetVals >= limit)
        print "percentile", percentile, "normquantile", normquantile
        print "mean", m, "upperquant", upperquant, "qnorm", qnorm
        print "Limit", limit
        addrs = numpy.array(addr)
        filteredaddrs = addrs[fetVals >= limit]

        print GenomeInfo.getChrList(genome)

        curchrom = ""
        start = ""
        end = sys.maxint
        prevAddr = -1000000.0
        for addr in filteredaddrs:
            addrList = addr.split("\t")
            if addrList[0] != curchrom or int(addrList[1]) - windowSize > prevAddr:
                if curchrom != "":
                    newend = prevAddr + windowSize if prevAddr + windowSize < end else end
                    output.write(start + "\t" + str(newend) + "\n")
                start = addr
                curchrom = addrList[0]
                end = int(GenomeInfo.getChrLen(genome, curchrom)) - 1

            prevAddr = int(addr.split("\t")[1])

        newend = prevAddr + windowSize if prevAddr + windowSize < end else end
        output.write(start + "\t" + str(newend) + "\n")
        output.close()
 def _getChrLen(self):
     """Return GenomeInfo's length for chromosome self.chr of self.genome."""
     chrLen = GenomeInfo.getChrLen(self.genome, self.chr)
     return chrLen
 def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize, func, username, chrList):
     """Run a PreProcessCustomTrackJob for outTrackName over whole chromosomes.

     One GenomeRegion spanning 0 .. GenomeInfo.getChrLen(genome, chrom) is
     built for every chromosome in chrList. The job is given
     cls._getGeSourceForRegion together with username, inTrackName,
     windowSize and func as keyword arguments, and is then processed.
     Nothing is returned.
     """
     regionList = []
     for chrom in chrList:
         chrLen = GenomeInfo.getChrLen(genome, chrom)
         regionList.append(GenomeRegion(genome, chrom, 0, chrLen))

     job = PreProcessCustomTrackJob(
         genome,
         outTrackName,
         regionList,
         cls._getGeSourceForRegion,
         username=username,
         inTrackName=inTrackName,
         windowSize=windowSize,
         func=func)
     job.process()
 def _getChrLen(self):
     """Look up the length of chromosome self.chr in self.genome via GenomeInfo."""
     length = GenomeInfo.getChrLen(self.genome, self.chr)
     return length