def __iter__(self):
        """Yield the regions shared by the bounding regions of the two tracks.

        For every chromosome in the extended chromosome list of the genome:
        - without a bounding-region shelve for track 1, the whole chromosome
          is yielded;
        - if track 2 has no shelve, or only track 2 consists solely of
          whole-chromosome bounding regions, the regions of track 1 are
          yielded;
        - if only track 1 consists solely of whole-chromosome bounding
          regions, the regions of track 2 are yielded;
        - otherwise the result of getAllIntersectingRegions is yielded.
        """
        shelve1 = self._getBoundingRegionShelve(self._trackName1)
        shelve2 = self._getBoundingRegionShelve(self._trackName2)

        wholeChrsOnly1 = False
        if shelve1 is not None:
            wholeChrsOnly1 = self._commonAllBoundingRegionsAreWholeChr(shelve1)
        wholeChrsOnly2 = False
        if shelve2 is not None:
            wholeChrsOnly2 = self._commonAllBoundingRegionsAreWholeChr(shelve2)

        for chr in GenomeInfo.getExtendedChrList(self.genome):
            if shelve1 is None:
                chrLen = GenomeInfo.getChrLen(self.genome, chr)
                yield GenomeRegion(self.genome, chr, 0, chrLen)
                continue

            regions1 = shelve1.getAllBoundingRegionsForChr(chr)

            if shelve2 is None or (wholeChrsOnly2 and not wholeChrsOnly1):
                selected = regions1
            else:
                regions2 = shelve2.getAllBoundingRegionsForChr(chr)
                if wholeChrsOnly1 and not wholeChrsOnly2:
                    selected = regions2
                else:
                    selected = self.getAllIntersectingRegions(
                        self.genome, chr, regions1, regions2)

            for reg in selected:
                yield reg
 def execute(choices, galaxyFn=None, username=''):
     '''Install the genome described by the first selection of the web user.

     Called when the execute button is pushed. Output is printed to standard
     out, which will be directed to a results page in the Galaxy history.
     If needed, StaticFile can be used to get a path where additional files
     can be put (e.g. generated image files).
     choices is a list of selections made by the web user in each options box.
     '''
     print('Executing...')

     infoFn = ExternalTrackManager.extractFnFromGalaxyTN(choices[0].split(":"))
     abbrv = GenomeImporter.getGenomeAbbrv(infoFn)
     gi = GenomeInfo(abbrv)
     fastaChrNames = gi.sourceChrNames

     # The i-th key of the renaming dict is paired with the i-th source
     # chromosome name; only keys with a true value are kept.
     renamedChrs = InstallGenomeTool._getRenamedChrDictWithSelection(choices)
     chromNamesDict = dict((fastaChrNames[idx], newName)
                           for idx, newName in enumerate(renamedChrs.keys())
                           if renamedChrs[newName])
     print('All chromosomes chosen: ' + str(chromNamesDict))

     stdSelection = InstallGenomeTool._getRenamedChrDictWithSelection(choices, stdChrs=True)
     stdChrs = [name for name in stdSelection if stdSelection[name]]
     print('Standard chromosomes chosen: ' + ", ".join(stdChrs))

     GenomeImporter.createGenome(abbrv, gi.fullName, chromNamesDict, stdChrs,
                                 username=username)

     # Record who installed the genome and when, then persist the info object.
     gi.installedBy = username
     gi.timeOfInstallation = datetime.now()
     gi.store()
    def __iter__(self):
        """Yield the regions shared by the bounding regions of the two tracks.

        Without a bounding-region shelve for track 1, every chromosome of the
        extended chromosome list is yielded as a whole. Otherwise, per
        chromosome, the bounding regions of track 1, of track 2, or their
        intersection are yielded, depending on whether each track consists
        solely of whole-chromosome bounding regions.
        """
        brShelve1 = self._getBoundingRegionShelve(self._trackName1)
        brShelve2 = self._getBoundingRegionShelve(self._trackName2)

        # These flags depend only on the shelves, not on the chromosome, so
        # they are computed once instead of once per chromosome.
        allBrsAreWholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(brShelve1) \
            if brShelve1 is not None else False
        allBrsAreWholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(brShelve2) \
            if brShelve2 is not None else False

        for chr in GenomeInfo.getExtendedChrList(self.genome):
            if brShelve1 is None:
                yield GenomeRegion(self.genome, chr, 0, GenomeInfo.getChrLen(self.genome, chr))
            else:
                brList1 = brShelve1.getAllBoundingRegions(chr)

                if brShelve2 is None or \
                    (allBrsAreWholeChrs2 and not allBrsAreWholeChrs1):
                    # Track 2 imposes no restriction beyond whole chromosomes.
                    for reg in brList1:
                        yield reg
                else:
                    brList2 = brShelve2.getAllBoundingRegions(chr)
                    if allBrsAreWholeChrs1 and not allBrsAreWholeChrs2:
                        # Track 1 imposes no restriction beyond whole chromosomes.
                        for reg in brList2:
                            yield reg
                    else:
                        for reg in self.getAllIntersectingRegions(self.genome, chr, brList1, brList2):
                            yield reg
Beispiel #4
0
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'

    Write 'assemblyGaps.bed' for the genome: one tab-separated line
    (chr, start, end) per maximal run of sequence positions whose character
    is not in assemblyChars. End positions are exclusive. If no gap is found
    on any chromosome, a single placeholder line for the first chromosome is
    written instead.
    """
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))

    anyGaps = False
    # The context manager closes the file even if reading a chromosome fails.
    with open(outFn, 'w') as outFile:
        for chr in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr))
            seqTV = seqTrack.getTrackView(chrRegion)
            seq = seqTV.valsAsNumpyArray()

            # Indexes of all positions that are not an assembly character.
            isAssemblyChar = numpy.logical_or.reduce([seq == x for x in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssemblyChar)]

            # A difference of 1 between neighbouring gap indexes means both
            # belong to the same run: drop the later index from the begin
            # list and the earlier index from the (exclusive) end list.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            gapBeginIndexes = numpy.delete(gapIndexes, (numpy.arange(len(gapIndexDiff)) + 1)[gapIndexDiff == 1])
            gapEndIndexes = numpy.delete(gapIndexes + 1, numpy.arange(len(gapIndexDiff))[gapIndexDiff == 1])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for gapBegin, gapEnd in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                # '\n' rather than os.linesep: the file is opened in text
                # mode, which already translates the line terminator.
                outFile.write('\t'.join([chr, str(gapBegin), str(gapEnd)]) + '\n')

        if not anyGaps:
            outFile.write('\t'.join([GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
Beispiel #5
0
    def execute(cls, choices, galaxyFn=None, username=''):
        """Print results for the selected track, binned by a history file.

        choices[0] is the genome, choices[1] tells whether choices[2] refers
        to a history element, and choices[3] is the Galaxy track name of the
        binning file (gtrack or one of the bed variants).

        Raises InvalidFormatError for any other binning file type.
        """
        from quick.application.ExternalTrackManager import ExternalTrackManager

        genome = choices[0]

        trackNameParts = choices[2].split(':')
        if choices[1] == 'history':
            preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
                genome, trackNameParts)
        else:
            preProcTN1 = trackNameParts

        # NOTE(review): chrSizeDict is not used further down in this method.
        chrSizeDict = {}
        for chrom in GenomeInfo.getChrList(genome):
            chrSizeDict[chrom] = GenomeInfo.getChrLen(genome, chrom)

        binningTN = choices[3].split(':')
        trackType = binningTN[1]
        fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
            choices[3].split(':'))

        if trackType in ['valued.bed', 'category.bed', 'bed']:
            unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
        elif trackType == 'gtrack':
            unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
        else:
            raise InvalidFormatError(
                'The Binning must be of the following formats: gtrack, valued.bed, category.bed ,bed ...'
            )
        geSource = GenomeElementSorter(unsortedSource).__iter__()

        cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome,
                                  username)
Beispiel #6
0
 def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
     if self.getFixedGapSize() == 0 and not self._reprIsDense:
         # If only full chromosomes
         if all(brt.region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
                 brt.region.start == 0 and \
                  brt.region.end == GenomeInfo.getChrLen(self._genome, brt.region.chr) \
                   for brt in self._boundingRegionTuples):
             self._boundingRegionTuples = []
 def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
     if self.getFixedGapSize() == 0 and not self._reprIsDense:
         # If only full chromosomes
         if all(brt.region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
                 brt.region.start == 0 and \
                  brt.region.end == GenomeInfo.getChrLen(self._genome, brt.region.chr) \
                   for brt in self._boundingRegionTuples):
             self._boundingRegionTuples = []
    def validate_snp(cls, snps, genome):
        """Validate SNP specifications and return a list of error messages.

        Each element of snps must be a 5-item sequence
        (rsid, chr, pos, ref, var) of strings. At most one message is
        produced per SNP: checking of a SNP stops at its first failed check.
        An empty list means that every SNP passed.
        """
        # Membership tests work on any mapping, without relying on
        # dict.keys() returning a list or on dict.has_key().
        valid_bases = set('ACGT') | set(cls.AMBIGUOUS_DNA)

        err = []
        for snp in snps:
            assert len(snp) == 5
            _rsid, _chr, _pos, _ref, _var = snp
            spec = repr(snp)

            if _rsid and not _chr:
                err.append('Invalid RefSNP: rs' + _rsid)
                continue
            if _chr not in GenomeInfo.getChrList(genome):
                err.append(spec + ' Chromosome ' + _chr + ' is not valid')
                continue
            # isdigit() rejects signs, so a position that passes this check
            # can never be negative; no separate "< 0" check is needed.
            if not _pos.isdigit():
                err.append(spec + ' Position must numeric')
                continue

            chrLen = GenomeInfo.getChrLen(genome, _chr)
            if int(_pos) > chrLen:
                err.append(spec +
                           ' Position is higher than length of %s (%d)' %
                           (_chr, chrLen))
                continue

            ref = VariantMeltingProfile.get_reference_allele(
                genome, _chr, _pos, len(_ref))
            if _ref != ref:
                err.append(
                    spec +
                    ' Reference allele does not match reference genome, should be: '
                    + ref)
                continue

            if _ref == 'N':
                err.append(spec + ' Reference allele can not be N')
                continue

            if not _var:
                err.append(spec + ' Variant allele not specified')
                continue

            if not all(base in valid_bases for base in _var):
                err.append(spec + ' Variant allele ' + _var + " is not valid")
                continue

            if _var in cls.AMBIGUOUS_DNA and _ref in cls.AMBIGUOUS_DNA[_var]:
                err.append(spec +
                           ' Ambiguous variant allele includes reference')
                continue

        return err
Beispiel #9
0
 def __new__(cls, genome):
     """Return a one-element list with the first base pair of the first chromosome.

     The result is a list holding GenomeRegion(genome, firstChr, 0, 1), not
     an instance of cls. If the genome has no chromosomes, None is returned.
     """
     from gold.track.GenomeRegion import GenomeRegion
     from quick.util.GenomeInfo import GenomeInfo

     chrList = GenomeInfo.getChrList(genome)
     if len(chrList) == 0:
         return None

     # Reuse the list fetched above instead of looking it up a second time.
     return [GenomeRegion(genome, chrList[0], 0, 1)]
 def _checkValidStart(self, chr, start):
     if start < 0:
         raise InvalidFormatError('Error: start position is negative: %s' % start)
 
     if self.genome and \
         GenomeInfo.isValidChr(self.genome, chr) and \
             start > GenomeInfo.getChrLen(self.genome, chr):
                 raise InvalidFormatError('Error: start position is larger than chromosome size (%s) < %d' % \
                                          (GenomeInfo.getChrLen(self.genome, chr), start))
     return start
 def createNmerChains(self, n):
     for chr in GenomeInfo.getChrList(self._genome):
         print 'Creating chains of nmers of length ', n, ' for chromosome ', chr
         chrLen = GenomeInfo.getChrLen(self._genome,chr)
         chrReg = GenomeRegion( self._genome, chr, 0, chrLen )
         seqTV = PlainTrack( GenomeInfo.getSequenceTrackName(self._genome) ).getTrackView(chrReg)
         
         #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV))
         nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray())
         SameValueIndexChainsFactory.generate( nmersAsInts, chrLen, 4**n, self._createPath(n), chr )
 def getNumberElements(genome, trackName):
     """Return a list with the number of track elements on each chromosome.

     The list follows the order of GenomeInfo.getChrList(genome).
     """
     track = PlainTrack(trackName)
     counts = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChr = GenomeRegion(genome, chrom, 0,
                                 GenomeInfo.getChrLen(genome, chrom))
         trackView = track.getTrackView(wholeChr)
         counts.append(len(trackView.startsAsNumpyArray()))

     return counts
 def getAnchor(genome, trackName):
     """Return the genome anchor of each whole-chromosome track view, as strings.

     The list follows the order of GenomeInfo.getChrList(genome).
     """
     track = PlainTrack(trackName)
     anchors = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChr = GenomeRegion(genome, chrom, 0,
                                 GenomeInfo.getChrLen(genome, chrom))
         trackView = track.getTrackView(wholeChr)
         anchors.append(str(trackView.genomeAnchor))

     return anchors
    def _checkValidStart(self, chr, start):
        if start < 0:
            raise InvalidFormatError('Error: start position is negative: %s' %
                                     start)

        if self.genome and \
            GenomeInfo.isValidChr(self.genome, chr) and \
                start > GenomeInfo.getChrLen(self.genome, chr):
            raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \
                                     (chr, start, GenomeInfo.getChrLen(self.genome, chr)))
        return start
    def createNmerChains(self, n):
        for chr in GenomeInfo.getChrList(self._genome):
            print 'Creating chains of nmers of length ', n, ' for chromosome ', chr
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            chrReg = GenomeRegion(self._genome, chr, 0, chrLen)
            seqTV = PlainTrack(GenomeInfo.getSequenceTrackName(
                self._genome)).getTrackView(chrReg)

            #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV))
            nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray())
            SameValueIndexChainsFactory.generate(nmersAsInts, chrLen, 4**n,
                                                 self._createPath(n), chr)
Beispiel #16
0
    def execute(cls, choices, galaxyFn=None, username=''):
        """Write the expanded elements of the selected track to galaxyFn.

        choices[0] is the genome and choices[1] selects the input: 'history'
        reads the history element named by choices[2] (via a temporary copy),
        anything else reads the track named by choices[3], one chromosome at
        a time.
        """
        from gold.util.RandomUtil import random

        genome = choices[0]
        trackItem = choices[3]

        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        headLinesStr = None

        # All files are handled by context managers so that they are closed
        # even if reading or writing fails.
        with open(galaxyFn, 'w') as outputFile:
            if choices[1] == 'history':
                trackType = choices[2].split(':')[1]

                # Random six-letter prefix plus a timestamp for the name of
                # the temporary copy.
                randomPrefix = ''.join(
                    [chr(random.randint(97, 122)) for i in range(6)])
                tempFn = createCollectedPath(
                    genome, [],
                    randomPrefix + '_'.join([str(v) for v in time.localtime()[:6]]) +
                    '.' + trackType)
                fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
                    choices[2].split(':'))
                with open(fnSource, 'r') as inFile, open(tempFn, 'w') as tempFile:
                    tempFile.write(inFile.read())

                try:
                    # NOTE(review): for other file types geSource stays None
                    # and is passed on as such, as before - confirm that
                    # WriteExpandedElementsToFile handles this.
                    geSource = None
                    if trackType in ['valued.bed', 'category.bed', 'bed']:
                        geSource = GenomeElementSorter(
                            BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                    cls.WriteExpandedElementsToFile(geSource,
                                                    chrSizeDict,
                                                    outputFile,
                                                    headLinesStr,
                                                    writeHeaderFlag=True)
                finally:
                    # Remove the temporary copy also when processing fails.
                    os.remove(tempFn)

            else:
                plTrack = PlainTrack(trackItem.split(':'))
                # The header flag is only true for the first chromosome.
                writeHeaderFlag = True
                for chrom in GenomeInfo.getChrList(genome):
                    gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                    geSource = GenomeElementTvWrapper(
                        plTrack.getTrackView(gRegion)).__iter__()
                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict,
                                                    outputFile, headLinesStr,
                                                    writeHeaderFlag)
                    writeHeaderFlag = False
 def _checkValidEnd(self, chr, end, start=None):
     if end < 0:
         raise InvalidFormatError('Error: end position is negative: %s' % end)
     
     if self.genome and \
         GenomeInfo.isValidChr(self.genome, chr) and \
             end-1 > GenomeInfo.getChrLen(self.genome, chr):
                 raise InvalidFormatError('Error: end position is larger than chromosome size (%s)' % \
                                          GenomeInfo.getChrLen(self.genome, chr))
     if start is not None and end <= start:
             raise InvalidFormatError('Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start))
     
     return end
 def getSegmentSizes(genome, trackName):
     """Return the summed segment length of the track for each chromosome.

     The list follows the order of GenomeInfo.getChrList(genome); each entry
     is the sum of (end - start) over all segments on that chromosome.
     """
     track = PlainTrack(trackName)
     sumSegmentSize = []
     for chrom in GenomeInfo.getChrList(genome):
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         region = GenomeRegion(genome, chrom, 0, chromLen)
         tv = track.getTrackView(region)
         sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
         # Only the per-chromosome sum is returned; the individual segment
         # sizes are no longer collected because nothing used them.
         sumSegmentSize.append(sizeSegments.sum().tolist())

     return sumSegmentSize
Beispiel #19
0
    def isMemoBin(region):
        """Tell whether region counts as a memoization bin.

        Outside experimental installations, and whenever comp bin splitting
        is allowed, the answer is CompBinManager.isCompBin(region).
        Otherwise a region qualifies if it is not iterable and its
        (chr, start, end) equals that of a chromosome region or a chromosome
        arm region of its genome.
        """
        if not IS_EXPERIMENTAL_INSTALLATION or CompBinManager.ALLOW_COMP_BIN_SPLITTING:
            return CompBinManager.isCompBin(region)

        if hasattr(region, '__iter__'):
            return False

        def _hasSameSpan(candidate):
            # Compare chromosome and coordinates only.
            return [region.chr, region.start, region.end] == \
                [candidate.chr, candidate.start, candidate.end]

        isChr = any(_hasSameSpan(r) for r in GenomeInfo.getChrRegs(region.genome))
        isChrArm = any(_hasSameSpan(r) for r in GenomeInfo.getChrArmRegs(region.genome))

        return isChr or isChrArm
 def _getBoundingRegionTupleList(self, case, sortedAssertElList):
     """Build the list of BoundingRegionTuple objects for a test case.

     If the case lists bounding regions with a chromosome, they are used in
     sorted order; a missing start defaults to 0 and a missing end to the
     chromosome length. Otherwise one whole-chromosome bounding region is
     created per chromosome found in sortedAssertElList (in sorted order),
     with the number of elements on that chromosome as element count.
     """
     explicitBrs = [br for br in sorted(case.boundingRegionsAssertList)
                    if br.region.chr is not None]

     if len(explicitBrs) > 0:
         brTuples = []
         for br in explicitBrs:
             start = br.region.start if br.region.start is not None else 0
             end = br.region.end if br.region.end is not None else \
                 GenomeInfo.getChrLen(self.GENOME, br.region.chr)
             region = GenomeRegion(self.GENOME, chr=br.region.chr, start=start, end=end)
             brTuples.append(BoundingRegionTuple(region, br.elCount))
         return brTuples

     elementChrs = [ge.chr for ge in sortedAssertElList]
     elCounts = [(chr, elementChrs.count(chr)) for chr in sorted(set(elementChrs))]

     brTuples = []
     for chr, elCount in elCounts:
         wholeChr = GenomeRegion(self.GENOME, chr=chr, start=0,
                                 end=GenomeInfo.getChrLen(self.GENOME, chr))
         brTuples.append(BoundingRegionTuple(wholeChr, elCount))
     return brTuples
    def execute(cls, choices, galaxyFn=None, username=''):
        """Run the category-collapsing analysis per chromosome and compose the results.

        choices[0] is the genome, choices[1] the track name, choices[2] the
        combine method, choices[3] an optional category and choices[4] an
        optional number of samples (default '1'). Results are composed to
        galaxyFn, or to 'collapsed_result.bedgraph' under the track named by
        the last choice if choices[5] is 'Write to Standardised file'.
        """
        # Imported once up front rather than on every loop iteration.
        from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
        from gold.origdata.BedComposer import CategoryBedComposer

        genome = choices[0]
        trackName = choices[1].split(':')
        outFn = galaxyFn
        if choices[5] == 'Write to Standardised file':
            outFn = createOrigPath(genome, choices[-1].split(':'),
                                   'collapsed_result.bedgraph')
            ensurePathExists(outFn[:outFn.rfind('/') + 1])

        combineMethod = choices[2]
        category = choices[3] if choices[3] else ''
        numSamples = choices[4] if choices[4] else '1'

        analysisDef = 'dummy [combineMethod=%s] %s [numSamples=%s] -> ConvertToNonOverlappingCategorySegmentsPythonStat' % \
                        (combineMethod, '[category=%s]' % category if category != '' else '', numSamples) #'Python'

        for regSpec in GenomeInfo.getChrList(genome):
            res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome, username=username, \
                                            printResults=False, printHtmlWarningMsgs=False)

            # NOTE(review): every result is composed to the same outFn;
            # confirm that composeToFile appends rather than overwrites.
            for resDict in res.values():
                trackView = resDict['Result']
                tvGeSource = TrackViewGenomeElementSource(
                    genome, trackView, trackName)
                CategoryBedComposer(tvGeSource).composeToFile(outFn)
Beispiel #22
0
 def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                        func, username, chrList):
     """Preprocess the output track per chromosome, then finalize it.

     One PreProcessCustomTrackJob is run for each whole-chromosome region
     (preProcess=True, finalize=False), followed by a single job over all
     regions with preProcess=False and finalize=True.
     """
     regionList = []
     for chr in chrList:
         regionList.append(
             GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr)))

     # Keyword arguments shared by the preprocessing and finalizing jobs.
     sharedKwArgs = dict(username=username,
                         inTrackName=inTrackName,
                         windowSize=windowSize,
                         func=func)

     for region in regionList:
         preProcJob = PreProcessCustomTrackJob(genome, outTrackName, [region],
                                               cls._getGeSourceForRegion,
                                               preProcess=True,
                                               finalize=False,
                                               **sharedKwArgs)
         preProcJob.process()

     finalizeJob = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                            cls._getGeSourceForRegion,
                                            preProcess=False,
                                            finalize=True,
                                            **sharedKwArgs)
     finalizeJob.process()
Beispiel #23
0
 def getTotalBpSpan(self):
     """Return the number of base pairs spanned by this region.

     If no chromosome is set, this is the summed length of all chromosome
     regions of the genome; otherwise it is len(self).
     """
     if self.chr is not None:
         return len(self)

     return sum(len(reg) for reg in GenomeInfo.getChrRegs(self.genome))
    def _createPreProcFiles(self):
        """Write the preprocessed files for the current chromosome.

        The collected metadata is first updated from the genome element
        source. Nothing is written if the collector reports no elements for
        the chromosome. In any mode other than 'Real' the source is only
        iterated through, without writing.
        """
        geSource = self._geSource
        collector = TrackInfoDataCollector(self._genome, self._trackName)
        collector.updateMetaDataForFinalization(
            geSource.getFileSuffix(), geSource.getPrefixList(),
            geSource.getValDataType(), geSource.getValDim(),
            geSource.getEdgeWeightDataType(), geSource.getEdgeWeightDim(),
            geSource.hasUndirectedEdges(),
            geSource.getVersion(), PreProcessUtils.constructId(geSource))

        if collector.getNumElements(self._chr, self._allowOverlaps) == 0:
            return

        if self._mode != 'Real':
            # Consume the source without writing anything.
            for ge in self._geSource:
                pass
            return

        dirPath = createDirPath(self._trackName, self._genome, self._chr, self._allowOverlaps)

        outputDir = OutputDirectory(
            dirPath,
            collector.getPrefixList(self._allowOverlaps),
            collector.getNumElements(self._chr, self._allowOverlaps),
            GenomeInfo.getChrLen(self._genome, self._chr),
            collector.getValDataType(), collector.getValDim(),
            collector.getEgdeWeightDataType(), collector.getEgdeWeightDim(),
            collector.getMaxNumEdges(self._chr, self._allowOverlaps),
            collector.getMaxStrLens(self._chr, self._allowOverlaps))

        # Slice sources are written slice by slice, others element by element.
        if self._geSource.isSliceSource():
            writeFunc = outputDir.writeRawSlice
        else:
            writeFunc = outputDir.writeElement

        for ge in self._geSource:
            writeFunc(ge)

        collector.appendPreProcessedChr(self._allowOverlaps, self._chr)

        outputDir.close()
Beispiel #25
0
 def getValuesFromBedFile(cls, genome, fn, colorPattern=(1,0,0)):
     """Map the values of a bed file to RGB colour tuples per chromosome.

     fn is either a file name, an object offering returnComposed(), or None.
     The value is read from the fourth tab-separated column of each line;
     lines without a chromosome and a numeric value are passed to logMessage
     and skipped. Each value is scaled by the largest value found and
     combined with colorPattern into a colour tuple.

     Returns (resDict, maxVal), where resDict maps chromosome to a list of
     colour tuples. If fn is None, only the (empty) resDict is returned.
     Raises ValueError if no line yields a value.
     """
     resDict = defaultdict(list)
     if fn is None:
         return resDict

     if isinstance(fn, basestring):
         with open(fn, 'r') as bedFile:
             lines = bedFile.read().split('\n')
     else:
         lines = fn.returnComposed().split('\n')

     valDict = defaultdict(list)
     for line in lines:
         cols = line.split('\t')
         try:
             chrom, val = cols[0], float(cols[3])
         except (IndexError, ValueError):
             logMessage(line)
             continue
         valDict[chrom].append(val)

     # Largest value over all chromosomes. Taking max() of the value lists
     # themselves would compare them lexicographically and could miss it.
     maxVal = max(max(vals) for vals in valDict.values())

     for chrom in GenomeInfo.getChrList(genome):
         if chrom in valDict:
             try:
                 resDict[chrom] += [tuple([255 - (int(val*255/maxVal)*v) for v in colorPattern]) for val in valDict[chrom]]
             except (ZeroDivisionError, ValueError, OverflowError):
                 # E.g. maxVal == 0, or a value that is NaN or infinite.
                 logMessage ('Ny rundeeee:  '+ str([v for v in valDict[chrom][:10]])+ ':   '+str(maxVal))

     print('count %d' % len(valDict))
     return resDict, maxVal
 def _compute(self):
     """Return a step-function TrackView counting the segments covering each position.

     The start and end positions of the child's track view are turned into
     borders (+1 at each start, -1 at each end). Accumulating the borders in
     position order gives the value of each segment between two borders.
     """
     tv = self._children[0].getResult()
     starts, ends = tv.startsAsNumpyArray(), tv.endsAsNumpyArray()

     borderDict = defaultdict(int)
     for start, end in zip(starts, ends):
         borderDict[start] += 1
         borderDict[end] -= 1

     # A stray "range(0, chrlength, microbinzie)" statement stood here. Its
     # result was discarded and both names are not defined in this method.
     # NOTE(review): an empty track view still fails on sortedPos[0] below.
     sortedPos = sorted(borderDict)

     #handle start border issues: prepend a zero-valued segment if the
     #first border is not at position 0
     startList, endList, valList = (sortedPos,  sortedPos[1:], [])  if sortedPos[0] == 0 else  ([0] + sortedPos,  sortedPos,  [0])

     #Handle end border issues: close the last segment at the chromosome
     #end, or drop the last start if the borders already reach it
     chrEndPos = GenomeInfo.getChrLen(tv.genomeAnchor.genome, tv.genomeAnchor.chr)-1
     startList, endList  = (startList, endList+[chrEndPos])  if endList[-1]<chrEndPos else  (startList[:-1], endList)

     #make step-function values
     accVal = 0
     for pos in sortedPos:
         accVal+= borderDict[pos]
         valList.append(accVal)

     # The value after the last border is dropped if that border is the
     # chromosome end.
     if chrEndPos == sortedPos[-1]:
         valList.pop()

     return TrackView(genomeAnchor=tv.genomeAnchor, startList=np.array(startList), endList=np.array(endList), valList=np.array(valList), \
                      strandList=None, idList=None, edgesList=None, weightsList=None, borderHandling=tv.borderHandling, allowOverlaps=False)
    def getTrackView(self, region):
        """Return the track view of a random region of the same length on the same chromosome arm.

        The random region is drawn from the part of the chromosome arm that
        remains after excluding region extended by the independency buffer
        on both sides.

        Raises CentromerError unless region lies in exactly one chromosome
        arm, and TooLargeBinError if no remaining source region is at least
        MIN_SOURCE_TO_SAMPLE_SIZE_RATIO times as long as region.
        """
        assert self._origRegion == region
        allChrArmRegs = GenomeInfo.getContainingChrArms(region)
        if len(allChrArmRegs) != 1:
            raise CentromerError
        chrArm = allChrArmRegs[0]

        bufferSize = self._getIndepencyBufferSize(region)
        sourceRegs = chrArm.exclude( copy(region).extend(-bufferSize).extend(bufferSize) )
        assert len(sourceRegs) in [1,2]

        if not any(len(sourceReg) >= self.MIN_SOURCE_TO_SAMPLE_SIZE_RATIO * len(region) for sourceReg in sourceRegs):
            raise TooLargeBinError('Source region lengths of ' + str([len(x) for x in sourceRegs]) +
                                   ' are too small compared to region length of ' + str(len(region)) +
                                   ' according to MIN_SOURCE_TO_SAMPLE_SIZE_RATIO: ' + str(self.MIN_SOURCE_TO_SAMPLE_SIZE_RATIO))

        if len(sourceRegs) == 1:
            sourceReg = sourceRegs[0]
        else:
            # Pick a source region with probability proportional to its
            # surplus length over region. The float conversion keeps this a
            # true division even where "/" on integers truncates.
            surplusLens = [len(sourceRegs[i]) - len(region) for i in range(2)]
            firstSourceProportion = float(surplusLens[0]) / sum(surplusLens)
            # NOTE(review): a source region shorter than region gives a
            # negative surplus, and randint below fails if it is picked.
            sourceReg = sourceRegs[0] if random.random() < firstSourceProportion else sourceRegs[1]

        randOffset = random.randint( 0, len(sourceReg) - len(region) )
        start = sourceReg.start + randOffset
        end = start + len(region)
        randRegion = GenomeRegion(region.genome, region.chr, start, end)

        rawData = RawDataStat(randRegion, self._origTrack, self._trackFormatReq)
        tv = rawData.getResult()
        assert region != tv.genomeAnchor
        return tv
 def execute(cls, choices, galaxyFn=None, username=''):
     """Run the category-collapsing analysis per chromosome and compose the results.

     choices[0] is the genome, choices[1] the track name, choices[2] the
     combine method, choices[3] an optional category and choices[4] an
     optional number of samples (default '1'). Results are composed to
     galaxyFn, or to 'collapsed_result.bedgraph' under the track named by
     the last choice if choices[5] is 'Write to Standardised file'.
     """
     # Imported once up front rather than on every loop iteration.
     from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
     from gold.origdata.BedComposer import CategoryBedComposer

     genome = choices[0]
     trackName = choices[1].split(':')
     outFn = galaxyFn
     if choices[5] == 'Write to Standardised file':
         outFn = createOrigPath(genome, choices[-1].split(':'), 'collapsed_result.bedgraph')
         ensurePathExists(outFn[:outFn.rfind('/')+1])

     combineMethod = choices[2]
     category = choices[3] if choices[3] else ''
     numSamples = choices[4] if choices[4] else '1'

     analysisDef = 'dummy [combineMethod=%s] %s [numSamples=%s] -> ConvertToNonOverlappingCategorySegmentsPythonStat' % \
                     (combineMethod, '[category=%s]' % category if category != '' else '', numSamples) #'Python'

     for regSpec in  GenomeInfo.getChrList(genome):
         res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome, username=username, \
                                         printResults=False, printHtmlWarningMsgs=False)

         # NOTE(review): every result is composed to the same outFn;
         # confirm that composeToFile appends rather than overwrites.
         for resDict in res.values():
             tvGeSource = TrackViewGenomeElementSource(genome, resDict['Result'], trackName)
             CategoryBedComposer(tvGeSource).composeToFile(outFn)
Beispiel #29
0
    def sortChrDict(self):
        """Return the standard chromosomes in sorted order, with cumulative offsets.

        Chromosomes whose name (without a leading 'chr') is an integer come
        first, in numeric order; all others follow, sorted by that name.

        Returns (chrDict, chrLength): two OrderedDicts in sorted chromosome
        order. chrDict maps chromosome name to length; chrLength maps
        chromosome name to the summed length of all preceding chromosomes.
        """
        chrLengths = GenomeInfo.getStdChrLengthDict(self.gsuite.genome)

        def _sortKey(chrName):
            # Sort on the original names, so that names lacking the 'chr'
            # prefix or written like 'chr01' are still found in the dict.
            suffix = chrName[3:] if chrName.startswith('chr') else chrName
            try:
                return (0, int(suffix), '')
            except ValueError:
                return (1, 0, suffix)

        chrDict = OrderedDict()
        chrLength = OrderedDict()
        offset = 0
        for chrName in sorted(chrLengths.keys(), key=_sortKey):
            chrDict[chrName] = chrLengths[chrName]
            chrLength[chrName] = offset
            offset += chrLengths[chrName]

        return chrDict, chrLength
def extractTestGenomeAndPreProcess(galaxy_dir):
    hbPath = os.path.join(galaxy_dir, 'lib', 'hb')
    from config.Config import ORIG_DATA_PATH
    from gold.origdata.PreProcessTracksJob import PreProcessAllTracksJob
    from setup.InstallFunctions import executeShellCmd
    from gold.util.CommonFunctions import createDirPath
    from quick.util.GenomeInfo import GenomeInfo
    from quick.application.ProcTrackOptions import ProcTrackOptions
    from gold.description.TrackInfo import TrackInfo
    import shutil

    testGenomeFn = os.sep.join([hbPath, 'data', 'TestGenome.tar.gz'])
    executeShellCmd('tar xfz %s --keep-newer-files -C %s' % (testGenomeFn, ORIG_DATA_PATH), \
                    pipe=False, printError=True, onError='exit')
    print 'OK: Extracted TestGenome files.'

    PreProcessAllTracksJob.PASS_ON_EXCEPTIONS = True
    try:
        PreProcessAllTracksJob('TestGenome').process()
        PreProcessAllTracksJob(
            'TestGenome', GenomeInfo.getChrTrackName('TestGenome')).process()
        print 'OK: Finished preprocessing TestGenome.'
    except Exception, e:
        print 'FAILED: Error when preprocessing TestGenome. Error:'
        print '        ' + str(e).strip()
        sys.exit(1)
Beispiel #31
0
    def describeUserBinSource(self, regSpec, binSpec):
        '''
        Return a one-sentence English description of the analysis bins.

        regSpec is parsed with parseRegSpec for self._genome; the sentence
        names the single region, the list of whole chromosomes, "all
        chromosomes", or just the number of regions. Unless binSpec is '*',
        the interval size given by binSpec is mentioned as well.
        '''
        from quick.application.UserBinSource import parseRegSpec
        from quick.util.CommonFunctions import strWithStdFormatting, \
            generateStandardizedBpSizeText, parseShortenedSizeSpec
        from quick.util.GenomeInfo import GenomeInfo

        regions = parseRegSpec(regSpec, self._genome)

        if len(regions) == 1:
            onlyRegion = regions[0]
            parts = [' chromosome ', onlyRegion.chr,
                     ' of genome build "', self._genome, '"']
            if not onlyRegion.isWholeChr():
                # Positions are reported 1-based, hence start+1.
                parts += [' from position ',
                          strWithStdFormatting(onlyRegion.start+1),
                          ' to ',
                          strWithStdFormatting(onlyRegion.end)]
            regStr = ''.join(parts)
        else:
            onlyWholeChrs = all(reg.chr is None or reg.isWholeChr()
                                for reg in regions)
            if not onlyWholeChrs:
                regStr = ' %s regions' % len(regions)
            else:
                genomeChrs = set(GenomeInfo.getChrList(self._genome))
                coveredChrs = set(reg.chr for reg in regions)
                if len(regions) == len(genomeChrs) and coveredChrs == genomeChrs:
                    regStr = ' all chromosomes'
                else:
                    regStr = ' chromosomes ' + ', '.join(reg.chr for reg in regions)
            regStr += ' of genome build "%s"' % self._genome

        binText = ''
        if binSpec != '*':
            binSizeText = generateStandardizedBpSizeText(parseShortenedSizeSpec(binSpec))
            binText = ', divided into intervals of size ' + binSizeText + ','

        return 'Using' + regStr + binText + ' as bins'
Beispiel #32
0
 def formatBedLines(cls, genome, lineDict, binSize):
     '''
     Distribute the merged segments of every chromosome over fixed-size bins.

     genome   -- genome whose standard chromosome lengths size the bin lists
     lineDict -- maps chromosome name to a sequence of (start, end) pairs;
                 the merge loop only joins a pair with its predecessor, so
                 the pairs are presumably sorted by start (verify at caller)
     binSize  -- bin width (integer)

     Overlapping or touching pairs are merged, and every merged segment is
     handed to cls.putBpsInResultDict together with the result structures.

     Returns (resDict, microDict, maxVal): resDict maps each standard
     chromosome to a list with one float per bin, microDict is the
     defaultdict passed to putBpsInResultDict, and maxVal is the largest
     value found in resDict (0.0 if there are no bins at all).
     '''
     chrLength = GenomeInfo.getStdChrLengthDict(genome)
     # Floor division plus one extra bin for a trailing partial bin. '//'
     # keeps the counts integral regardless of the division mode in effect.
     numElems = dict((chrom, length // binSize + (1 if length % binSize > 0 else 0))
                     for chrom, length in chrLength.items())
     resDict = dict((chrom, [0.0] * numBins) for chrom, numBins in numElems.items())
     microDict = defaultdict(dict)
     microBin = binSize // 100
     fullMicroBin = [microBin] * 100

     for chrom, vals in lineDict.items():
         try:
             prevStart, prevEnd = vals[0]
             for start, end in vals[1:]:
                 if prevEnd >= start:
                     # Overlaps or touches the current segment: extend it.
                     if end > prevEnd:
                         prevEnd = end
                     continue
                 cls.putBpsInResultDict(resDict, chrom, prevStart, prevEnd, binSize,
                                        microDict, microBin, fullMicroBin)
                 prevStart, prevEnd = start, end
             # Flush the last merged segment.
             cls.putBpsInResultDict(resDict, chrom, prevStart, prevEnd, binSize,
                                    microDict, microBin, fullMicroBin)
         except Exception:
             # Best effort: a chromosome that cannot be binned (an empty
             # list of pairs, presumably also one missing from resDict) is
             # skipped. Unlike a bare except, this lets KeyboardInterrupt
             # and SystemExit through.
             continue

     # Guard against an empty genome or zero-length chromosomes, for which
     # max() of an empty sequence would raise ValueError.
     binMaxima = [max(bins) for bins in resDict.values() if bins]
     maxVal = max(binMaxima) if binMaxima else 0.0
     return resDict, microDict, maxVal
Beispiel #33
0
 def execute(cls, choices, galaxyFn=None, username=''):
     '''
     Is called when execute-button is pushed by web-user. Should print
     output as HTML to standard out, which will be directed to a results page
     in Galaxy history. If getOutputFormat is anything else than HTML, the
     output should be written to the file with path galaxyFn. If needed,
     StaticFile can be used to get a path where additional files can be put
     (e.g. generated image files). choices is a list of selections made by
     web-user in each options box.

     Here, choices[0] is a history element for which an info object was
     pickled earlier. Element 0 of that object is used as genome and element
     3 as the Galaxy track name of a GTrack file. One region is built per
     element of that file, and the regions are passed to
     TrackExtractor.extract together with the sequence track name, galaxyFn
     and the format 'fasta'.

     Returns None, also when the pickled info object cannot be loaded.
     '''
     try:
         historyInputTN = choices[0].split(':') #from history
         historyGalaxyFn = ExternalTrackManager.extractFnFromGalaxyTN(historyInputTN) #same as galaxyFn in execute of create benchmark..
         # NOTE(review): this unpickles a file located through a
         # user-selected history element; confirm that the file is always
         # generated by the tool itself and never user-supplied.
         randomStatic = RunSpecificPickleFile(historyGalaxyFn) #finds path to static file created for a previous history element, and directs to a pickle file
         myInfo = randomStatic.loadPickledObject()
     except Exception:
         # Deliberate best effort: without the stored info there is nothing
         # to extract. KeyboardInterrupt and SystemExit are not swallowed.
         return None

     genome = myInfo[0]
     galaxyTN = myInfo[3].split(':')
     myFileName = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

     gtrackSource = GtrackGenomeElementSource(myFileName, genome)
     regionList = [GenomeRegion(obj.genome, obj.chr, obj.start, obj.end)
                   for obj in gtrackSource]

     extractor = TrackExtractor()
     extractor.extract(GenomeInfo.getSequenceTrackName(genome), regionList, galaxyFn, 'fasta')
 def getOptionsBoxChr(cls, prevChoices):
     '''
     Return the content of the chromosome option box.

     A single reference SNP in 'Single' run mode gives a one-element list
     holding the second field of that SNP entry. Several SNPs, or 'Batch'
     run mode, give None. Otherwise the chromosome list of the selected
     genome is returned.
     '''
     snps = cls.get_ref_snp(prevChoices)
     snpCount = len(snps)

     if snpCount == 1 and prevChoices.run == 'Single':
         return [snps[0][1]]

     if snpCount > 1 or prevChoices.run == 'Batch':
         return None

     return GenomeInfo.getChrList(prevChoices.genome)
 def isValidTrack(genome, trackName, fullAccess=False):
     '''
     Tell whether trackName is a usable track of genome.

     The track info must report itself valid for the given access level, and
     the track directory must contain at least one entry that is either a
     valid chromosome name or a bounding region file name.
     '''
     trackInfo = TrackInfo(genome, trackName)
     if trackInfo.isValid(fullAccess):
         return any(GenomeInfo.isValidChr(genome, entry) or isBoundingRegionFileName(entry)
                    for entry in ProcTrackOptions._getDirContents(genome, trackName))
     return False
 def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
     '''
     Assert the number of track elements per chromosome.

     chrElCountDict maps chromosome name to the expected element count. The
     elements are counted in the region given by customBins for that
     chromosome, or in the whole chromosome when customBins has no entry.
     '''
     for chrom, expectedCount in chrElCountDict.items():
         if chrom not in customBins:
             region = GenomeRegion(self.GENOME, chrom, 0, GenomeInfo.getChrLen(self.GENOME, chrom))
         else:
             region = customBins[chrom]
         trackView = self._getTrackView(trackName, region, allowOverlaps)
         self.assertEquals(expectedCount, sum(1 for _ in trackView))
Beispiel #37
0
 def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                        func, username, chrList):
     '''
     Run a PreProcessCustomTrackJob that creates outTrackName.

     One whole-chromosome region is built for every name in chrList. The job
     gets cls._getGeSourceForRegion as its source callback, and username,
     inTrackName, windowSize and func as keyword arguments.
     '''
     wholeChrRegions = []
     for chrom in chrList:
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         wholeChrRegions.append(GenomeRegion(genome, chrom, 0, chromLen))

     job = PreProcessCustomTrackJob(genome, outTrackName, wholeChrRegions,
                                    cls._getGeSourceForRegion,
                                    username=username, inTrackName=inTrackName,
                                    windowSize=windowSize, func=func)
     job.process()
    def _checkValidEnd(self, chr, end, start=None):
        '''
        Validate an end position and return it unchanged.

        Raises InvalidFormatError if end is negative, if end-1 exceeds the
        length of chr (only checked when self.genome is set and chr is a
        valid chromosome of it), or if start is given and end is not larger
        than start. The combination start == end == 1 is accepted.
        '''
        if end < 0:
            raise InvalidFormatError('Error: end position is negative: %s' % end)

        if self.genome and GenomeInfo.isValidChr(self.genome, chr):
            chrLen = GenomeInfo.getChrLen(self.genome, chr)
            lastPos = end-1
            if lastPos > chrLen:
                raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \
                                         (chr, lastPos, chrLen))

        if start is None or not end <= start:
            return end

        if start == end == 1:
            return end

        raise InvalidFormatError(
            'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d'
            % (end, start))
 def getGlobalSource(globalSourceStr, genome, minimal):
     '''
     Return the bin source selected by globalSourceStr.

     minimal == True takes precedence and gives a MinimalBinSource.
     Otherwise 'test', 'chrs', 'chrarms', 'ensembl' and 'userbins' select a
     fixed TestGenome region, the chromosomes, the chromosome arms, the
     standard gene regions and StatJob.USER_BIN_SOURCE, respectively. Any
     other value raises ShouldNotOccurError.
     '''
     if minimal == True:
         return MinimalBinSource(genome)

     if globalSourceStr == 'test':
         return UserBinSource('TestGenome:chr21:10000000-15000000','1000000')

     if globalSourceStr == 'chrs':
         return GenomeInfo.getChrRegs(genome)

     if globalSourceStr == 'chrarms':
         return GenomeInfo.getChrArmRegs(genome)

     if globalSourceStr == 'ensembl':
         return GenomeInfo.getStdGeneRegs(genome)

     if globalSourceStr == 'userbins':
         from gold.application.StatRunner import StatJob
         userBins = StatJob.USER_BIN_SOURCE
         assert userBins is not None
         return userBins

     raise ShouldNotOccurError('globalSource not recognized')
    def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        galaxyId = extractIdFromGalaxyFn(galaxyFn)
        uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))

        assert genome == 'hg18'
        
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]
        
        
        #Get gene track
        assert geneSource == 'Ensembl'
        targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )
        
        assert upFlankSize == downFlankSize == 0 #Should instead extend regions to include flanks
        
        tcGeneRegsTempFn = uniqueWebPath + os.sep + 'tcGeneRegs.targetcontrol.bedgraph'
        #Think this will be okay, subtraction not necessary as targets are put first:
        controlGeneRegsTempFn = geneRegsFn
        #print targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn
        GalaxyInterface.combineToTargetControl(targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn)
        
        #tcGeneRegsExternalTN = ['external'] +galaxyId +  [tcGeneRegsTempFn]
        tcGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc')
        
        #tcGeneRegsExternalTN = ['external'] +targetGalaxyId +  [tcGeneRegsTempFn]
        #tcGeneRegsExternalTN = ['galaxy', externalId, tcGeneRegsTempFn]
        
        targetGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '1')
        controlGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '0')
        
        #pre-process
        print 'Pre-processing file: %s, with trackname: %s ' % (tcGeneRegsTempFn, tcGeneRegsExternalTN)
        ExternalTrackManager.preProcess(tcGeneRegsTempFn, tcGeneRegsExternalTN, 'targetcontrol.bedgraph',genome)
        print 'Pre-processing TN: ', targetGeneRegsExternalTN
        ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed',genome)
        print 'Pre-processing TN: ', controlGeneRegsExternalTN
        ExternalTrackManager.preProcess(controlGeneRegsTempFn, controlGeneRegsExternalTN, 'bed',genome)
        
        #print tcGeneRegsExternalTN
        trackName1, trackName2 = tfTrackName, tcGeneRegsExternalTN
        
        analysisDef = 'Categories differentially located in targets?: Which categories of track1-points fall more inside case than control track2-segments? [rawStatistic:=PointCountInsideSegsStat:]' +\
                  '[tf1:=SegmentToStartPointFormatConverter:] [tf2:=TrivialFormatConverter:]' +\
                  '-> DivergentRowsInCategoryMatrixStat'
        regSpec, binSpec = '*','*'
        
        #print 'skipping preproc!!'
        #ExternalTrackManager.preProcess(tcGeneRegsExternalTN[-1], tcGeneRegsExternalTN, 'targetcontrol.bedgraph', genome)
        #ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed', genome)
        
        GalaxyInterface.runManual([trackName1, trackName2], analysisDef, regSpec, binSpec, genome, printResults=True, printHtmlWarningMsgs=False)
    def getAllBoundingRegions(self):
        '''
        Yield the bounding regions of all chromosomes, following the order of
        the extended chromosome list of the genome.

        Raises BoundingRegionsNotAvailableError (when iteration starts) if
        self.fileExists() is false.
        '''
        if self.fileExists():
            for chrom in GenomeInfo.getExtendedChrList(self._genome):
                for boundingRegion in self.getAllBoundingRegionsForChr(chrom):
                    yield boundingRegion
        else:
            from gold.util.CommonFunctions import prettyPrintTrackName
            trackDescr = prettyPrintTrackName(self._trackName)
            raise BoundingRegionsNotAvailableError('Bounding regions not available for track: ' + trackDescr)
 def getGenomicElements(genome, trackName):
     '''
     Collect all elements of a track as [chromosome, start, end] lists.

     The track is read one whole chromosome at a time, in the order of
     GenomeInfo.getChrList(genome). Returns a list of three-element lists.
     '''
     track = PlainTrack(trackName)
     genElements = []
     for chrom in GenomeInfo.getChrList(genome):
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         region = GenomeRegion(genome, chrom, 0, chromLen)
         for el in track.getTrackView(region):
             # append() instead of list concatenation: concatenation copies
             # the whole result for every element (quadratic in total).
             genElements.append([chrom, el.start(), el.end()])

     return genElements
 
     #print numpy.version.version # 1.7.1 !!
     #unique, counts = numpy.unique(segmentSize, return_counts=True) # This is for numpy 1.9
     #print numpy.asarray((unique, counts)).T
     
     '''track.setFormatConverter('SegmentToMidPointFormatConverter')
Beispiel #43
0
 def getNmerAndCleanedNmerTrackName(genome, trackName):
     '''
     Return (nmer, cleanedTrackName) for an nmer track name.

     The nmer is the last track name element in lower case. In the returned
     copy of the track name the last element is lower-cased, and if the name
     is exactly one element longer than the nmer base track name, an
     '<length>-mers' element is inserted before the nmer.
     '''
     from quick.util.GenomeInfo import GenomeInfo
     from copy import copy

     cleanedTn = copy(trackName)
     nmer = cleanedTn[-1].lower()
     cleanedTn[-1] = nmer

     baseLen = len(GenomeInfo.getNmerTrackName(genome))
     if len(cleanedTn) == baseLen + 1:
         lengthLevel = '%s-mers' % len(nmer)
         cleanedTn = cleanedTn[0:-1] + [lengthLevel] + cleanedTn[-1:]

     return nmer, cleanedTn
Beispiel #44
0
    def isCompBin(region):
        '''
        Tell whether region coincides with a single computation bin.

        An iterable is never a comp bin. Otherwise the region must start at
        offset 0 of its bin, and its length must equal the comp bin size or,
        if smaller, the distance from its start to the chromosome end.
        '''
        if isIter(region):
            return False

        binNumber = CompBinManager.getBinNumber(region.start)
        startsAtBinStart = CompBinManager.getOffset(region.start, binNumber) == 0

        regionLen = len(region)
        compBinSize = CompBinManager.getCompBinSize()
        lenToChrEnd = GenomeInfo.getChrLen(region.genome, region.chr) - region.start
        hasBinLength = regionLen == min(compBinSize, lenToChrEnd)

        return startsAtBinStart and hasBinLength
    def generateGenomeAnnotations(cls, abbrv):
        '''
        Standardize the collected GFF annotation file of genome abbrv.

        Does nothing if the collected GFF file does not exist. Otherwise the
        file is split into sub directories by SplitFileToSubDirs, and then
        copied to the standardized GFF path.
        '''
        collectedGffFn = cls.getCollectedPathGFF(abbrv)
        if not os.path.exists(collectedGffFn):
            return

        from quick.extra.StandardizeTrackFiles import SplitFileToSubDirs
        annotationTrackName = GenomeInfo.getGenomeAnnotationsTrackName(abbrv)
        SplitFileToSubDirs.parseFiles(abbrv, annotationTrackName,
                                      direction='coll_to_std', suffix='.gff',
                                      catSuffix='.category.gff', subTypeCol='2',
                                      depth='1', numHeaderLines='0')

        standardizedGffFn = cls.getStandardizedPathGFF(abbrv)
        ensurePathExists(standardizedGffFn)
        shutil.copyfile(collectedGffFn, standardizedGffFn)
 def retrieveTrack(self, regionTrackName, fastaFileName):
     '''
     Extract the genome sequence for all regions of a GTrack history file.

     regionTrackName is a colon-separated Galaxy track name. One region is
     created per element of the file, and the regions are passed to
     self._extractor.extract with the sequence track name of self._genome,
     fastaFileName and the format 'fasta'. Returns the result of extract.
     '''
     galaxyTn = regionTrackName.split(':')
     regionFn = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTn)

     regions = [GenomeRegion(ge.genome, ge.chr, ge.start, ge.end)
                for ge in GtrackGenomeElementSource(regionFn, self._genome)]

     sequenceTn = GenomeInfo.getSequenceTrackName(self._genome)
     return self._extractor.extract(sequenceTn, regions, fastaFileName, 'fasta')
 def execute(choices, galaxyFn=None, username=''):
     '''Is called when execute-button is pushed by web-user.
     Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
     If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
     choices is a list of selections made by web-user in each options box.

     Here: choices holds genome, nmer and region specification. The
     occurrences of the (lower-cased) nmer are extracted from its nmer track
     to galaxyFn in the 'point bed' format.
     '''
     genome, nmer, regSpec = choices[0], choices[1].lower(), choices[2]

     nmerBaseTn = GenomeInfo.getPropertyTrackName(genome, 'nmer')
     trackName = nmerBaseTn + ['%s-mers' % len(nmer), nmer]

     assert galaxyFn is not None
     # '*' as bin specification: the regions are not subdivided.
     GalaxyInterface.extractTrackManyBins(genome, trackName, regSpec, '*', True, 'point bed', False, False, galaxyFn)
    def getSubtypes(genome, trackName, fullAccess=False):
        '''
        Return the sub track names directly below trackName, sorted
        case-insensitively.

        Directory entries are skipped if they start with '.' or '_', are
        plain files, are valid chromosome names, or are named 'external' or
        'ucsc'. Without fullAccess, and unless the track is a literature
        track, sub tracks marked private are left out as well.
        '''
        dirPath = createDirPath(trackName, genome)

        subtypes = []
        for entry in ProcTrackOptions._getDirContents(genome, trackName):
            if entry[0] in ['.','_']:
                continue
            if os.path.isfile(dirPath + os.sep + entry):
                continue
            if GenomeInfo.isValidChr(genome, entry):
                continue
            #fixme, just temporarily:, these dirs should start with _
            if entry in ['external','ucsc']:
                continue
            subtypes.append(entry)

        if not (fullAccess or ProcTrackOptions._isLiteratureTrack(genome, trackName)):
            publicSubtypes = []
            for subtype in subtypes:
                if not TrackInfo(genome, trackName+[subtype]).private:
                    publicSubtypes.append(subtype)
            subtypes = publicSubtypes

        return sorted(subtypes, key=str.lower)
 def execute(cls, choices, galaxyFn=None, username=''):
     '''
     Write the elements of a track to galaxyFn through
     cls.WriteExpandedElementsToFile.

     choices[0] is the genome and choices[1] the source type. For 'History',
     choices[2] is a colon-separated Galaxy track name whose second field is
     the file suffix; the history file is copied to a temporary collected
     path, read through a sorted BED or GTrack element source, and the
     temporary file is removed afterwards. For any other source type,
     choices[3] is a colon-separated track name that is read chromosome by
     chromosome, with the header flag set for the first chromosome only.

     The output file is closed and the temporary file removed also when an
     error occurs.
     '''
     genome = choices[0]
     trackItem = choices[3]

     with open(galaxyFn, 'w') as outputFile:
         chrList = GenomeInfo.getChrList(genome)
         chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom)) for chrom in chrList)
         geSource = headLinesStr = None

         if choices[1] == 'History':
             historyTn = choices[2].split(':')
             trackType = historyTn[1]
             # Random six-letter prefix plus a timestamp gives the temporary
             # file a name that is unlikely to clash. A separate local is
             # used so that the username parameter is not overwritten.
             randomPrefix = ''.join([chr(random.randint(97,122)) for i in range(6)])
             timeStamp = '_'.join([str(v) for v in time.localtime()[:6]])
             tempFn = createCollectedPath(genome, [], randomPrefix + timeStamp + '.' + trackType)
             fnSource = ExternalTrackManager.extractFnFromGalaxyTN(historyTn)

             try:
                 with open(fnSource, 'r') as sourceFile:
                     contents = sourceFile.read()
                 with open(tempFn, 'w') as tempFile:
                     tempFile.write(contents)

                 if trackType in ['marked.bed', 'category.bed', 'bed']:
                     geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                 elif trackType == 'gtrack':
                     geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                     headLinesStr = geSource.getHeaderLines().replace('##','\n##')

                 # NOTE(review): for any other suffix geSource stays None
                 # and is passed on as such; confirm that the writer copes.
                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
             finally:
                 if os.path.exists(tempFn):
                     os.remove(tempFn)

         else:
             plTrack = PlainTrack(trackItem.split(':'))
             writeHeaderFlag = True
             for chrom in chrList:
                 gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                 geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                 writeHeaderFlag = False
Beispiel #50
0
    def execute(cls, choices, galaxyFn=None, username=''):
        '''
        Convert a track to non-overlapping category segments and write them
        as category BED.

        choices[0] is the genome and choices[1] the colon-separated track
        name. The analysis '-> ConvertToNonOverlappingCategorySegmentsPythonStat'
        is run once per chromosome, and every 'Result' track view is
        composed to galaxyFn with CategoryBedComposer.
        '''
        from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
        from gold.origdata.BedComposer import CategoryBedComposer

        genome = choices[0]
        trackName = choices[1].split(':')
        # Results go to the Galaxy output file. Without this binding the
        # composeToFile call below fails with a NameError.
        outFn = galaxyFn
        analysisDef = '-> ConvertToNonOverlappingCategorySegmentsPythonStat' #'Python'

        for regSpec in GenomeInfo.getChrList(genome):
            res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome, username=username, \
                                            printResults=False, printHtmlWarningMsgs=False)

            for resDict in res.values():
                tvGeSource = TrackViewGenomeElementSource(genome, resDict['Result'], trackName)
                # NOTE(review): composeToFile is called with the same path
                # for every chromosome; confirm that it appends rather than
                # overwrites.
                CategoryBedComposer(tvGeSource).composeToFile(outFn)
    def __iter__(self):
        '''
        Yield w1*value1 + w2*value2 for every position of chromosome
        self.chr, where value1 and value2 are the values of the tracks
        trackName1 and trackName2 at that position.
        '''
        genome, chrom = self.genome, self.chr
        wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))

        firstVals = PlainTrack(self.trackName1).getTrackView(wholeChr).valsAsNumpyArray()
        secondVals = PlainTrack(self.trackName2).getTrackView(wholeChr).valsAsNumpyArray()

        weight1, weight2 = self.w1, self.w2
        for pos in xrange(len(firstVals)):
            yield weight1*firstVals[pos] + weight2*secondVals[pos]
    def __iter__(self):
        '''
        Yield [pos, pos+1] for randomly selected positions of chromosome
        self.chr.

        The values of track self.trackName1 are min-max scaled to [0, 1]. A
        position is selected when a uniform random draw from R is smaller
        than self.factor times its scaled value. Nothing is yielded for an
        empty track or for a track with a single constant value, as no
        scaling is defined in those cases.
        '''
        from gold.application.RSetup import r
        chr = self.chr
        trackName1, genome = self.trackName1, self.genome
        factor = self.factor
        region = GenomeRegion(genome, chr, 0, GenomeInfo.getChrLen(genome, chr) )

        track1 = PlainTrack(trackName1)
        tv1 = track1.getTrackView(region)
        vals1 = tv1.valsAsNumpyArray()

        if len(vals1) == 0:
            return

        #scale between 0 and 1..:
        minVal, maxVal = vals1.min(), vals1.max()
        valRange = maxVal - minVal
        if valRange == 0:
            # Constant track: the scale factor would be a division by zero.
            return
        # 1.0 forces true division; with integer values, 1/valRange would
        # truncate to 0 under Python 2 and zero out every scaled value.
        vals1 = (vals1 - minVal) * (1.0/valRange)
        for pos in xrange(len(vals1)):
            if r.runif(1) < factor*vals1[pos]:
                yield [pos,pos+1]
    def yielder(self, curTn):
        '''
        Recursively yield the valid track names at and below curTn.

        Sub tracks are visited depth-first, and curTn itself is yielded after
        its descendants if it is a valid track. Subtypes starting with '.' or
        '_' are skipped. Nothing is yielded for the literature track when
        self._avoidLiterature is set.
        '''
        if self._avoidLiterature and curTn == GenomeInfo.getPropertyTrackName(self._genome, 'literature'):
            return

        for subtype in ProcTrackOptions.getSubtypes(self._genome, curTn, self._fullAccess):
            if subtype[0] not in ['.','_']:
                for descendantTn in self.yielder(curTn + [subtype]):
                    yield descendantTn

        if ProcTrackOptions.isValidTrack(self._genome, curTn, self._fullAccess):
            yield curTn
    def createBoundingRegionShelve(genome, trackName, allowOverlaps):
        '''
        Store the bounding regions of a track in a BoundingRegionShelve.

        The bounding region tuples collected for the track are used, leaving
        out those without a chromosome. If none remain, one whole-chromosome
        bounding region is created per preprocessed chromosome. After
        storing, the element count of the shelve is compared with the
        collected element count for every chromosome; a mismatch raises
        ShouldNotOccurError.
        '''
        collector = TrackInfoDataCollector(genome, trackName)
        geChrList = collector.getPreProcessedChrs(allowOverlaps)

        brTuples = []
        for brTuple in collector.getBoundingRegionTuples(allowOverlaps):
            if brTuple.region.chr is not None:
                brTuples.append(brTuple)

        if not brTuples:
            # No explicit bounding regions: fall back to whole chromosomes.
            for chrom in geChrList:
                wholeChr = GenomeRegion(chr=chrom, start=0, end=GenomeInfo.getChrLen(genome, chrom))
                brTuples.append(BoundingRegionTuple(wholeChr, collector.getNumElements(chrom, allowOverlaps)))

        brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
        isSparse = not collector.getTrackFormat().reprIsDense()
        brShelve.storeBoundingRegions(brTuples, geChrList, isSparse)

        boundingRegionChrs = set(brTuple.region.chr for brTuple in brTuples)
        for chrom in boundingRegionChrs | set(geChrList):
            shelveCount = brShelve.getTotalElementCount(chrom)
            collectedCount = collector.getNumElements(chrom, allowOverlaps)
            if shelveCount != collectedCount:
                raise ShouldNotOccurError("Error: The total element count for all bounding regions of chromosome '%s' is not equal to the number of genome elements of that chromosome. %s != %s" % \
                                          (chrom, shelveCount, collectedCount) )
 def validateAndReturnErrors(choices):
     '''
     Should validate the selected input parameters. If the parameters are not valid,
     an error text explaining the problem should be returned. The GUI then shows this text
     to the user (if not empty) and greys out the execute button (also if the text is empty).
     If all parameters are valid, the method should return None, which enables the execute button.
     '''
     genome, tn, tf = ExtractIntersectingGenesTool._getBasicTrackFormat(choices)
     geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)

     if not ExtractIntersectingGenesTool._isValidTrack(choices):
         # Empty text: the execute button is greyed out, but no error message
         # is shown for an invalid track selection.
         return ""

     if tf.split()[-1] not in ['points', 'segments']:
         return "The track format of the selected track must be either points or segments. Currently: %s" % tf

     if not ProcTrackOptions.isValidTrack(genome, geneRegsTrackName, True):
         return "The track used for gene ids (%s) is not valid. This is an internal error." % ':'.join(geneRegsTrackName)

     return None
    def execute(cls, choices, galaxyFn=None, username=''):
        '''Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).
        choices is a list of selections made by web-user in each options box.
        '''
        genome = choices[0]
        nmer = choices[1].lower()
        regSpec = choices[2]
        analysisRegions = parseRegSpec(regSpec, genome)
        
        binSize = cls._calcBinSize(nmer, analysisRegions)
        binSpec = '*' if binSize is None else str( binSize ) 
        numBins = len( AutoBinner(analysisRegions, binSize) )
        
        from quick.application.GalaxyInterface import GalaxyInterface
        from quick.util.GenomeInfo import GenomeInfo
        trackName1 = GenomeInfo.getPropertyTrackName(genome, 'nmer') + [str(len(nmer))+'-mers',nmer]
        trackName2 = ['']
        analysisDef = 'Counts: The number of track1-points -> CountPointStat'
        #regSpec = '*'
        #print 'Using binSize: ',binSpec
        #print 'TN1: ',trackName1
        from gold.result.HtmlCore import HtmlCore
        print str(HtmlCore().styleInfoBegin(styleClass='debug'))
        GalaxyInterface.run(trackName1, trackName2, analysisDef, regSpec, binSpec, genome, galaxyFn)
        print str(HtmlCore().styleInfoEnd())

        plotFileNamer = GalaxyRunSpecificFile(['0','CountPointStat_Result_gwplot.pdf'], galaxyFn)
        textualDataFileNamer = GalaxyRunSpecificFile(['0','CountPointStat_Result.bedgraph'], galaxyFn)
        
        core = HtmlCore()
        core.paragraph('Inspect nmer frequency variation as a %s or as underlying %s.</p>' % ( plotFileNamer.getLink('plot'), textualDataFileNamer.getLink('textual data') ))
        core.divider()
        core.paragraph('The occurrence frequency of your specified nmer ("%s") has been computed along the genome, within your specified analysis region ("%s").' % (nmer, regSpec))
        core.paragraph('The analysis region was divided into %i bins, based on calculations trying to find appropriate bin size (get enough data per bin and restrict maximum number of bins).' % numBins)
        
        trackName1modified = trackName1[0:-2] + trackName1[-1:]
        preSelectedAnalysisUrl = createHyperBrowserURL(genome, trackName1modified,[''], analysis='Counts',method='auto',region=regSpec, binsize=binSpec)
        core.divider()
        core.paragraph('If you do not find the inferred bin size to be appropriate, you can set this manually in a ' + str(HtmlCore().link('new analysis', preSelectedAnalysisUrl)) + '.')
        print str(core)
 def _createNmerTrack(self, nmerList, lowerOrder=None):
     nmerLengths = list(set([len(nmer) for nmer in nmerList]))
     assert len(nmerLengths)==1
     
     chainOrder = lowerOrder if lowerOrder is not None else nmerLengths[0]
     
     regionList = [GenomeRegion(self._genome, chr, 0, GenomeInfo.getChrLen(self._genome, chr) ) for chr in GenomeInfo.getChrList(self._genome)]
     
     for region in regionList:
         print '|',
         
         chains = SameValueIndexChainsFactory.load(self._createPath(chainOrder), region.chr)
         
         for nmer in nmerList:
             if len(nmerList) > 1:
                 print '.',
             
             if lowerOrder is not None:
                 nmerPrefix = nmer[0:chainOrder]
                 rawIndexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmerPrefix))             
                 indexGenerator = LowerOrderChainWrapper(rawIndexGenerator, nmerPrefix, nmer, self._genome, region.chr)
             else:
                 indexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmer)) 
     
             #print 'Length of lower order chain: %i and %i' % (sum(1 for x in indexGenerator), sum(1 for x in indexGenerator))
             #print 'Length of wrapped chain: %i and %i' % (sum(1 for x in wrappedIndexGenerator), sum(1 for x in wrappedIndexGenerator))            
             
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [region], \
                                      self._getNmerGeSourceForChr, finalize=False, preProcess=True, \
                                      indexGenerator=indexGenerator).process()
                 
     for nmer in nmerList:
         try:
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), regionList, \
                                      self._getNmerGeSourceForChr, preProcess=False, finalize=True, \
                                      indexGenerator=[0]).process()
         except EmptyGESourceError:
             PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [GenomeRegion(self._genome, regionList[0].chr, -1, 0)], \
                                      self._getNmerGeSourceForChr, preProcess=True, finalize=True, \
                                      indexGenerator=[-1]).process()
     
     return
Beispiel #58
0
    def nextBin(self):
        '''
        Yield the bins obtained from the regions of self._userBinSource.

        A missing region start counts as 0. The region end is the smaller of
        the region's own end and the chromosome length, where the length is
        only looked up if the region has a genome. With self._binLen of None
        each region gives one bin; otherwise it is cut into consecutive bins
        of self._binLen, the last one possibly shorter.
        '''
        for userBin in self._userBinSource:
            binStart = 0 if userBin.start is None else userBin.start

            chrLen = None
            if userBin.genome is not None:
                chrLen = GenomeInfo.getChrLen(userBin.genome, userBin.chr)

            endCandidates = [pos for pos in (userBin.end, chrLen) if pos is not None]
            regEnd = min(endCandidates)

            if self._binLen is None:
                yield GenomeRegion(userBin.genome, userBin.chr, binStart, regEnd)
                continue

            while binStart < regEnd:
                binEnd = min(binStart + self._binLen, regEnd)
                yield GenomeRegion(userBin.genome, userBin.chr, binStart, binEnd)
                binStart += self._binLen
 def execute(cls, choices, galaxyFn=None, username=''):
     '''
     Print results for a track within the bins of a history file.

     choices[0] is the genome. choices[2] names the track: a Galaxy track
     name that is resolved to its preprocessed track if choices[1] is
     'History', otherwise a plain colon-separated track name. choices[3] is
     the Galaxy track name of the binning file, which must be a BED variant
     or a GTrack file; anything else raises InvalidFormatError. The sorted
     bin elements are passed to cls.PrintResultToHistItem.
     '''
     from quick.application.ExternalTrackManager import ExternalTrackManager

     genome = choices[0]
     trackTn = choices[2].split(':')
     if choices[1] == 'History':
         preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, trackTn)
     else:
         preProcTN1 = trackTn
     # NOTE(review): chrSizeDict is not used further in this method.
     chrSizeDict = dict([ ( chrom, GenomeInfo.getChrLen(genome, chrom)) for chrom in GenomeInfo.getChrList(genome)])

     binningTn = choices[3].split(':')
     trackType = binningTn[1]
     fnSource = ExternalTrackManager.extractFnFromGalaxyTN(binningTn)

     if trackType == 'gtrack':
         binSourceCls = GtrackGenomeElementSource
     elif trackType in ['marked.bed', 'category.bed', 'bed']:
         binSourceCls = BedGenomeElementSource
     else:
         raise InvalidFormatError('The Binning must be of the following formats: gtrack, marked.bed, category.bed ,bed ...')

     geSource = GenomeElementSorter(binSourceCls(fnSource, genome=genome)).__iter__()

     cls.PrintResultToHistItem( galaxyFn, geSource, preProcTN1, genome, username)
 def createGenome(cls, genome, fullName, chromNamesDict, standardChromosomes, username=''):        
     '''
     Build the basic tracks of a new genome, printing progress messages.

     The genome sequence is split into chromosome files (chromNamesDict is
     passed on to SplitFasta.parseFiles), all tracks are preprocessed, the
     chromosome file (from the comma-joined standardChromosomes) and the
     assembly gaps file are created, and all tracks are preprocessed again.

     fullName and username are not used by the active code below; fullName
     only appears in the commented-out name file code.
     '''
     # NOTE(review): basePath is not used further in the visible code.
     basePath = cls.getBasePath(genome)
     trackName=GenomeInfo.getSequenceTrackName(genome)
     print("Splitting genome file into chromosomes.")
     SplitFasta.parseFiles(genome, trackName, chromNamesDict=chromNamesDict)
     print("Processing genome")
     PreProcessAllTracksJob(genome).process()
     
     #print "Writing name file.:", fullName  
     #nameFn=createOrigPath(genome,[], "_name.txt" if experimental else "#name.txt")
     #ensurePathExists(nameFn)
     #f=open(nameFn, "w")
     #f.write(fullName)
     #f.close()
     
     print("Creating chromosome file")
     createChromosomeFile(genome, ",".join(standardChromosomes))
     print("Creating assembly gaps file")
     createAssemblyGapsFile(genome)
     # Second preprocessing pass, run after the chromosome and assembly gaps
     # files have been created.
     print("Processing genome")
     PreProcessAllTracksJob(genome).process()
     print(genome + " genome added")