def __iter__(self):
    """Yield the bounding regions to use for the two tracks, one chromosome at a time.

    For every chromosome in the extended chromosome list of ``self.genome``:
    - if track 1 has no bounding region shelve, the whole chromosome is yielded;
    - if track 2 has no shelve, or only track 2 consists solely of whole
      chromosomes, the bounding regions of track 1 are yielded;
    - if only track 1 consists solely of whole chromosomes, the bounding
      regions of track 2 are yielded;
    - otherwise the result of ``getAllIntersectingRegions`` is yielded.
    """
    shelve1 = self._getBoundingRegionShelve(self._trackName1)
    shelve2 = self._getBoundingRegionShelve(self._trackName2)

    wholeChrsOnly1 = False
    if shelve1 is not None:
        wholeChrsOnly1 = self._commonAllBoundingRegionsAreWholeChr(shelve1)

    wholeChrsOnly2 = False
    if shelve2 is not None:
        wholeChrsOnly2 = self._commonAllBoundingRegionsAreWholeChr(shelve2)

    # Both decisions are independent of the chromosome being visited.
    track1Decides = shelve2 is None or (wholeChrsOnly2 and not wholeChrsOnly1)
    track2Decides = wholeChrsOnly1 and not wholeChrsOnly2

    for chrom in GenomeInfo.getExtendedChrList(self.genome):
        if shelve1 is None:
            yield GenomeRegion(self.genome, chrom, 0,
                               GenomeInfo.getChrLen(self.genome, chrom))
            continue

        regions1 = shelve1.getAllBoundingRegionsForChr(chrom)
        if track1Decides:
            selected = regions1
        else:
            regions2 = shelve2.getAllBoundingRegionsForChr(chrom)
            if track2Decides:
                selected = regions2
            else:
                selected = self.getAllIntersectingRegions(
                    self.genome, chrom, regions1, regions2)

        for reg in selected:
            yield reg
def execute(choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' print 'Executing...' tempinfofile=ExternalTrackManager.extractFnFromGalaxyTN(choices[0].split(":")) abbrv=GenomeImporter.getGenomeAbbrv(tempinfofile) gi = GenomeInfo(abbrv) chrNamesInFasta=gi.sourceChrNames chromNamesDict={} chrDict = InstallGenomeTool._getRenamedChrDictWithSelection(choices) for i, key in enumerate(chrDict.keys()): if chrDict[key]: chromNamesDict[chrNamesInFasta[i]]=key print 'All chromosomes chosen: ' + str(chromNamesDict) stdChrDict = InstallGenomeTool._getRenamedChrDictWithSelection(choices, stdChrs=True) stdChrs = [x for x in stdChrDict if stdChrDict[x]] print 'Standard chromosomes chosen: ' + ", ".join(stdChrs) GenomeImporter.createGenome(abbrv, gi.fullName, chromNamesDict, stdChrs, username=username) gi.installedBy = username gi.timeOfInstallation = datetime.now() gi.store()
def __iter__(self):
    """Yield the bounding regions to use for the two tracks, one chromosome at a time.

    For every chromosome in the extended chromosome list of ``self.genome``:
    - if track 1 has no bounding region shelve, the whole chromosome is yielded;
    - if track 2 has no shelve, or only track 2 consists solely of whole
      chromosomes, the bounding regions of track 1 are yielded;
    - if only track 1 consists solely of whole chromosomes, the bounding
      regions of track 2 are yielded;
    - otherwise the result of ``getAllIntersectingRegions`` is yielded.
    """
    brShelve1 = self._getBoundingRegionShelve(self._trackName1)
    brShelve2 = self._getBoundingRegionShelve(self._trackName2)

    # The whole-chromosome flags do not depend on the chromosome, so they are
    # computed once here instead of once per loop iteration. They are only
    # needed (and only computed) when track 1 has a shelve.
    allBrsAreWholeChrs1 = allBrsAreWholeChrs2 = False
    if brShelve1 is not None:
        allBrsAreWholeChrs1 = self._commonAllBoundingRegionsAreWholeChr(brShelve1)
        if brShelve2 is not None:
            allBrsAreWholeChrs2 = self._commonAllBoundingRegionsAreWholeChr(brShelve2)

    for chr in GenomeInfo.getExtendedChrList(self.genome):
        if brShelve1 is None:
            yield GenomeRegion(self.genome, chr, 0,
                               GenomeInfo.getChrLen(self.genome, chr))
            continue

        brList1 = brShelve1.getAllBoundingRegions(chr)

        if brShelve2 is None or \
                (allBrsAreWholeChrs2 and not allBrsAreWholeChrs1):
            for reg in brList1:
                yield reg
            continue

        brList2 = brShelve2.getAllBoundingRegions(chr)
        if allBrsAreWholeChrs1 and not allBrsAreWholeChrs2:
            for reg in brList2:
                yield reg
        else:
            for reg in self.getAllIntersectingRegions(self.genome, chr,
                                                      brList1, brList2):
                yield reg
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'

    Write 'assemblyGaps.bed' for the 'gaps' property track of ``genome``.

    Every maximal run of sequence positions whose character is not in
    ``assemblyChars`` is written as one tab-separated line
    (chromosome, start, end-exclusive). If no such run exists in any
    chromosome, a single placeholder line (first chromosome, '1', '1') is
    written instead.
    """
    basePath = gcf.createOrigPath(
        genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    # 'with' guarantees the file is closed even if reading a track fails.
    with open(outFn, 'w') as outFile:
        seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))
        anyGaps = False

        for chr in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chr, 0,
                                     GenomeInfo.getChrLen(genome, chr))
            seq = seqTrack.getTrackView(chrRegion).valsAsNumpyArray()

            # Indexes of all positions that are not an assembly character.
            isAssembled = numpy.logical_or.reduce(
                [seq == x for x in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssembled)]

            # A difference of 1 between neighbouring gap indexes means the
            # two positions belong to the same run: drop the later one from
            # the begin list and the earlier one from the end list.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            isContinuation = gapIndexDiff == 1
            gapBeginIndexes = numpy.delete(
                gapIndexes,
                (numpy.arange(len(gapIndexDiff)) + 1)[isContinuation])
            gapEndIndexes = numpy.delete(
                gapIndexes + 1,
                numpy.arange(len(gapIndexDiff))[isContinuation])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for begin, end in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                outFile.write('\t'.join([chr, str(begin), str(end)]) + os.linesep)

        if not anyGaps:
            outFile.write('\t'.join(
                [GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
def execute(cls, choices, galaxyFn=None, username=''):
    """Resolve the analysed track and the binning file, then print the result.

    ``choices[0]`` is the genome, ``choices[1]``/``choices[2]`` select the
    track (a history element when ``choices[1] == 'history'``), and
    ``choices[3]`` is the binning file, whose type is the second ":"-separated
    field. Raises InvalidFormatError for unsupported binning formats.
    """
    from quick.application.ExternalTrackManager import ExternalTrackManager

    genome = choices[0]

    if choices[1] == 'history':
        preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
            genome, choices[2].split(':'))
    else:
        preProcTN1 = choices[2].split(':')

    # Not used further down in this method; kept as in the original.
    chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                       for chrom in GenomeInfo.getChrList(genome))

    binningTN = choices[3].split(':')
    trackType = binningTN[1]
    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(binningTN)

    if trackType == 'gtrack':
        rawSource = GtrackGenomeElementSource(fnSource, genome=genome)
    elif trackType in ('valued.bed', 'category.bed', 'bed'):
        rawSource = BedGenomeElementSource(fnSource, genome=genome)
    else:
        raise InvalidFormatError(
            'The Binning must be of the following formats: gtrack, valued.bed, category.bed ,bed ...'
        )

    geSource = GenomeElementSorter(rawSource).__iter__()
    cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome, username)
def _removeBoundingRegionTuplesIfFullChrsAndNotFixedGapSize(self):
    """Clear the bounding region tuples when they only describe full chromosomes.

    Nothing happens if the fixed gap size is non-zero or the representation
    is dense. Otherwise the list is emptied when every tuple's region starts
    at 0, ends at the chromosome length and lies on a chromosome in the
    extended chromosome list of the genome.
    """
    if self.getFixedGapSize() != 0 or self._reprIsDense:
        return

    def isFullChr(region):
        return region.chr in GenomeInfo.getExtendedChrList(self._genome) and \
            region.start == 0 and \
            region.end == GenomeInfo.getChrLen(self._genome, region.chr)

    if all(isFullChr(brt.region) for brt in self._boundingRegionTuples):
        self._boundingRegionTuples = []
def validate_snp(cls, snps, genome):
    """Validate SNP specifications and return a list of error messages.

    Each element of ``snps`` must be a 5-item sequence
    ``(rsid, chr, pos, ref, var)``. At most one message is produced per SNP
    (the first check that fails). An empty list means all SNPs passed.

    The checks are, in order: an rsid without a chromosome, unknown
    chromosome, non-numeric position, position beyond the chromosome length,
    reference allele mismatch against the genome, reference allele 'N',
    missing variant allele, invalid variant characters, and an ambiguous
    variant code that includes the reference allele.
    """
    DNA = ['A', 'C', 'G', 'T'] + list(cls.AMBIGUOUS_DNA.keys())
    validChrs = None  # fetched on first use and reused for all SNPs
    err = []

    for snp in snps:
        assert len(snp) == 5
        _rsid, _chr, _pos, _ref, _var = snp
        spec = repr(snp)

        if _rsid and not _chr:
            err.append('Invalid RefSNP: rs' + _rsid)
            continue

        if validChrs is None:
            validChrs = GenomeInfo.getChrList(genome)
        if _chr not in validChrs:
            err.append(spec + ' Chromosome ' + _chr + ' is not valid')
            continue

        if not _pos.isdigit():
            err.append(spec + ' Position must be numeric')
            continue

        # NOTE(review): a string accepted by isdigit() has no sign, so this
        # branch looks unreachable as written - confirm whether '< 1' was meant.
        if int(_pos) < 0:
            err.append(spec + ' Position must be higher than 0')
            continue

        chrLen = GenomeInfo.getChrLen(genome, _chr)
        if int(_pos) > chrLen:
            err.append(spec + ' Position is higher than length of %s (%d)' %
                       (_chr, chrLen))
            continue

        ref = VariantMeltingProfile.get_reference_allele(
            genome, _chr, _pos, len(_ref))
        if _ref != ref:
            err.append(
                spec +
                ' Reference allele does not match reference genome, should be: '
                + ref)
            continue

        if _ref == 'N':
            err.append(spec + ' Reference allele can not be N')
            continue

        if not _var:
            err.append(spec + ' Variant allele not specified')
            continue

        if not all(v in DNA for v in _var):
            err.append(spec + ' Variant allele ' + _var + " is not valid")
            continue

        if _var in cls.AMBIGUOUS_DNA and _ref in cls.AMBIGUOUS_DNA[_var]:
            err.append(spec + ' Ambiguous variant allele includes reference')
            continue

    return err
def __new__(cls, genome):
    """Return a one-element list holding the first base pair of the first chromosome.

    Returns None when the genome has no chromosomes.
    """
    from gold.track.GenomeRegion import GenomeRegion
    from quick.util.GenomeInfo import GenomeInfo

    chrList = GenomeInfo.getChrList(genome)
    if len(chrList) > 0:
        # Reuse the list already fetched instead of asking GenomeInfo again.
        return [GenomeRegion(genome, chrList[0], 0, 1)]
    return None
def _checkValidStart(self, chr, start): if start < 0: raise InvalidFormatError('Error: start position is negative: %s' % start) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ start > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: start position is larger than chromosome size (%s) < %d' % \ (GenomeInfo.getChrLen(self.genome, chr), start)) return start
def createNmerChains(self, n): for chr in GenomeInfo.getChrList(self._genome): print 'Creating chains of nmers of length ', n, ' for chromosome ', chr chrLen = GenomeInfo.getChrLen(self._genome,chr) chrReg = GenomeRegion( self._genome, chr, 0, chrLen ) seqTV = PlainTrack( GenomeInfo.getSequenceTrackName(self._genome) ).getTrackView(chrReg) #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV)) nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray()) SameValueIndexChainsFactory.generate( nmersAsInts, chrLen, 4**n, self._createPath(n), chr )
def getNumberElements(genome, trackName):
    """Return a list with the number of track elements per chromosome.

    The list follows the order of GenomeInfo.getChrList(genome); each entry is
    the length of the start array of the track view over the whole chromosome.
    """
    track = PlainTrack(trackName)

    def countInChr(chrom):
        wholeChr = GenomeRegion(genome, chrom, 0,
                                GenomeInfo.getChrLen(genome, chrom))
        return len(track.getTrackView(wholeChr).startsAsNumpyArray())

    return [countInChr(chrom) for chrom in GenomeInfo.getChrList(genome)]
def getAnchor(genome, trackName):
    """Return the string form of the genome anchor of each chromosome's track view.

    One entry per chromosome, in the order of GenomeInfo.getChrList(genome).
    """
    track = PlainTrack(trackName)
    anchors = []
    for chrom in GenomeInfo.getChrList(genome):
        wholeChr = GenomeRegion(genome, chrom, 0,
                                GenomeInfo.getChrLen(genome, chrom))
        trackView = track.getTrackView(wholeChr)
        anchors.append(str(trackView.genomeAnchor))
    return anchors
def _checkValidStart(self, chr, start): if start < 0: raise InvalidFormatError('Error: start position is negative: %s' % start) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ start > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, start, GenomeInfo.getChrLen(self.genome, chr))) return start
def createNmerChains(self, n): for chr in GenomeInfo.getChrList(self._genome): print 'Creating chains of nmers of length ', n, ' for chromosome ', chr chrLen = GenomeInfo.getChrLen(self._genome, chr) chrReg = GenomeRegion(self._genome, chr, 0, chrLen) seqTV = PlainTrack(GenomeInfo.getSequenceTrackName( self._genome)).getTrackView(chrReg) #nmersAsInts = NmerAsIntSlidingWindow(n, FuncValTvWrapper(seqTV)) nmersAsInts = NmerAsIntSlidingWindow(n, seqTV.valsAsNumpyArray()) SameValueIndexChainsFactory.generate(nmersAsInts, chrLen, 4**n, self._createPath(n), chr)
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of the selected track to ``galaxyFn``.

    ``choices[0]`` is the genome. When ``choices[1] == 'history'`` the track
    is the history element in ``choices[2]``: it is copied to a temporary
    collected-path file, read as a sorted BED source (for the bed types) and
    written in one go, after which the temporary file is removed. Otherwise
    ``choices[3]`` names a preprocessed track that is written chromosome by
    chromosome, with the header only for the first chromosome.

    The output file is always closed, and the temporary file always removed,
    even when an error occurs.
    """
    from gold.util.RandomUtil import random

    outputFile = open(galaxyFn, 'w')
    try:
        genome = choices[0]
        trackItem = choices[3]
        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = choices[2].split(':')[1]
            # Random six-letter prefix for the temporary file name; kept in
            # its own variable so the 'username' argument is not overwritten.
            randomPrefix = ''.join(
                [chr(random.randint(97, 122)) for i in range(6)])
            tempFn = createCollectedPath(
                genome, [],
                randomPrefix + '_'.join([str(v) for v in time.localtime()[:6]])
                + '.' + trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
                choices[2].split(':'))
            try:
                with open(fnSource, 'r') as sourceFile:
                    content = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(content)

                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                # NOTE(review): for any other track type geSource stays None
                # and is passed on as-is - confirm the callee handles that.

                cls.WriteExpandedElementsToFile(geSource,
                                                chrSizeDict,
                                                outputFile,
                                                headLinesStr,
                                                writeHeaderFlag=True)
            finally:
                if os.path.exists(tempFn):
                    os.remove(tempFn)
        else:
            plTrack = PlainTrack(trackItem.split(':'))
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                geSource = GenomeElementTvWrapper(
                    plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict,
                                                outputFile, headLinesStr,
                                                writeHeaderFlag)
                writeHeaderFlag = False
    finally:
        outputFile.close()
def _checkValidEnd(self, chr, end, start=None): if end < 0: raise InvalidFormatError('Error: end position is negative: %s' % end) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ end-1 > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: end position is larger than chromosome size (%s)' % \ GenomeInfo.getChrLen(self.genome, chr)) if start is not None and end <= start: raise InvalidFormatError('Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start)) return end
def getSegmentSizes(genome, trackName):
    """Return the summed segment length (end - start) per chromosome.

    One entry per chromosome, in the order of GenomeInfo.getChrList(genome).
    """
    track = PlainTrack(trackName)
    sumSegmentSize = []
    for chrom in GenomeInfo.getChrList(genome):
        chromLen = GenomeInfo.getChrLen(genome, chrom)
        region = GenomeRegion(genome, chrom, 0, chromLen)
        tv = track.getTrackView(region)
        sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
        # Only the per-chromosome sum is returned, so the individual sizes
        # are not converted to lists or kept.
        sumSegmentSize.append(sizeSegments.sum().tolist())
    return sumSegmentSize
def isMemoBin(region):
    """Tell whether ``region`` is a bin for which results are memoized.

    On non-experimental installations, or when comp-bin splitting is allowed,
    this is the same as CompBinManager.isCompBin. Otherwise a single
    (non-iterable) region qualifies if its chr/start/end equal those of one of
    the genome's chromosome regions or chromosome arm regions.
    """
    if not IS_EXPERIMENTAL_INSTALLATION or CompBinManager.ALLOW_COMP_BIN_SPLITTING:
        return CompBinManager.isCompBin(region)

    if hasattr(region, '__iter__'):
        return False

    def equalsAnyOf(candidates):
        return any([region.chr, region.start, region.end] == [r.chr, r.start, r.end]
                   for r in candidates)

    isChr = equalsAnyOf(GenomeInfo.getChrRegs(region.genome))
    isChrArm = equalsAnyOf(GenomeInfo.getChrArmRegs(region.genome))
    return isChr or isChrArm
def _getBoundingRegionTupleList(self, case, sortedAssertElList):
    """Build the expected BoundingRegionTuple list for a test case.

    If the case asserts bounding regions with a chromosome, those are used
    (sorted), with a missing start replaced by 0 and a missing end by the
    chromosome length. Otherwise one whole-chromosome bounding region is
    created per chromosome found in ``sortedAssertElList``, in sorted
    chromosome order, with the number of elements on that chromosome.
    """
    explicitBrs = [br for br in sorted(case.boundingRegionsAssertList)
                   if br.region.chr is not None]

    if explicitBrs:
        brTuples = []
        for br in explicitBrs:
            reg = br.region
            start = reg.start if reg.start is not None else 0
            end = reg.end if reg.end is not None else \
                GenomeInfo.getChrLen(self.GENOME, reg.chr)
            brTuples.append(BoundingRegionTuple(
                GenomeRegion(self.GENOME, chr=reg.chr, start=start, end=end),
                br.elCount))
        return brTuples

    # No explicit bounding regions: count the elements per chromosome.
    elCountPerChr = {}
    for ge in sortedAssertElList:
        elCountPerChr[ge.chr] = elCountPerChr.get(ge.chr, 0) + 1

    return [BoundingRegionTuple(
                GenomeRegion(self.GENOME, chr=chrom, start=0,
                             end=GenomeInfo.getChrLen(self.GENOME, chrom)),
                elCountPerChr[chrom])
            for chrom in sorted(elCountPerChr)]
def execute(cls, choices, galaxyFn=None, username=''):
    """Run ConvertToNonOverlappingCategorySegmentsPythonStat per chromosome.

    ``choices``: [0] genome, [1] track name (":"-separated), [2] combine
    method, [3] optional category, [4] optional number of samples (defaults
    to '1'), [5] output mode, [-1] target track name when writing to a
    standardised file. Each result track view is composed to the output file
    as a category BED.
    """
    start = time.time()

    genome = choices[0]
    trackName = choices[1].split(':')

    if choices[5] == 'Write to Standardised file':
        outFn = createOrigPath(genome, choices[-1].split(':'),
                               'collapsed_result.bedgraph')
        ensurePathExists(outFn[:outFn.rfind('/') + 1])
    else:
        outFn = galaxyFn

    combineMethod = choices[2]
    category = choices[3] if choices[3] else ''
    numSamples = choices[4] if choices[4] else '1'

    categoryOption = '[category=%s]' % category if category != '' else ''
    analysisDef = 'dummy [combineMethod=%s] %s [numSamples=%s] -> ConvertToNonOverlappingCategorySegmentsPythonStat' % \
        (combineMethod, categoryOption, numSamples)

    for regSpec in GenomeInfo.getChrList(genome):
        res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome,
                                        username=username,
                                        printResults=False,
                                        printHtmlWarningMsgs=False)

        from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
        from gold.origdata.BedComposer import CategoryBedComposer

        for resDict in res.values():
            tvGeSource = TrackViewGenomeElementSource(
                genome, resDict['Result'], trackName)
            CategoryBedComposer(tvGeSource).composeToFile(outFn)
def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                       func, username, chrList):
    """Preprocess the custom track one chromosome at a time, then finalize it.

    A PreProcessCustomTrackJob is run per whole-chromosome region with
    ``preProcess=True, finalize=False``, followed by a single job over all
    regions with ``preProcess=False, finalize=True``.
    """
    regionList = [GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
                  for chrom in chrList]

    # Keyword arguments shared by every job.
    commonKwArgs = dict(username=username,
                        inTrackName=inTrackName,
                        windowSize=windowSize,
                        func=func)

    for region in regionList:
        PreProcessCustomTrackJob(genome, outTrackName, [region],
                                 cls._getGeSourceForRegion,
                                 preProcess=True, finalize=False,
                                 **commonKwArgs).process()

    PreProcessCustomTrackJob(genome, outTrackName, regionList,
                             cls._getGeSourceForRegion,
                             preProcess=False, finalize=True,
                             **commonKwArgs).process()
def getTotalBpSpan(self):
    """Return the number of base pairs spanned by this region.

    If no chromosome is set, this is the summed length of all chromosome
    regions of the genome; otherwise it is ``len(self)``.
    """
    if self.chr is not None:
        return len(self)
    return sum(len(reg) for reg in GenomeInfo.getChrRegs(self.genome))
def _createPreProcFiles(self):
    """Write the preprocessed files for ``self._chr`` from the element source.

    Metadata from the source is first registered with the
    TrackInfoDataCollector. Nothing more is done if the collector reports
    zero elements for the chromosome. Outside 'Real' mode the source is only
    iterated through. Otherwise every element (or raw slice, for slice
    sources) is written to an OutputDirectory, the chromosome is recorded as
    preprocessed and the directory is closed.
    """
    geSource = self._geSource
    chrom = self._chr
    allowOverlaps = self._allowOverlaps

    collector = TrackInfoDataCollector(self._genome, self._trackName)
    collector.updateMetaDataForFinalization(
        geSource.getFileSuffix(), geSource.getPrefixList(),
        geSource.getValDataType(), geSource.getValDim(),
        geSource.getEdgeWeightDataType(), geSource.getEdgeWeightDim(),
        geSource.hasUndirectedEdges(), geSource.getVersion(),
        PreProcessUtils.constructId(geSource))

    if collector.getNumElements(chrom, allowOverlaps) == 0:
        return

    if self._mode != 'Real':
        # Consume the source without writing anything.
        for _ in geSource:
            pass
        return

    dirPath = createDirPath(self._trackName, self._genome, chrom, allowOverlaps)
    outputDir = OutputDirectory(
        dirPath,
        collector.getPrefixList(allowOverlaps),
        collector.getNumElements(chrom, allowOverlaps),
        GenomeInfo.getChrLen(self._genome, chrom),
        collector.getValDataType(), collector.getValDim(),
        collector.getEgdeWeightDataType(), collector.getEgdeWeightDim(),
        collector.getMaxNumEdges(chrom, allowOverlaps),
        collector.getMaxStrLens(chrom, allowOverlaps))

    if geSource.isSliceSource():
        writeFunc = outputDir.writeRawSlice
    else:
        writeFunc = outputDir.writeElement

    for ge in geSource:
        writeFunc(ge)

    collector.appendPreProcessedChr(allowOverlaps, chrom)
    outputDir.close()
def getValuesFromBedFile(cls, genome, fn, colorPattern=(1,0,0)):
    """Read values from a BED-like source and turn them into colour tuples.

    ``fn`` is either None, a file name, or an object with a
    ``returnComposed()`` method returning the file contents. The first
    tab-separated field of each line is the chromosome and the fourth the
    value; lines that cannot be parsed are logged and skipped.

    Returns an empty defaultdict when ``fn`` is None. Otherwise returns
    ``(resDict, maxVal)`` where ``maxVal`` is the largest value read and
    ``resDict`` maps each chromosome of the genome that has values to a list
    of tuples ``255 - int(val*255/maxVal)*v`` for each ``v`` in
    ``colorPattern``.

    Raises ValueError if no value at all could be read.
    """
    resDict = defaultdict(list)
    valDict = defaultdict(list)

    if fn is None:
        return resDict

    if isinstance(fn, basestring):
        with open(fn, 'r') as bedFile:
            lines = bedFile.read().split('\n')
    else:
        lines = fn.returnComposed().split('\n')

    for line in lines:
        fields = line.split('\t')
        try:
            chrom = fields[0]
            valDict[chrom] += [float(fields[3])]
        except (IndexError, ValueError):
            # Header, empty or malformed line: log it and move on.
            logMessage(line)

    # Global maximum over all chromosomes. max() on the lists themselves
    # would compare them lexicographically and pick the wrong list.
    maxVal = max(val for vals in valDict.values() for val in vals)

    for chrom in GenomeInfo.getChrList(genome):
        if chrom in valDict:
            try:
                resDict[chrom] += [tuple([255 - (int(val*255/maxVal)*v) for v in colorPattern])
                                   for val in valDict[chrom]]
            except Exception:
                # Best effort, as before: log and continue with the next chromosome.
                logMessage('Ny rundeeee: ' + str([v for v in valDict[chrom][:10]]) + ': ' + str(maxVal))

    print('count %d' % len(valDict))
    return resDict, maxVal
def _compute(self):
    """Turn the child's segments into a non-overlapping step-function track.

    Every segment start adds 1 and every segment end subtracts 1 at its
    position; the accumulated sum between consecutive border positions
    becomes the value of the resulting segment.

    Raises IndexError if the child track view has no elements, since the
    first border position is then read from an empty list.
    """
    tv = self._children[0].getResult()
    starts, ends = tv.startsAsNumpyArray(), tv.endsAsNumpyArray()

    # Net change in coverage at each border position.
    borderDict = defaultdict(int)
    for start, end in zip(starts, ends):
        borderDict[start] += 1
        borderDict[end] -= 1
    sortedPos = sorted(borderDict)

    # Handle start border issues: if nothing starts at position 0, prepend a
    # zero-valued segment from 0 to the first border.
    if sortedPos[0] == 0:
        startList, endList, valList = sortedPos, sortedPos[1:], []
    else:
        startList, endList, valList = [0] + sortedPos, sortedPos, [0]

    # Handle end border issues: either extend to the last position of the
    # chromosome or drop the start that has no matching end.
    chrEndPos = GenomeInfo.getChrLen(tv.genomeAnchor.genome, tv.genomeAnchor.chr) - 1
    if endList[-1] < chrEndPos:
        endList = endList + [chrEndPos]
    else:
        startList = startList[:-1]

    # Make step-function values.
    accVal = 0
    for pos in sortedPos:
        accVal += borderDict[pos]
        valList.append(accVal)
    # The value after the last border is not needed when that border is the
    # chromosome end.
    if chrEndPos == sortedPos[-1]:
        valList.pop()

    return TrackView(genomeAnchor=tv.genomeAnchor, startList=np.array(startList),
                     endList=np.array(endList), valList=np.array(valList),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=False)
def getTrackView(self, region):
    """Return the track view of a randomly placed region of the same length.

    The random region is drawn from the chromosome arm containing ``region``,
    after excluding ``region`` extended by the independency buffer on both
    sides. ``region`` must equal the region this object was created for.

    Raises CentromerError unless exactly one chromosome arm contains the
    region, and TooLargeBinError if no remaining source region is at least
    MIN_SOURCE_TO_SAMPLE_SIZE_RATIO times as long as ``region``.
    """
    assert self._origRegion == region

    allChrArmRegs = GenomeInfo.getContainingChrArms(region)
    if len(allChrArmRegs) != 1:
        raise CentromerError
    chrArm = allChrArmRegs[0]

    bufferSize = self._getIndepencyBufferSize(region)
    sourceRegs = chrArm.exclude(copy(region).extend(-bufferSize).extend(bufferSize))
    assert len(sourceRegs) in [1, 2]

    regionLen = len(region)
    if not any(len(sourceReg) >= self.MIN_SOURCE_TO_SAMPLE_SIZE_RATIO * regionLen
               for sourceReg in sourceRegs):
        raise TooLargeBinError('Source region lengths of ' + str([len(x) for x in sourceRegs]) +
                               ' are too small compared to region length of ' + str(len(region)) +
                               ' according to MIN_SOURCE_TO_SAMPLE_SIZE_RATIO: ' +
                               str(self.MIN_SOURCE_TO_SAMPLE_SIZE_RATIO))

    if len(sourceRegs) == 1:
        sourceReg = sourceRegs[0]
    else:
        # Choose between the two source regions in proportion to the number
        # of possible placements in each. float() keeps this a true division
        # under Python 2, where int / int would truncate.
        firstSourceProportion = float(len(sourceRegs[0]) - regionLen) / \
            sum(len(sourceRegs[i]) - regionLen for i in range(2))
        sourceReg = sourceRegs[0] if random.random() < firstSourceProportion else sourceRegs[1]

    randOffset = random.randint(0, len(sourceReg) - regionLen)
    start = sourceReg.start + randOffset
    end = start + regionLen
    randRegion = GenomeRegion(region.genome, region.chr, start, end)

    rawData = RawDataStat(randRegion, self._origTrack, self._trackFormatReq)
    tv = rawData.getResult()
    assert region != tv.genomeAnchor
    return tv
def execute(cls, choices, galaxyFn=None, username=''):
    """Collapse overlapping category segments, one chromosome at a time.

    ``choices``: [0] genome, [1] track name (":"-separated), [2] combine
    method, [3] optional category, [4] optional number of samples (defaults
    to '1'), [5] output mode, [-1] target track name when writing to a
    standardised file. The statistic
    ConvertToNonOverlappingCategorySegmentsPythonStat is run for each
    chromosome and every result is composed to the output file as category
    BED.
    """
    from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
    from gold.origdata.BedComposer import CategoryBedComposer

    start = time.time()
    genome = choices[0]
    trackName = choices[1].split(':')

    outFn = galaxyFn
    writeToStdFile = (choices[5] == 'Write to Standardised file')
    if writeToStdFile:
        outFn = createOrigPath(genome, choices[-1].split(':'), 'collapsed_result.bedgraph')
        ensurePathExists(outFn[:outFn.rfind('/')+1])

    combineMethod = choices[2]
    category = choices[3] or ''
    numSamples = choices[4] or '1'

    if category != '':
        categorySpec = '[category=%s]' % category
    else:
        categorySpec = ''
    analysisDef = 'dummy [combineMethod=%s] %s [numSamples=%s] -> ConvertToNonOverlappingCategorySegmentsPythonStat' % \
        (combineMethod, categorySpec, numSamples)

    for regSpec in GenomeInfo.getChrList(genome):
        res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome, username=username, \
                                        printResults=False, printHtmlWarningMsgs=False)
        for resDict in res.values():
            composer = CategoryBedComposer(
                TrackViewGenomeElementSource(genome, resDict['Result'], trackName))
            composer.composeToFile(outFn)
def sortChrDict(self):
    """Return chromosome lengths and cumulative offsets in sorted order.

    Chromosomes whose name without 'chr' is an integer come first, in numeric
    order; the rest follow, sorted by their name without 'chr'.

    Returns ``(chrDict, chrLength)``: two OrderedDicts keyed by chromosome
    name, holding the chromosome length and the summed length of all
    preceding chromosomes, respectively.
    """
    stdChrLengths = GenomeInfo.getStdChrLengthDict(self.gsuite.genome)

    def sortKey(chrName):
        suffix = chrName.replace('chr', '')
        try:
            return (0, int(suffix), '')
        except ValueError:
            return (1, 0, suffix)

    chrDict = OrderedDict()
    chrLength = OrderedDict()
    offset = 0
    # Sorting the original names (rather than rebuilding them as
    # 'chr' + suffix) keeps lookups valid for names that do not round-trip,
    # e.g. names without the 'chr' prefix or with zero-padded numbers.
    for chrName in sorted(stdChrLengths.keys(), key=sortKey):
        chrDict[chrName] = stdChrLengths[chrName]
        chrLength[chrName] = offset
        offset += stdChrLengths[chrName]

    return chrDict, chrLength
def extractTestGenomeAndPreProcess(galaxy_dir): hbPath = os.path.join(galaxy_dir, 'lib', 'hb') from config.Config import ORIG_DATA_PATH from gold.origdata.PreProcessTracksJob import PreProcessAllTracksJob from setup.InstallFunctions import executeShellCmd from gold.util.CommonFunctions import createDirPath from quick.util.GenomeInfo import GenomeInfo from quick.application.ProcTrackOptions import ProcTrackOptions from gold.description.TrackInfo import TrackInfo import shutil testGenomeFn = os.sep.join([hbPath, 'data', 'TestGenome.tar.gz']) executeShellCmd('tar xfz %s --keep-newer-files -C %s' % (testGenomeFn, ORIG_DATA_PATH), \ pipe=False, printError=True, onError='exit') print 'OK: Extracted TestGenome files.' PreProcessAllTracksJob.PASS_ON_EXCEPTIONS = True try: PreProcessAllTracksJob('TestGenome').process() PreProcessAllTracksJob( 'TestGenome', GenomeInfo.getChrTrackName('TestGenome')).process() print 'OK: Finished preprocessing TestGenome.' except Exception, e: print 'FAILED: Error when preprocessing TestGenome. Error:' print ' ' + str(e).strip() sys.exit(1)
def describeUserBinSource(self, regSpec, binSpec):
    """Return a sentence describing the regions and bin size used as bins.

    ``regSpec`` is parsed into regions for ``self._genome``. A single region
    is described by chromosome (and positions unless it is a whole
    chromosome); several whole-chromosome regions are listed by name, or
    summarised as "all chromosomes" if they match the genome's chromosome
    list; anything else is given as a region count. Unless ``binSpec`` is
    '*', the interval size is added.
    """
    from quick.application.UserBinSource import parseRegSpec
    from quick.util.CommonFunctions import strWithStdFormatting, \
        generateStandardizedBpSizeText, parseShortenedSizeSpec
    from quick.util.GenomeInfo import GenomeInfo

    regions = parseRegSpec(regSpec, self._genome)

    if len(regions) == 1:
        region = regions[0]
        regStr = ' chromosome ' + region.chr + \
                 ' of genome build "' + self._genome + '"'
        if not region.isWholeChr():
            regStr += ' from position ' + strWithStdFormatting(region.start+1) + \
                      ' to ' + strWithStdFormatting(region.end)
    else:
        onlyWholeChrs = all(region.chr is None or region.isWholeChr()
                            for region in regions)
        if not onlyWholeChrs:
            regStr = ' %s regions' % len(regions)
        else:
            regionChrs = set(region.chr for region in regions)
            allChrs = set(GenomeInfo.getChrList(self._genome))
            if len(regions) == len(allChrs) and regionChrs == allChrs:
                regStr = ' all chromosomes'
            else:
                regStr = ' chromosomes ' + ', '.join(region.chr for region in regions)
        regStr += ' of genome build "%s"' % self._genome

    if binSpec != '*':
        binStr = ', divided into intervals of size ' + \
                 generateStandardizedBpSizeText(parseShortenedSizeSpec(binSpec)) + ','
    else:
        binStr = ''

    return 'Using' + regStr + binStr + ' as bins'
def formatBedLines(cls, genome, lineDict, binSize):
    """Distribute merged segments into fixed-size bins per chromosome.

    ``lineDict`` maps a chromosome to a list of (start, end) pairs.
    Consecutive pairs that overlap or touch are merged before being passed to
    ``cls.putBpsInResultDict``.

    Returns ``(resDict, microDict, maxVal)``: ``resDict`` maps every standard
    chromosome to a list with one float per bin, ``microDict`` is the
    dict-of-dicts filled by ``putBpsInResultDict``, and ``maxVal`` is the
    largest value found in ``resDict``.
    """
    chrLength = GenomeInfo.getStdChrLengthDict(genome)

    # Number of bins per chromosome, rounded up. '//' makes the integer
    # division explicit.
    numElems = dict((k, v // binSize + (1 if v % binSize > 0 else 0))
                    for k, v in chrLength.items())
    resDict = dict((k, [0.0] * v) for k, v in numElems.items())
    microDict = defaultdict(dict)

    microBin = binSize // 100
    fullMicroBin = [microBin] * 100

    for chrom, vals in lineDict.items():
        try:
            prevStart, prevEnd = vals[0]
            for start, end in vals[1:]:
                if prevEnd >= start:
                    # Overlapping or adjacent: grow the pending segment.
                    if end > prevEnd:
                        prevEnd = end
                    continue
                cls.putBpsInResultDict(resDict, chrom, prevStart, prevEnd,
                                       binSize, microDict, microBin, fullMicroBin)
                prevStart, prevEnd = start, end
            cls.putBpsInResultDict(resDict, chrom, prevStart, prevEnd,
                                   binSize, microDict, microBin, fullMicroBin)
        except Exception:
            # Best effort: a chromosome that cannot be processed is skipped.
            # 'except Exception' still lets KeyboardInterrupt/SystemExit through.
            pass

    maxVal = max([max(v) for v in resDict.values()])
    return resDict, microDict, maxVal
def execute(cls, choices, galaxyFn=None, username=''):
    '''Extract the sequence of the regions of a stored GTrack file as FASTA.

    Is called when the execute button is pushed by the web user.
    ``choices[0]`` is a history element; the pickled info object belonging to
    it holds the genome at index 0 and a Galaxy track name at index 3. Every
    element of that GTrack file becomes a region, and the genome's sequence
    track is extracted for those regions to the file with path ``galaxyFn``.

    Returns None; nothing is extracted if the pickled info cannot be loaded.
    '''
    try:
        historyInputTN = choices[0].split(':')  # from history
        historyGalaxyFn = ExternalTrackManager.extractFnFromGalaxyTN(historyInputTN)
        # Path to the static pickle file created for the previous history element.
        randomStatic = RunSpecificPickleFile(historyGalaxyFn)
        # NOTE(review): presumably unpickles data from the history element -
        # confirm it can only have been written by this installation.
        myInfo = randomStatic.loadPickledObject()
    except Exception:
        return None

    galaxyTN = myInfo[3].split(':')
    myFileName = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)
    genome = myInfo[0]

    gtrackSource = GtrackGenomeElementSource(myFileName, genome)
    regionList = [GenomeRegion(obj.genome, obj.chr, obj.start, obj.end)
                  for obj in gtrackSource]

    extractor = TrackExtractor()
    extractor.extract(GenomeInfo.getSequenceTrackName(genome), regionList,
                      galaxyFn, 'fasta')
def getOptionsBoxChr(cls, prevChoices):
    """Return the content of the chromosome options box.

    - Exactly one reference SNP in 'Single' run mode: a list holding only
      that SNP's chromosome (its second field).
    - More than one reference SNP, or 'Batch' run mode: None.
    - Otherwise: the chromosome list of the selected genome.
    """
    ref_snp = cls.get_ref_snp(prevChoices)
    numSnps = len(ref_snp)

    if numSnps == 1 and prevChoices.run == 'Single':
        return [ref_snp[0][1]]

    if numSnps > 1 or prevChoices.run == 'Batch':
        return None

    return GenomeInfo.getChrList(prevChoices.genome)
def isValidTrack(genome, trackName, fullAccess=False):
    """Tell whether the track is valid and has preprocessed content.

    The track info must be valid for the given access level, and the track
    directory must contain at least one entry that is either a valid
    chromosome name of the genome or a bounding region file name.
    """
    if not TrackInfo(genome, trackName).isValid(fullAccess):
        return False

    return any(GenomeInfo.isValidChr(genome, fn) or isBoundingRegionFileName(fn)
               for fn in ProcTrackOptions._getDirContents(genome, trackName))
def assertChrElCounts(self, trackName, chrElCountDict, allowOverlaps, customBins):
    """Assert that each chromosome's track view holds the expected element count.

    For a chromosome present in ``customBins`` that region is used; otherwise
    the whole chromosome of ``self.GENOME``.
    """
    for chrom in chrElCountDict.keys():
        if chrom in customBins:
            region = customBins[chrom]
        else:
            chrLen = GenomeInfo.getChrLen(self.GENOME, chrom)
            region = GenomeRegion(self.GENOME, chrom, 0, chrLen)

        trackView = self._getTrackView(trackName, region, allowOverlaps)
        numElements = len(list(trackView))
        self.assertEquals(chrElCountDict[chrom], numElements)
def _createTrackCommon(cls, genome, inTrackName, outTrackName, windowSize,
                       func, username, chrList):
    """Preprocess the custom output track over the given chromosomes.

    One whole-chromosome region is built per entry in ``chrList`` and a
    single PreProcessCustomTrackJob is run over all of them.
    """
    regionList = []
    for chrom in chrList:
        chrLen = GenomeInfo.getChrLen(genome, chrom)
        regionList.append(GenomeRegion(genome, chrom, 0, chrLen))

    job = PreProcessCustomTrackJob(genome, outTrackName, regionList,
                                   cls._getGeSourceForRegion,
                                   username=username,
                                   inTrackName=inTrackName,
                                   windowSize=windowSize,
                                   func=func)
    job.process()
def _checkValidEnd(self, chr, end, start=None): if end < 0: raise InvalidFormatError('Error: end position is negative: %s' % end) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ end-1 > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, end-1, GenomeInfo.getChrLen(self.genome, chr))) if start is not None and end <= start: if not start == end == 1: raise InvalidFormatError( 'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start)) return end
def getGlobalSource(globalSourceStr, genome, minimal):
    """Return the bin source selected by ``globalSourceStr``.

    ``minimal == True`` always gives a MinimalBinSource. Otherwise the
    recognised values are 'test', 'chrs', 'chrarms', 'ensembl' and
    'userbins' (which requires StatJob.USER_BIN_SOURCE to be set). Any other
    value raises ShouldNotOccurError.
    """
    if minimal == True:
        return MinimalBinSource(genome)

    if globalSourceStr == 'test':
        return UserBinSource('TestGenome:chr21:10000000-15000000','1000000')

    if globalSourceStr == 'chrs':
        return GenomeInfo.getChrRegs(genome)

    if globalSourceStr == 'chrarms':
        return GenomeInfo.getChrArmRegs(genome)

    if globalSourceStr == 'ensembl':
        return GenomeInfo.getStdGeneRegs(genome)

    if globalSourceStr == 'userbins':
        from gold.application.StatRunner import StatJob
        assert StatJob.USER_BIN_SOURCE is not None
        return StatJob.USER_BIN_SOURCE

    raise ShouldNotOccurError('globalSource not recognized')
def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn): #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat' #print 'overriding galaxyFN!: ', galaxyFn galaxyId = extractIdFromGalaxyFn(galaxyFn) uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn)) assert genome == 'hg18' tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) tfTrackName = tfTrackNameMappings[tfSource] #Get gene track assert geneSource == 'Ensembl' targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed' geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome) geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed') GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn ) assert upFlankSize == downFlankSize == 0 #Should instead extend regions to include flanks tcGeneRegsTempFn = uniqueWebPath + os.sep + 'tcGeneRegs.targetcontrol.bedgraph' #Think this will be okay, subtraction not necessary as targets are put first: controlGeneRegsTempFn = geneRegsFn #print targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn GalaxyInterface.combineToTargetControl(targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn) #tcGeneRegsExternalTN = ['external'] +galaxyId + [tcGeneRegsTempFn] tcGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc') #tcGeneRegsExternalTN = ['external'] +targetGalaxyId + [tcGeneRegsTempFn] #tcGeneRegsExternalTN = ['galaxy', externalId, tcGeneRegsTempFn] targetGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '1') controlGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '0') #pre-process print 'Pre-processing file: %s, with trackname: %s ' % (tcGeneRegsTempFn, tcGeneRegsExternalTN) ExternalTrackManager.preProcess(tcGeneRegsTempFn, tcGeneRegsExternalTN, 'targetcontrol.bedgraph',genome) 
print 'Pre-processing TN: ', targetGeneRegsExternalTN ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed',genome) print 'Pre-processing TN: ', controlGeneRegsExternalTN ExternalTrackManager.preProcess(controlGeneRegsTempFn, controlGeneRegsExternalTN, 'bed',genome) #print tcGeneRegsExternalTN trackName1, trackName2 = tfTrackName, tcGeneRegsExternalTN analysisDef = 'Categories differentially located in targets?: Which categories of track1-points fall more inside case than control track2-segments? [rawStatistic:=PointCountInsideSegsStat:]' +\ '[tf1:=SegmentToStartPointFormatConverter:] [tf2:=TrivialFormatConverter:]' +\ '-> DivergentRowsInCategoryMatrixStat' regSpec, binSpec = '*','*' #print 'skipping preproc!!' #ExternalTrackManager.preProcess(tcGeneRegsExternalTN[-1], tcGeneRegsExternalTN, 'targetcontrol.bedgraph', genome) #ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed', genome) GalaxyInterface.runManual([trackName1, trackName2], analysisDef, regSpec, binSpec, genome, printResults=True, printHtmlWarningMsgs=False)
def getAllBoundingRegions(self):
    """Yield the bounding regions of every chromosome in the extended list.

    Raises BoundingRegionsNotAvailableError (on first iteration, since this
    is a generator) when self.fileExists() is false.
    """
    if not self.fileExists():
        from gold.util.CommonFunctions import prettyPrintTrackName
        message = 'Bounding regions not available for track: ' + \
                  prettyPrintTrackName(self._trackName)
        raise BoundingRegionsNotAvailableError(message)

    for chrom in GenomeInfo.getExtendedChrList(self._genome):
        for boundingRegion in self.getAllBoundingRegionsForChr(chrom):
            yield boundingRegion
def getGenomicElements(genome, trackName):
    """Return [chrom, start, end] for every element of a track.

    Iterates over the chromosomes given by GenomeInfo.getChrList(genome),
    fetches the track view spanning each whole chromosome and collects one
    three-item list per element, in iteration order.
    """
    track = PlainTrack(trackName)
    genElements = []
    for chrom in GenomeInfo.getChrList(genome):
        chromLen = GenomeInfo.getChrLen(genome, chrom)
        region = GenomeRegion(genome, chrom, 0, chromLen)
        # Extend in place; concatenating a new list per element copies the
        # whole result each time, which is quadratic in the element count.
        genElements.extend([chrom, el.start(), el.end()]
                           for el in track.getTrackView(region))
    return genElements
def getNmerAndCleanedNmerTrackName(genome, trackName):
    """Return (nmer, cleaned track name) for an nmer track name.

    The nmer is the lower-cased last item of trackName. The cleaned name is
    a copy of trackName with that last item lower-cased; when it is exactly
    one item longer than GenomeInfo.getNmerTrackName(genome), an
    '<length>-mers' item is inserted before the nmer.
    """
    from quick.util.GenomeInfo import GenomeInfo
    from copy import copy

    cleanedTn = copy(trackName)
    nmer = cleanedTn[-1].lower()
    cleanedTn[-1] = nmer

    baseLen = len(GenomeInfo.getNmerTrackName(genome))
    if len(cleanedTn) == baseLen + 1:
        lengthItem = '%i-mers' % len(nmer)
        cleanedTn = cleanedTn[:-1] + [lengthItem] + cleanedTn[-1:]
    return nmer, cleanedTn
def isCompBin(region):
    """Tell whether region coincides with a single comp bin.

    Iterable inputs (per isIter) are never comp bins. Otherwise the region
    must start at offset 0 of its bin and have a length equal to the comp
    bin size, or to the remainder of the chromosome if that is shorter.
    """
    if isIter(region):
        return False

    binNumber = CompBinManager.getBinNumber(region.start)
    offsetOK = CompBinManager.getOffset(region.start, binNumber) == 0

    restOfChr = GenomeInfo.getChrLen(region.genome, region.chr) - region.start
    expectedLen = min(CompBinManager.getCompBinSize(), restOfChr)
    lengthOK = len(region) == expectedLen

    return offsetOK and lengthOK
def generateGenomeAnnotations(cls, abbrv):
    """Standardize the collected GFF annotation file of a genome, if present.

    When the collected GFF file exists, it is split via
    SplitFileToSubDirs.parseFiles (direction 'coll_to_std') and then copied
    to the standardized GFF path. Nothing happens if the file is missing.
    """
    fnSource = cls.getCollectedPathGFF(abbrv)
    if not os.path.exists(fnSource):
        return

    from quick.extra.StandardizeTrackFiles import SplitFileToSubDirs
    annotationTn = GenomeInfo.getGenomeAnnotationsTrackName(abbrv)
    SplitFileToSubDirs.parseFiles(abbrv, annotationTn, direction='coll_to_std',
                                  suffix='.gff', catSuffix='.category.gff',
                                  subTypeCol='2', depth='1', numHeaderLines='0')

    fnDest = cls.getStandardizedPathGFF(abbrv)
    ensurePathExists(fnDest)
    shutil.copyfile(fnSource, fnDest)
def retrieveTrack(self, regionTrackName, fastaFileName):
    """Extract the genome sequence of the regions in a GTrack history item.

    regionTrackName is a colon-separated Galaxy track name. Each element of
    the referenced GTrack file becomes a GenomeRegion, and the sequence
    track is extracted for those regions to fastaFileName in 'fasta'
    format. Returns whatever self._extractor.extract returns.
    """
    galaxyTn = regionTrackName.split(':')
    gtrackFn = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTn)
    regionList = [GenomeRegion(ge.genome, ge.chr, ge.start, ge.end)
                  for ge in GtrackGenomeElementSource(gtrackFn, self._genome)]
    sequenceTn = GenomeInfo.getSequenceTrackName(self._genome)
    return self._extractor.extract(sequenceTn, regionList, fastaFileName, 'fasta')
def execute(choices, galaxyFn=None, username=''):
    '''
    Called when the web user pushes the execute button.

    choices holds the selections of the option boxes: genome, nmer and
    region specification, in that order. The occurrences of the lower-cased
    nmer are extracted as 'point bed' to galaxyFn, which must not be None.
    '''
    genome, nmer, regSpec = choices[0], choices[1].lower(), choices[2]
    nmerItems = ['%i-mers' % len(nmer), nmer]
    trackName = GenomeInfo.getPropertyTrackName(genome, 'nmer') + nmerItems

    assert galaxyFn is not None
    GalaxyInterface.extractTrackManyBins(genome, trackName, regSpec, '*', True,
                                         'point bed', False, False, galaxyFn)
def getSubtypes(genome, trackName, fullAccess=False):
    """Return the subtype names below a track, sorted case-insensitively.

    A directory entry counts as a subtype unless it starts with '.' or '_',
    is a regular file, is a valid chromosome name, or is one of 'external'
    and 'ucsc'. Without full access, and unless the track is a literature
    track, subtypes whose TrackInfo is marked private are left out as well.
    """
    dirPath = createDirPath(trackName, genome)

    def _isSubtypeEntry(fn):
        # Same checks, in the same order, as the original filter expression.
        return fn[0] not in ['.', '_'] \
            and not os.path.isfile(dirPath + os.sep + fn) \
            and not GenomeInfo.isValidChr(genome, fn)

    # fixme, just temporarily: the 'external' and 'ucsc' dirs should start
    # with _
    subtypes = [fn for fn in ProcTrackOptions._getDirContents(genome, trackName)
                if _isSubtypeEntry(fn) and fn not in ['external', 'ucsc']]

    hidePrivate = not fullAccess and \
        not ProcTrackOptions._isLiteratureTrack(genome, trackName)
    if hidePrivate:
        subtypes = [st for st in subtypes
                    if not TrackInfo(genome, trackName + [st]).private]

    return sorted(subtypes, key=str.lower)
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a history item or stored track to galaxyFn.

    choices: [genome, source, history item, track item]. When source is
    'History', the history file is copied to a temporary collected path,
    read as a sorted BED or GTrack element source and passed to
    cls.WriteExpandedElementsToFile; the temporary file is removed
    afterwards. Otherwise the stored track is processed chromosome by
    chromosome, writing the header only for the first chromosome.

    The output file and the temporary copy are released even if processing
    raises.
    """
    outputFile = open(galaxyFn, 'w')
    try:
        genome = choices[0]
        histItem = choices[2]
        trackItem = choices[3]
        # NOTE(review): the result is unused; the call is kept in case it
        # has side effects or validates the genome - confirm and remove.
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)
        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'History':
            trackType = histItem.split(':')[1]
            # Random prefix plus timestamp makes the temporary file name
            # unique; kept in its own local so 'username' is not clobbered.
            randomPrefix = ''.join([chr(random.randint(97, 122)) for i in range(6)])
            timeStamp = '_'.join([str(v) for v in time.localtime()[:6]])
            tempFn = createCollectedPath(genome, [], randomPrefix + timeStamp + '.' + trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(histItem.split(':'))
            try:
                sourceFile = open(fnSource, 'r')
                try:
                    contents = sourceFile.read()
                finally:
                    sourceFile.close()
                tempFile = open(tempFn, 'w')
                try:
                    tempFile.write(contents)
                finally:
                    tempFile.close()

                if trackType in ['marked.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Do not leave the temporary copy behind on failure.
                if os.path.exists(tempFn):
                    os.remove(tempFn)
        else:
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                plTrack = PlainTrack(trackItem.split(':'))
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                writeHeaderFlag = False
    finally:
        outputFile.close()
def execute(cls, choices, galaxyFn=None, username=''):
    """Compose the non-overlapping category segments of a track as category BED.

    choices: [genome, colon-separated track name]. For each chromosome in
    GenomeInfo.getChrList(genome) the
    'ConvertToNonOverlappingCategorySegmentsPythonStat' analysis is run, and
    every 'Result' track view is composed to the output file with
    CategoryBedComposer.
    """
    from gold.origdata.TrackGenomeElementSource import TrackViewGenomeElementSource
    from gold.origdata.BedComposer import CategoryBedComposer

    genome = choices[0]
    trackName = choices[1].split(':')
    # The output target was previously undefined (its assignment had been
    # commented out), which made this method fail with a NameError. The
    # Galaxy output file is used instead.
    # NOTE(review): composeToFile is called once per result; confirm that it
    # appends rather than overwrites when several chromosomes have results.
    outFn = galaxyFn

    analysisDef = '-> ConvertToNonOverlappingCategorySegmentsPythonStat' #'Python'
    for regSpec in GenomeInfo.getChrList(genome):
        res = GalaxyInterface.runManual([trackName], analysisDef, regSpec, '*', genome,
                                        username=username, printResults=False,
                                        printHtmlWarningMsgs=False)
        for resDict in res.values():
            tvGeSource = TrackViewGenomeElementSource(genome, resDict['Result'], trackName)
            CategoryBedComposer(tvGeSource).composeToFile(outFn)
def __iter__(self):
    """Yield w1*value1 + w2*value2 for each position of the chromosome.

    The values come from the whole-chromosome track views of trackName1 and
    trackName2; iteration runs over the length of the first value array.
    """
    chrom = self.chr
    genome = self.genome
    firstTn, secondTn = self.trackName1, self.trackName2
    weight1, weight2 = self.w1, self.w2

    wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
    firstVals = PlainTrack(firstTn).getTrackView(wholeChr).valsAsNumpyArray()
    secondVals = PlainTrack(secondTn).getTrackView(wholeChr).valsAsNumpyArray()

    for pos in xrange(len(firstVals)):
        yield weight1*firstVals[pos] + weight2*secondVals[pos]
def __iter__(self):
    """Yield [pos, pos+1] for randomly selected positions of the chromosome.

    The track values of the whole chromosome are rescaled by their minimum
    and maximum; a position is yielded when a draw from r.runif(1) is below
    factor times its rescaled value.
    """
    from gold.application.RSetup import r

    chrom = self.chr
    tn, genome = self.trackName1, self.genome
    factor = self.factor

    wholeChr = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
    rawVals = PlainTrack(tn).getTrackView(wholeChr).valsAsNumpyArray()

    # Scale between 0 and 1.
    # NOTE(review): if the values are integers, 1/(max-min) may truncate to
    # 0 under Python 2 division, and max == min divides by zero - confirm
    # the dtype of the track values.
    lowest, highest = rawVals.min(), rawVals.max()
    scaledVals = (rawVals - lowest) * (1/(highest-lowest))

    for pos in xrange(len(scaledVals)):
        threshold = factor*scaledVals[pos]
        if r.runif(1) < threshold:
            yield [pos,pos+1]
def yielder(self, curTn):
    """Recursively yield the valid track names at and below curTn.

    Subtypes are visited first (skipping those starting with '.' or '_'),
    then curTn itself is yielded if it is a valid track. With
    self._avoidLiterature set, nothing is yielded for the literature
    property track.
    """
    if self._avoidLiterature:
        if curTn == GenomeInfo.getPropertyTrackName(self._genome, 'literature'):
            return

    for subtype in ProcTrackOptions.getSubtypes(self._genome, curTn, self._fullAccess):
        if subtype[0] not in ['.', '_']:
            for subTn in self.yielder(curTn + [subtype]):
                yield subTn

    if ProcTrackOptions.isValidTrack(self._genome, curTn, self._fullAccess):
        yield curTn
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """Store the bounding regions of a track in a BoundingRegionShelve.

    Bounding region tuples without a chromosome are ignored. If none
    remain, one whole-chromosome bounding region is created per
    pre-processed chromosome. After storing, the total element count of the
    shelve is checked against the collector for every chromosome involved;
    a mismatch raises ShouldNotOccurError.
    """
    collector = TrackInfoDataCollector(genome, trackName)
    geChrList = collector.getPreProcessedChrs(allowOverlaps)

    boundingRegionTuples = [brTuple for brTuple in collector.getBoundingRegionTuples(allowOverlaps)
                            if brTuple.region.chr is not None]

    if not boundingRegionTuples:
        # Fall back to treating each pre-processed chromosome as one region.
        for chrom in geChrList:
            wholeChr = GenomeRegion(chr=chrom, start=0, end=GenomeInfo.getChrLen(genome, chrom))
            numElements = collector.getNumElements(chrom, allowOverlaps)
            boundingRegionTuples.append(BoundingRegionTuple(wholeChr, numElements))

    brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    sparse = not collector.getTrackFormat().reprIsDense()
    brShelve.storeBoundingRegions(boundingRegionTuples, geChrList, sparse)

    boundingRegionChrs = set([brTuple.region.chr for brTuple in boundingRegionTuples])
    for chrom in boundingRegionChrs | set(geChrList):
        if brShelve.getTotalElementCount(chrom) == collector.getNumElements(chrom, allowOverlaps):
            continue
        raise ShouldNotOccurError("Error: The total element count for all bounding regions of chromosome '%s' is not equal to the number of genome elements of that chromosome. %s != %s" % \
                                  (chrom, brShelve.getTotalElementCount(chrom), collector.getNumElements(chrom, allowOverlaps)) )
def validateAndReturnErrors(choices):
    '''
    Validate the selected input parameters.

    Returns an error text if the parameters are not valid. The GUI shows
    this text to the user (if not empty) and greys out the execute button
    (also if the text is empty). Returns None if all parameters are valid,
    which enables the execute button.
    '''
    genome, tn, tf = ExtractIntersectingGenesTool._getBasicTrackFormat(choices)
    geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)

    if not ExtractIntersectingGenesTool._isValidTrack(choices):
        # An empty text disables the execute button without showing a
        # message. An unreachable second return used to follow here with the
        # text "The selected track (%s) is not valid."; it never executed
        # and has been removed.
        return ""

    if tf.split()[-1] not in ['points', 'segments']:
        return "The track format of the selected track must be either points or segments. Currently: %s" % tf

    if not ProcTrackOptions.isValidTrack(genome, geneRegsTrackName, True):
        return "The track used for gene ids (%s) is not valid. This is an internal error." % ':'.join(geneRegsTrackName)

    return None
def execute(cls, choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' genome = choices[0] nmer = choices[1].lower() regSpec = choices[2] analysisRegions = parseRegSpec(regSpec, genome) binSize = cls._calcBinSize(nmer, analysisRegions) binSpec = '*' if binSize is None else str( binSize ) numBins = len( AutoBinner(analysisRegions, binSize) ) from quick.application.GalaxyInterface import GalaxyInterface from quick.util.GenomeInfo import GenomeInfo trackName1 = GenomeInfo.getPropertyTrackName(genome, 'nmer') + [str(len(nmer))+'-mers',nmer] trackName2 = [''] analysisDef = 'Counts: The number of track1-points -> CountPointStat' #regSpec = '*' #print 'Using binSize: ',binSpec #print 'TN1: ',trackName1 from gold.result.HtmlCore import HtmlCore print str(HtmlCore().styleInfoBegin(styleClass='debug')) GalaxyInterface.run(trackName1, trackName2, analysisDef, regSpec, binSpec, genome, galaxyFn) print str(HtmlCore().styleInfoEnd()) plotFileNamer = GalaxyRunSpecificFile(['0','CountPointStat_Result_gwplot.pdf'], galaxyFn) textualDataFileNamer = GalaxyRunSpecificFile(['0','CountPointStat_Result.bedgraph'], galaxyFn) core = HtmlCore() core.paragraph('Inspect nmer frequency variation as a %s or as underlying %s.</p>' % ( plotFileNamer.getLink('plot'), textualDataFileNamer.getLink('textual data') )) core.divider() core.paragraph('The occurrence frequency of your specified nmer ("%s") has been computed along the genome, within your specified analysis region ("%s").' 
% (nmer, regSpec)) core.paragraph('The analysis region was divided into %i bins, based on calculations trying to find appropriate bin size (get enough data per bin and restrict maximum number of bins).' % numBins) trackName1modified = trackName1[0:-2] + trackName1[-1:] preSelectedAnalysisUrl = createHyperBrowserURL(genome, trackName1modified,[''], analysis='Counts',method='auto',region=regSpec, binsize=binSpec) core.divider() core.paragraph('If you do not find the inferred bin size to be appropriate, you can set this manually in a ' + str(HtmlCore().link('new analysis', preSelectedAnalysisUrl)) + '.') print str(core)
def _createNmerTrack(self, nmerList, lowerOrder=None): nmerLengths = list(set([len(nmer) for nmer in nmerList])) assert len(nmerLengths)==1 chainOrder = lowerOrder if lowerOrder is not None else nmerLengths[0] regionList = [GenomeRegion(self._genome, chr, 0, GenomeInfo.getChrLen(self._genome, chr) ) for chr in GenomeInfo.getChrList(self._genome)] for region in regionList: print '|', chains = SameValueIndexChainsFactory.load(self._createPath(chainOrder), region.chr) for nmer in nmerList: if len(nmerList) > 1: print '.', if lowerOrder is not None: nmerPrefix = nmer[0:chainOrder] rawIndexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmerPrefix)) indexGenerator = LowerOrderChainWrapper(rawIndexGenerator, nmerPrefix, nmer, self._genome, region.chr) else: indexGenerator = chains.getIndexGenerator(NmerTools.nmerAsInt(nmer)) #print 'Length of lower order chain: %i and %i' % (sum(1 for x in indexGenerator), sum(1 for x in indexGenerator)) #print 'Length of wrapped chain: %i and %i' % (sum(1 for x in wrappedIndexGenerator), sum(1 for x in wrappedIndexGenerator)) PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [region], \ self._getNmerGeSourceForChr, finalize=False, preProcess=True, \ indexGenerator=indexGenerator).process() for nmer in nmerList: try: PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), regionList, \ self._getNmerGeSourceForChr, preProcess=False, finalize=True, \ indexGenerator=[0]).process() except EmptyGESourceError: PreProcessCustomTrackJob(self._genome, self._createTrackName(nmer), [GenomeRegion(self._genome, regionList[0].chr, -1, 0)], \ self._getNmerGeSourceForChr, preProcess=True, finalize=True, \ indexGenerator=[-1]).process() return
def nextBin(self):
    """Yield the bins of every region in self._userBinSource.

    A region starts at its own start (0 if None) and ends at the smaller of
    its end and the chromosome length, ignoring whichever is None; the
    chromosome length is only looked up when the region has a genome. With
    self._binLen unset the whole region is one bin, otherwise it is cut
    into consecutive bins of self._binLen, the last one possibly shorter.
    """
    for region in self._userBinSource:
        if region.start is not None:
            start = region.start
        else:
            start = 0

        if region.genome is not None:
            chrLen = GenomeInfo.getChrLen(region.genome, region.chr)
        else:
            chrLen = None

        endCandidates = []
        for candidate in (region.end, chrLen):
            if candidate is not None:
                endCandidates.append(candidate)
        regEnd = min(endCandidates)

        if self._binLen is None:
            yield GenomeRegion(region.genome, region.chr, start, regEnd)
            continue

        while start < regEnd:
            binEnd = min(start + self._binLen, regEnd)
            yield GenomeRegion(region.genome, region.chr, start, binEnd)
            start += self._binLen
def execute(cls, choices, galaxyFn=None, username=''):
    """Run cls.PrintResultToHistItem for a track and a binning history item.

    choices: [genome, source, track, binning history item]. When source is
    'History' the track is resolved to its pre-processed track name,
    otherwise the colon-separated name is used directly. The binning file
    must be of type bed, marked.bed, category.bed or gtrack; any other type
    raises InvalidFormatError.
    """
    from quick.application.ExternalTrackManager import ExternalTrackManager

    genome = choices[0]
    if choices[1] == 'History':
        preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, choices[2].split(':'))
    else:
        preProcTN1 = choices[2].split(':')

    # NOTE(review): chrSizeDict is not used below; it is kept so that the
    # same GenomeInfo calls are made as before.
    chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                        for chrom in GenomeInfo.getChrList(genome)])

    trackType = choices[3].split(':')[1]
    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[3].split(':'))

    if trackType == 'gtrack':
        unsortedSource = GtrackGenomeElementSource(fnSource, genome=genome)
    elif trackType in ['marked.bed', 'category.bed', 'bed']:
        unsortedSource = BedGenomeElementSource(fnSource, genome=genome)
    else:
        raise InvalidFormatError('The Binning must be of the following formats: gtrack, marked.bed, category.bed ,bed ...')

    geSource = GenomeElementSorter(unsortedSource).__iter__()
    cls.PrintResultToHistItem( galaxyFn, geSource, preProcTN1, genome, username)
def createGenome(cls, genome, fullName, chromNamesDict, standardChromosomes, username=''):
    """Install a genome: split the FASTA, create support files, pre-process.

    The sequence track is split into chromosomes using chromNamesDict and
    all tracks are pre-processed. Then the chromosome file (from
    standardChromosomes) and the assembly gaps file are created and all
    tracks are pre-processed a second time. Progress is printed along the
    way. fullName and username are not used here.
    """
    def _preProcessAllTracks():
        # Runs both before and after the support files are created.
        print("Processing genome")
        PreProcessAllTracksJob(genome).process()

    basePath = cls.getBasePath(genome)
    sequenceTn = GenomeInfo.getSequenceTrackName(genome)

    print("Splitting genome file into chromosomes.")
    SplitFasta.parseFiles(genome, sequenceTn, chromNamesDict=chromNamesDict)
    _preProcessAllTracks()

    print("Creating chromosome file")
    stdChrsStr = ",".join(standardChromosomes)
    createChromosomeFile(genome, stdChrsStr)

    print("Creating assembly gaps file")
    createAssemblyGapsFile(genome)
    _preProcessAllTracks()

    print(genome + " genome added")