def __iter__(self):
    """Yield the weighted sum of two tracks, position by position.

    For every index of the first track's value array over the whole
    chromosome (region 0 .. chromosome length), yields
    ``w1 * vals1[i] + w2 * vals2[i]``.
    """
    genome, chromosome = self.genome, self.chr
    nameA, nameB = self.trackName1, self.trackName2
    weightA, weightB = self.w1, self.w2

    wholeChrom = GenomeRegion(genome, chromosome, 0,
                              GenomeInfo.getChrLen(genome, chromosome))

    # Load the value arrays of both tracks over the full chromosome.
    valuesA = PlainTrack(nameA).getTrackView(wholeChrom).valsAsNumpyArray()
    valuesB = PlainTrack(nameB).getTrackView(wholeChrom).valsAsNumpyArray()

    # The length of the first track drives the iteration.
    numPositions = len(valuesA)
    pos = 0
    while pos < numPositions:
        yield weightA * valuesA[pos] + weightB * valuesB[pos]
        pos += 1
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'

    Write 'assemblyGaps.bed' for the genome's 'gaps' property track.

    Every maximal run of sequence positions whose character is not in
    assemblyChars is written as one tab-separated line: chromosome, start,
    end (end is the index after the last gap position).

    genome        -- genome identifier passed on to GenomeInfo / GenomeRegion.
    assemblyChars -- characters regarded as assembled sequence; every other
                     character counts as a gap.
    """
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    # The with-block guarantees the file is closed even if track access fails.
    with open(outFn, 'w') as outFile:
        seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))
        anyGaps = False

        for chrom in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
            seq = seqTrack.getTrackView(chrRegion).valsAsNumpyArray()

            # Indexes of all positions that match none of the assembly characters.
            isAssembled = numpy.logical_or.reduce([seq == x for x in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssembled)]

            # A difference of 1 between neighbouring gap indexes means both
            # belong to the same run.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            sameRun = gapIndexDiff == 1
            diffPositions = numpy.arange(len(gapIndexDiff))

            # Run starts: drop every index that continues the previous one.
            gapBeginIndexes = numpy.delete(gapIndexes, (diffPositions + 1)[sameRun])
            # Run ends (exclusive): drop every index that is continued by the next one.
            gapEndIndexes = numpy.delete(gapIndexes + 1, diffPositions[sameRun])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for gapBegin, gapEnd in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                outFile.write('\t'.join([chrom, str(gapBegin), str(gapEnd)]) + os.linesep)

        if not anyGaps:
            # No gaps found at all: write a single placeholder line
            # (presumably so that the file is never empty -- confirm).
            outFile.write('\t'.join([GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
def __iter__(self):
    """Iterate over the position-wise linear combination of two tracks.

    Both tracks are read over the whole chromosome self.chr of self.genome;
    each yielded item is w1 times the first track's value plus w2 times the
    second track's value at the same index.
    """
    chrom = self.chr
    trackName1, trackName2 = self.trackName1, self.trackName2
    w1, w2 = self.w1, self.w2
    genome = self.genome

    fullRegion = GenomeRegion(genome, chrom, 0,
                              GenomeInfo.getChrLen(genome, chrom))

    def loadValues(trackName):
        # Value array of the named track over the full chromosome.
        return PlainTrack(trackName).getTrackView(fullRegion).valsAsNumpyArray()

    firstVals = loadValues(trackName1)
    secondVals = loadValues(trackName2)

    for index in xrange(len(firstVals)):
        yield w1 * firstVals[index] + w2 * secondVals[index]
def getAnchor(genome, trackName):
    """Return the track view's genomeAnchor, as a string, for each chromosome.

    The list follows the order of GenomeInfo.getChrList(genome); every track
    view spans the whole chromosome (0 .. chromosome length).
    """
    track = PlainTrack(trackName)

    def anchorFor(chrom):
        # String form of the anchor of the full-chromosome track view.
        wholeChrom = GenomeRegion(genome, chrom, 0,
                                  GenomeInfo.getChrLen(genome, chrom))
        return str(track.getTrackView(wholeChrom).genomeAnchor)

    return [anchorFor(chrom) for chrom in GenomeInfo.getChrList(genome)]
def getNumberElements(genome, trackName):
    """Return the number of track elements on each chromosome.

    The count is the length of the track view's start-position array for the
    whole chromosome; the list follows GenomeInfo.getChrList(genome).
    """
    track = PlainTrack(trackName)
    counts = []
    for chrom in GenomeInfo.getChrList(genome):
        wholeChrom = GenomeRegion(genome, chrom, 0,
                                  GenomeInfo.getChrLen(genome, chrom))
        elementStarts = track.getTrackView(wholeChrom).startsAsNumpyArray()
        counts.append(len(elementStarts))
    return counts
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a track to the file galaxyFn.

    choices[0] -- genome.
    choices[1] -- 'history' to read the input from a Galaxy history item,
                  anything else to read the track named by choices[3].
    choices[2] -- colon-separated Galaxy track name of the history item; its
                  second field is used as file type / suffix.
    choices[3] -- colon-separated track name (non-history input).

    The incoming username argument is not used.
    """
    from gold.util.RandomUtil import random

    # The with-block closes the output file even if processing fails.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        histItem = choices[2]
        trackItem = choices[3]
        # NOTE(review): the result is unused; the call is kept in case the
        # lookup validates the genome -- confirm and remove.
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)
        chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                           for chrom in GenomeInfo.getChrList(genome))
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = histItem.split(':')[1]
            # Six random lowercase letters plus a timestamp make the temp name unique.
            randomPrefix = ''.join([chr(random.randint(97, 122)) for i in range(6)])
            tempFn = createCollectedPath(
                genome, [],
                randomPrefix + '_'.join([str(v) for v in time.localtime()[:6]]) + '.' + trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(histItem.split(':'))
            try:
                # Copy the history file so it carries the suffix of its track type.
                with open(fnSource, 'r') as sourceFile:
                    content = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(content)

                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                # NOTE(review): gtrack input is disabled here; for any other
                # track type geSource stays None when passed on below.

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Never leave the temporary copy behind, not even on failure.
                if os.path.exists(tempFn):
                    os.remove(tempFn)
        else:
            plTrack = PlainTrack(trackItem.split(':'))
            # Only the first chromosome's output carries the header.
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                writeHeaderFlag = False
def getSegmentSizes(genome, trackName):
    """Return the summed segment length of the track for each chromosome.

    For every chromosome in GenomeInfo.getChrList(genome) the lengths
    (end - start) of all segments in the whole-chromosome track view are
    added up; the result is a list with one plain Python number per
    chromosome, in chromosome-list order.
    """
    track = PlainTrack(trackName)
    sumSegmentSize = []
    for chrom in GenomeInfo.getChrList(genome):
        region = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
        tv = track.getTrackView(region)
        sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
        # tolist() turns the numpy scalar into a plain Python number.
        sumSegmentSize.append(sizeSegments.sum().tolist())
    return sumSegmentSize
def _addPeaks(self):
    """Append a Peak to self.peaks for every (start, end) pair of the track.

    Iterates the regions of GlobalBinSource(genome) and stops once the
    counter i exceeds 2.
    """
    #trackName = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, self.trackName)
    track = PlainTrack(self.trackName)
    # NOTE(review): `genome` is not defined inside this method; presumably a
    # module-level name -- confirm (self.genome may have been intended).
    chromRegs = GlobalBinSource(genome)
    i = 0
    for region in chromRegs:
        # Hard limit on the number of regions processed (looks like a
        # debugging restriction -- confirm).
        if i > 2:
            break
        tv = track.getTrackView(region)
        starts = tv.startsAsNumpyArray()
        ends = tv.endsAsNumpyArray()
        for (start, end) in zip(starts, ends):
            self.peaks.append(Peak(self, region.chr, start, end))
        # NOTE(review): assumed to count regions, not peaks -- verify the
        # nesting of this increment against the original layout.
        i += 1
def execute(choices, galaxyFn=None, username=''):
    '''
    Is called when the execute button is pushed by the web user. Should print
    output as HTML to standard out, which will be directed to a results page
    in the Galaxy history. If getOutputFormat is anything other than HTML, the
    output should be written to the file with path galaxyFn. If needed,
    StaticFile can be used to get a path where additional files can be put
    (e.g. generated image files). choices is a list of the selections made by
    the web user in each option box.

    This test tool fetches the same 1000 bp of the hg19 chr1 DNA sequence
    track 500 times and prints the last fetched sequence.
    '''
    # The heading is now properly closed (was "<h2>Test tool<h2>").
    print("<h2>Test tool</h2>")
    fastaTrack = PlainTrack(['Sequence', 'DNA'])
    # The repeated fetch of one fixed region presumably serves as a
    # track-access stress test -- confirm.
    for i in range(0, 500):
        seqTv = fastaTrack.getTrackView(
            GenomeRegion("hg19", "chr1", 1000000, 1001000))
        sequence = seqTv.valsAsNumpyArray()
    print(sequence)
def __iter__(self):
    """Yield random single-position intervals weighted by the track values.

    The values of track self.trackName1 over the whole chromosome are scaled
    linearly to the range 0..1. For every position a number is drawn with
    r.runif(1); the interval [pos, pos + 1] is yielded when that number is
    below self.factor times the scaled value.

    Nothing is yielded for an empty track or when all values are equal, since
    the scaling is undefined in those cases.
    """
    from gold.application.RSetup import r

    chrom = self.chr
    trackName1, genome = self.trackName1, self.genome
    factor = self.factor

    region = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
    vals1 = PlainTrack(trackName1).getTrackView(region).valsAsNumpyArray()

    # min()/max() raise on an empty array.
    if len(vals1) == 0:
        return

    # Scale between 0 and 1.
    minVal, maxVal = vals1.min(), vals1.max()
    if maxVal == minVal:
        # Constant track: avoid dividing by zero.
        return
    # 1.0 forces true division; a plain 1 gives integer division (i.e. 0)
    # for integer-valued tracks under Python 2.
    vals1 = (vals1 - minVal) * (1.0 / (maxVal - minVal))

    for pos in xrange(len(vals1)):
        if r.runif(1) < factor * vals1[pos]:
            yield [pos, pos + 1]
def getGenomicElements(genome, trackName): track = PlainTrack(trackName) genElements = [] for chrom in GenomeInfo.getChrList(genome): chromLen = GenomeInfo.getChrLen(genome, chrom) region = GenomeRegion(genome, chrom, 0, chromLen) tv = track.getTrackView(region) for el in tv: #print chrom, el.start(), el.end() #, el.name() genElements = genElements + [[chrom, el.start(), el.end()]] return genElements #print numpy.version.version # 1.7.1 !! #unique, counts = numpy.unique(segmentSize, return_counts=True) # This is for numpy 1.9 #print numpy.asarray((unique, counts)).T '''track.setFormatConverter('SegmentToMidPointFormatConverter')
def __iter__(self):
    """Yield the positions of the lower-order chain where the full n-mer occurs.

    For each position from self._lowerOrderChain the genome sequence of
    len(self._fullNmer) characters starting there is read, lowercased and
    compared with self._fullNmer; matching positions are yielded.

    Raises AssertionError if the sequence at a chain position does not start
    with the first len(self._nmerPrefix) characters of self._fullNmer.
    """
    # Imported locally, as in the original (presumably to avoid circular
    # imports -- confirm); done once instead of once per position.
    from gold.track.Track import PlainTrack
    from quick.util.GenomeInfo import GenomeInfo
    from gold.track.GenomeRegion import GenomeRegion

    # Loop-invariant set-up, hoisted out of the per-position loop.
    track = PlainTrack(GenomeInfo.getSequenceTrackName(self._genome))
    fullNmer = self._fullNmer
    nmerLen = len(fullNmer)
    pl = len(self._nmerPrefix)
    expectedPrefix = fullNmer[0:pl]

    for pos in self._lowerOrderChain:
        region = GenomeRegion(self._genome, self._chr, pos, pos + nmerLen)
        fullSubstring = (''.join(
            track.getTrackView(region).valsAsNumpyArray())).lower()

        assert expectedPrefix == fullSubstring[0:pl], \
            'The prefix of lower order does not match at the positions given by the chain. %s vs %s. Region: %s' % (
                expectedPrefix, fullSubstring[0:pl], region)

        if fullNmer == fullSubstring:
            yield pos
def __iter__(self):
    """Yield random single-position intervals weighted by the track values.

    The values of track self.trackName1 over the whole chromosome are scaled
    linearly to the range 0..1. For every position a number is drawn with
    r.runif(1); the interval [pos, pos + 1] is yielded when that number is
    below self.factor times the scaled value.

    Nothing is yielded for an empty track or when all values are equal, since
    the scaling is undefined in those cases.
    """
    from proto.RSetup import r

    chrom = self.chr
    trackName1, genome = self.trackName1, self.genome
    factor = self.factor

    region = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
    vals1 = PlainTrack(trackName1).getTrackView(region).valsAsNumpyArray()

    # min()/max() raise on an empty array.
    if len(vals1) == 0:
        return

    # Scale between 0 and 1.
    minVal, maxVal = vals1.min(), vals1.max()
    if maxVal == minVal:
        # Constant track: avoid dividing by zero.
        return
    # 1.0 forces true division; a plain 1 gives integer division (i.e. 0)
    # for integer-valued tracks under Python 2.
    vals1 = (vals1 - minVal) * (1.0 / (maxVal - minVal))

    for pos in xrange(len(vals1)):
        if r.runif(1) < factor * vals1[pos]:
            yield [pos, pos + 1]
def getMutatedSequence(cls, genome, regionDict, pointDict=None):
    """Return FASTA-style entries of the given regions with point mutations applied.

    genome     -- genome identifier used for the regions.
    regionDict -- maps chromosome to an iterable of (start, end) pairs.
    pointDict  -- optional; maps chromosome to entries of the form
                  (regionStart, index, value). An entry applies to the region
                  whose start equals regionStart and replaces the character at
                  `index` of that region's sequence. If value contains '>',
                  only its last character is used. Chromosomes missing from
                  pointDict get no mutations.

    Returns a defaultdict(list) mapping chromosome to strings of the form
    '>chrom start+1-end\\nsequence'.
    """
    resultDict = defaultdict(list)
    fastaTrack = PlainTrack(['Sequence', 'DNA'])

    for chrom, regions in regionDict.items():
        # A chromosome without registered points no longer raises KeyError.
        chromPoints = pointDict.get(chrom, []) if pointDict else []

        for start, end in regions:
            seqTv = fastaTrack.getTrackView(GenomeRegion(genome, chrom, start, end))
            valList = list(seqTv.valsAsNumpyArray())

            for point in chromPoints:
                # Only the points registered for this very region apply.
                if point[0] != start:
                    continue
                index, val = point[1:]
                if '>' in val:
                    val = val[-1]
                valList[index] = val

            resultDict[chrom].append(
                '>%s %i-%i\n%s' % (chrom, start + 1, end, ''.join(valList)))

    return resultDict
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a track to the file galaxyFn.

    choices[0] -- genome.
    choices[1] -- 'history' to read the input from a Galaxy history item,
                  anything else to read the track named by choices[3].
    choices[2] -- colon-separated Galaxy track name of the history item; its
                  second field is used as file type / suffix.
    choices[3] -- colon-separated track name (non-history input).

    The username argument is not used.
    """
    # The with-block closes the output file even if processing fails.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        histItem = choices[2]
        trackItem = choices[3]
        # NOTE(review): the result is unused; the call is kept in case the
        # lookup validates the genome -- confirm and remove.
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)
        chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                           for chrom in GenomeInfo.getChrList(genome))
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = histItem.split(':')[1]
            from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
            tempFn = GalaxyRunSpecificFile(['fromHistory.' + trackType], galaxyFn).getDiskPath(True)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(histItem.split(':'))
            try:
                # Copy the history file so it carries the suffix of its track type.
                with open(fnSource, 'r') as sourceFile:
                    content = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(content)

                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(
                        GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    # Put every '##' header line on a line of its own.
                    headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Never leave the temporary copy behind, not even on failure.
                if os.path.exists(tempFn):
                    os.remove(tempFn)
        else:
            plTrack = PlainTrack(trackItem.split(':'))
            # Only the first chromosome's output carries the header.
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                writeHeaderFlag = False
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the expanded elements of a track to the file galaxyFn.

    choices[0] -- genome.
    choices[1] -- 'History' to read the input from a Galaxy history item,
                  anything else to read the track named by choices[3].
    choices[2] -- colon-separated Galaxy track name of the history item; its
                  second field is used as file type / suffix.
    choices[3] -- colon-separated track name (non-history input).

    The incoming username argument is not used.
    """
    # The with-block closes the output file even if processing fails.
    with open(galaxyFn, 'w') as outputFile:
        genome = choices[0]
        histItem = choices[2]
        trackItem = choices[3]
        # NOTE(review): the result is unused; the call is kept in case the
        # lookup validates the genome -- confirm and remove.
        chromRegsPath = GenomeInfo.getChrRegsFn(genome)
        chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                           for chrom in GenomeInfo.getChrList(genome))
        geSource = headLinesStr = None

        if choices[1] == 'History':
            trackType = histItem.split(':')[1]
            # Six random lowercase letters plus a timestamp make the temp name unique.
            randomPrefix = ''.join([chr(random.randint(97, 122)) for i in range(6)])
            tempFn = createCollectedPath(
                genome, [],
                randomPrefix + '_'.join([str(v) for v in time.localtime()[:6]]) + '.' + trackType)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(histItem.split(':'))
            try:
                # Copy the history file so it carries the suffix of its track type.
                with open(fnSource, 'r') as sourceFile:
                    content = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(content)

                if trackType in ['marked.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(
                        BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(
                        GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    # Put every '##' header line on a line of its own.
                    headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag=True)
            finally:
                # Never leave the temporary copy behind, not even on failure.
                if os.path.exists(tempFn):
                    os.remove(tempFn)
        else:
            plTrack = PlainTrack(trackItem.split(':'))
            # Only the first chromosome's output carries the header.
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile,
                                                headLinesStr, writeHeaderFlag)
                writeHeaderFlag = False
#create a track track = PlainTrack(['Genes and gene subsets', 'Genes', 'Refseq']) #track = PlainTrack(['DNA structure','Bendability']) #create a region of interest region = GenomeRegion('hg18', 'chr1', 1000, 900000) #Could instead have been iterator of regions, e.g. genome-wide: #from quick.application.UserBinSource import UserBinSource #regionIter = UserBinSource('*','*','hg18') #for region in regionIter: # track.getTrackView(region): #print 'Last region of iter: ', region #iterate through elements of the track in this region trackView = track.getTrackView(region) for element in trackView: #just print the intervals for now.. print element.start(), element.end() tv = track.getTrackView(region) print 'Number of elements in region, the slow way: ', len( [element for element in tv]) print 'Number of elements in region, the fast way: ', len( tv.startsAsNumpyArray()) print 'Bp coverage by elements in the region, the slow way: ', sum( element.end() - element.start() for element in tv) print 'Bp coverage by elements in the region, the fast way: ', tv.endsAsNumpyArray( ).sum() - tv.startsAsNumpyArray().sum()
#create a track track = PlainTrack(['Genes and gene subsets','Genes','Refseq']) #track = PlainTrack(['DNA structure','Bendability']) #create a region of interest region = GenomeRegion('hg18','chr1',1000,900000) #Could instead have been iterator of regions, e.g. genome-wide: #from quick.application.UserBinSource import UserBinSource #regionIter = UserBinSource('*','*','hg18') #for region in regionIter: # pass #print 'Last region of iter: ', region #iterate through elements of the track in this region for element in track.getTrackView(region): #just print the intervals for now.. print element.start(), element.end() tv = track.getTrackView(region) print 'Number of elements in region, the slow way: ', len([element for element in tv]) print 'Number of elements in region, the fast way: ', len(tv.startsAsNumpyArray()) print 'Bp coverage by elements in the region, the slow way: ', sum(element.end()-element.start() for element in tv) print 'Bp coverage by elements in the region, the fast way: ', tv.endsAsNumpyArray().sum() - tv.startsAsNumpyArray().sum() trackExplanation = \ ''' A Track object loads the appropriate preprocessed data based on a track name. Calling the method getTrackView gives an object (really of class TrackView) that is used simply to iterate through all track elements of the given genome region. A track element (of class TrackElement) has methods start,end,val,strand. Some of these will typically be None, depending on the format of the requested track (e.g. for Segments the method val will return None..)