def _build_trees_by_chrom(blocks, verbose=False):
    """
    Group genome alignment blocks by chromosome and index each group.

    :param blocks: iterable of genome alignment blocks; this function reads
                   the ``chrom`` attribute of each block to group them.
    :param verbose: if True, progress messages are written to sys.stderr.
    :return: a dictionary keyed by chromosome name; each value is the
             interval tree (constructed with openEnded=True) built from
             the blocks on that chromosome.
    """
    def report(message):
        # progress output is only produced in verbose mode
        if verbose:
            sys.stderr.write(message)

    report("separating blocks by chromosome... ")
    grouped = {}
    for block in blocks:
        grouped.setdefault(block.chrom, []).append(block)
    report("done\n")

    report("building interval trees by chromosome... ")
    trees = {}
    for chrom_name, members in grouped.items():
        trees[chrom_name] = IntervalTree(members, openEnded=True)
    report("done\n")
    return trees
def populate(self, filehandle):
    """
    Load this block's wig elements from a file and index them.

    Seeks ``filehandle`` to ``self.fileLocation`` and parses one wig element
    per non-blank line, appending each to ``self.data``. Reading stops at the
    first element that is on a different chromosome than ``self.chrom`` or
    that starts beyond ``self.end`` (or at end of file). An interval tree
    over ``self.data`` is then stored in ``self.iTree``.

    :param filehandle: seekable, line-iterable stream containing wig data.
    """
    filehandle.seek(self.fileLocation)
    if self.debug:
        sys.stderr.write("populating " + str(self) + "\n")

    for line in filehandle:
        line = line.strip()
        if self.debug:
            sys.stderr.write("\t" + "current line is " + str(line) + "\n")
        if line == "":
            continue
        e = parseWigString(line)

        # we're done if we've left this block's chrom, or if we've moved
        # beyond the end of this block's boundary.
        if e.chrom != self.chrom or e.start > self.end:
            break
        self.data.append(e)

    if self.debug:
        sys.stderr.write("built tree for " + str(self) + "\n")
    if not self.data:
        # parenthesised single-argument form behaves the same as the old
        # print statement on Python 2 and is valid syntax on Python 3
        print("empty! --> " + str(self))
    self.iTree = IntervalTree(self.data, openEnded=True)
def __load(self, verbose=False):
    """
    @summary: read every element of the wig file named by self.filename and
              store one interval tree per chromosome in self.itrees
    """
    grouped = {}
    for elem in wigIterator(self.filename, verbose=verbose):
        grouped.setdefault(elem.chrom, []).append(elem)

    for name, members in grouped.items():
        self.itrees[name] = IntervalTree(members, openEnded=True)
def _buildTree(self, weights, candidates):
    """
    @summary: build an interval tree in which candidates[i] owns the span
              from the sum of the weights before it up to that sum plus
              weights[i]
    """
    spans = []
    lower = 0.0
    for idx, weight in enumerate(weights):
        chosen = candidates[idx]
        upper = lower + weight
        spans.append(WeightedRandom.Interval(lower, upper, chosen))
        # the next span starts where this one stopped
        lower = upper
    return IntervalTree(spans)
def intervalTrees(reffh, scoreType=int, verbose=False):
    """
    Build a dictionary of interval trees indexed by chrom from a BED stream or
    file

    :param reffh: This can be either a string, or a stream-like object. In the
                  former case, it is treated as a filename and the file is
                  opened here and closed again once it has been read. The
                  format of the file/stream must be BED.
    :param scoreType: The data type for scores (the fifth column) in the BED
                      file.
    :param verbose: output progress messages to sys.stderr if True. Unless
                    the stream is sys.stdin, loading progress is shown, which
                    requires the stream to have a ``name`` attribute.
    :return: dictionary mapping chromosome name to an interval tree (built
             with openEnded=True) of the elements on that chromosome.
    """
    openedHere = isinstance(reffh, str)
    fh = open(reffh) if openedHere else reffh
    showLoadProgress = verbose and fh is not sys.stdin

    # load all the regions and split them into lists for each chrom
    elements = {}
    try:
        if showLoadProgress:
            totalLines = linesInFile(fh.name)
            pind = ProgressIndicator(totalToDo=totalLines,
                                     messagePrefix="completed",
                                     messageSuffix="of loading " + fh.name)
        for element in BEDIterator(fh, scoreType=scoreType, verbose=verbose):
            if element.chrom not in elements:
                elements[element.chrom] = []
            elements[element.chrom].append(element)
            if showLoadProgress:
                pind.done += 1
                pind.showProgress()
    finally:
        # only close what was opened here; a caller-supplied stream stays
        # under the caller's control
        if openedHere:
            fh.close()

    # create an interval tree for each list
    trees = {}
    if verbose:
        totalLines = len(elements)
        pind = ProgressIndicator(totalToDo=totalLines,
                                 messagePrefix="completed",
                                 messageSuffix="of making interval trees")
    for chrom in elements:
        trees[chrom] = IntervalTree(elements[chrom], openEnded=True)
        if verbose:
            pind.done += 1
            pind.showProgress()

    return trees
def __init__(self, whole_chrom_files, partial_chrom_files, factory):
    """
    Constructor; see class docstring for param details.

    Builds one interval tree per chromosome from the (chrom, start, end) keys
    of partial_chrom_files and stores them in self.partial_trees.

    :raise GenomeAlignmentError: if a chromosome appears both in
                                 whole_chrom_files and as part of a key in
                                 partial_chrom_files, or if any partial key
                                 does not intersect exactly one entry in the
                                 tree for its chromosome.
    """
    self.current = None
    self.current_key = None
    self.factory = factory
    self.whole_chrom_files = whole_chrom_files
    self.partial_trees = {}

    by_chrom = {}
    for chrom, start, end in partial_chrom_files:
        k = (chrom, start, end)
        v = partial_chrom_files[k]
        if chrom in whole_chrom_files:
            raise GenomeAlignmentError("chromosome " + str(chrom) +
                                       " is listed in both whole_chrom_files "
                                       "and partial_chrom_files")
        if chrom not in by_chrom:
            by_chrom[chrom] = []
        interval = GenomicInterval(chrom, start, end)
        by_chrom[chrom].append(JITGenomeAlignmentKeyInterval(interval, v))

    for chrom in by_chrom:
        self.partial_trees[chrom] = IntervalTree(by_chrom[chrom])

    # every partial key must intersect exactly one entry in its tree
    for chrom, start, end in partial_chrom_files:
        hits = self.partial_trees[chrom].intersectingInterval(start, end)
        if len(hits) != 1:
            raise GenomeAlignmentError("expected exactly one partial entry "
                                       "intersecting " + str(chrom) + ":" +
                                       str(start) + "-" + str(end) +
                                       ", but found " + str(len(hits)))
def intervalTreesFromList(inElements, verbose=False, openEnded=False):
    """
    Build a dictionary, indexed by chrom name, of interval trees for each
    chrom.

    :param inElements: list of genomic intervals. Members of the list must
                       have chrom, start and end fields; no other
                       restrictions.
    :param verbose: output progress messages to sys.stderr if True
    :param openEnded: passed through unchanged as the second positional
                      argument of each IntervalTree that is constructed.
    :return: dictionary mapping chrom name to the interval tree built from
             the members of inElements on that chrom.
    """
    grouped = {}
    progress = None
    if verbose:
        progress = ProgressIndicator(totalToDo=len(inElements),
                                     messagePrefix="completed",
                                     messageSuffix="of parsing")
    for item in inElements:
        grouped.setdefault(item.chrom, []).append(item)
        if verbose:
            progress.done += 1
            progress.showProgress()

    # one tree per chromosome bucket
    result = {}
    if verbose:
        progress = ProgressIndicator(totalToDo=len(grouped),
                                     messagePrefix="completed",
                                     messageSuffix="of making interval trees")
    for name, members in grouped.items():
        result[name] = IntervalTree(members, openEnded)
        if verbose:
            progress.done += 1
            progress.showProgress()

    return result
def build(self):
    """
    Build the block index for the wig data readable from self.handle.

    Reads self.handle line by line from its current position, parsing each
    non-blank line as a wig element and grouping consecutive elements into
    WigBlock objects. A new block is started when there is no current block,
    when the current block reports isfull(), or when the chromosome changes.
    Finished blocks are collected per chromosome in self.blocksByChrom and
    an interval tree of the blocks for each chromosome is stored in
    self.itrees.

    If self.verbose is set but a progress indicator cannot be created for
    the handle, a warning is written to sys.stderr and self.verbose is
    switched off.

    :raise IndexedWigError: if an element's chromosome compares lower than
                            one already seen, or an element starts before
                            the end of the previous element on the same
                            chromosome.
    """
    currentBlock = None
    at = self.handle.tell()
    seenChroms = set()
    maxChromSeen = None
    lastIndexSeen = -1

    def closeBlock(block):
        # file a finished block under its chromosome
        if self.debug:
            sys.stderr.write("closed block: " + str(block) + "\n")
        if block.chrom not in self.blocksByChrom:
            self.blocksByChrom[block.chrom] = []
        self.blocksByChrom[block.chrom].append(block)

    if self.verbose:
        try:
            pind = ProgressIndicator(
                totalToDo=os.path.getsize(self.handle.name),
                messagePrefix="completed",
                messageSuffix="of building index for " + self.handle.name)
        except Exception:
            # best effort only: progress display is optional, but don't
            # swallow KeyboardInterrupt/SystemExit as a bare except would
            sys.stderr.write("IndexedWig -- warning: " +
                             "unable to show progress for stream\n")
            self.verbose = False

    ### note, for loop seems to buffer the file and so tell() gives a
    ### location that is not where the current line was read from, so
    ### we stick to readline instead.
    rline = None
    while rline != "":
        # get the next element
        rline = self.handle.readline()
        line = rline.strip()
        if line == "":
            continue
        e = parseWigString(line)

        # keep track of what chroms we've seen for checking order
        if e.chrom not in seenChroms:
            seenChroms.add(e.chrom)
            lastIndexSeen = -1
            if maxChromSeen is None or e.chrom > maxChromSeen:
                maxChromSeen = e.chrom

        # check chrom order is ok; some seen chrom is greater than this one
        # exactly when the greatest seen chrom is
        if maxChromSeen > e.chrom:
            msg = "wig file is not sorted, entry for chrom " + str(e.chrom) +\
                  " appears after entry for " + str(maxChromSeen)
            raise IndexedWigError(msg)

        # check position order is ok
        if e.start < lastIndexSeen:
            msg = "wig file is not sorted, entry for chrom " + str(e.chrom) +\
                  " at " + str(e.start) + " appears after " + str(lastIndexSeen)
            raise IndexedWigError(msg)

        # update the last index we've seen
        lastIndexSeen = e.end

        # debugging message if the current block is full
        if self.debug is True:
            sys.stderr.write("processing " + str(e))
            if currentBlock is not None:
                sys.stderr.write("; is current block full?" +
                                 str(currentBlock.isfull()) + "\n")
            else:
                sys.stderr.write("\n")

        # we might need to make a new block for this element
        if currentBlock is None or currentBlock.isfull() or \
           currentBlock.chrom != e.chrom:
            if self.debug:
                sys.stderr.write("making new block with " + str(e) + "\n")
            if currentBlock is not None:
                closeBlock(currentBlock)
            # 'at' is the offset recorded by tell() after the previous
            # element was handled (or at the start of build)
            currentBlock = WigBlock(at, e, self.blocksize)

        # add the element to the current block
        currentBlock.add(e)

        at = self.handle.tell()
        if self.verbose:
            pind.done = self.handle.tell()
            pind.showProgress()

    # don't forget to add the final block
    if currentBlock is not None:
        closeBlock(currentBlock)

    # build the interval trees
    for chrom in self.blocksByChrom:
        self.itrees[chrom] = IntervalTree(self.blocksByChrom[chrom],
                                          openEnded=True)