def binned_bitsets_by_chrom(f, chrom, chrom_col=0, start_col=1, end_col=2):
    """Collect the intervals of a single chromosome into one bitset.

    Arguments:
    - 'f': iterable of text lines; lines starting with "#" are ignored
    - 'chrom': only lines whose 'chrom_col' field equals this value are used
    - 'chrom_col', 'start_col', 'end_col': indexes into the
      whitespace-split fields of each line

    Returns a BinnedBitSet(MAX) on which set_range(start, end - start) has
    been called once for every matching line.
    """
    result = BinnedBitSet(MAX)
    # Lazily drop comment lines; everything else is treated as data.
    data_lines = (row for row in f if not row.startswith("#"))
    for row in data_lines:
        columns = row.split()
        if columns[chrom_col] != chrom:
            continue
        begin = int(columns[start_col])
        stop = int(columns[end_col])
        # set_range receives the start position and the span length.
        result.set_range(begin, stop - begin)
    return result
def binned_bitsets(self, upstream_pad=0, downstream_pad=0, lens={}):
    """Build a dictionary mapping chromosome name to a bitset of intervals.

    Iterates over 'self' and uses only the items that are GenomicInterval
    instances. The columns named by self.chrom_col, self.start_col and
    self.end_col are read from each interval.

    - 'lens': dictionary of chromosome lengths used to size each bitset;
      chromosomes missing from it get a bitset of size MAX
    - 'upstream_pad', 'downstream_pad': accepted but not used here

    Raises Exception when BinnedBitSet rejects a size with ValueError.
    """
    per_chrom = {}
    previous_chrom = None
    current_bits = None
    for record in self:
        # Anything that is not a genomic interval is skipped.
        if not isinstance(record, GenomicInterval):
            continue
        chrom = record[self.chrom_col]
        if chrom != previous_chrom:
            if chrom not in per_chrom:
                size = lens.get(chrom, MAX)
                try:
                    new_bits = BinnedBitSet(size)
                except ValueError as e:
                    # We will only reach here when constructing this bitset from the lens dict
                    # since the value of MAX is always safe.
                    raise Exception(
                        "Invalid chrom length %s in 'lens' dictionary. %s" % (str(size), str(e)))
                per_chrom[chrom] = new_bits
            previous_chrom = chrom
            current_bits = per_chrom[chrom]
        # Clamp the interval to [0, bitset size] before recording it.
        begin = max(int(record[self.start_col]), 0)
        stop = min(int(record[self.end_col]), current_bits.size)
        current_bits.set_range(begin, stop - begin)
    return per_chrom
def binned_bitsets_proximity(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream=0, downstream=0):
    """Read a file into a dictionary of bitsets, padding each interval.

    Arguments:
    - 'f': iterable of text lines; lines starting with "#" and lines with
      no fields (blank or whitespace-only) are skipped
    - 'chrom_col', 'start_col', 'end_col': indexes into the
      whitespace-split fields of each line
    - 'strand_col': index of the strand field; a line is treated as "-"
      only when that field exists and equals "-", otherwise as "+"
    - 'upstream', 'downstream': padding added relative to the strand. On
      "+" upstream lowers the start and downstream raises the end; on "-"
      the two are swapped.

    Starts are clamped at 0 and ends at MAX. Returns a dict mapping each
    chromosome name to a BinnedBitSet(MAX); a range is only set when the
    padded interval has positive length.
    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Blank lines have no fields; indexing them would raise IndexError.
        if not fields:
            continue
        on_minus_strand = len(fields) > strand_col and fields[strand_col] == "-"
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet(MAX)
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = int(fields[start_col]), int(fields[end_col])
        if on_minus_strand:
            # On the minus strand "upstream" lies at higher coordinates.
            if upstream:
                end = min(MAX, end + upstream)
            if downstream:
                start = max(0, start - downstream)
        else:
            if upstream:
                start = max(0, start - upstream)
            if downstream:
                end = min(MAX, end + downstream)
        if end - start > 0:
            last_bitset.set_range(start, end - start)
    return bitsets
def binned_bitsets_from_list(list=[]):
    """Convert a sequence of (chrom, start, end, ...) records into bitsets.

    Each element of 'list' is indexed as element[0] (chromosome name),
    element[1] (start) and element[2] (end); start and end are passed
    through int(). Returns a dict mapping chromosome name to a
    BinnedBitSet(MAX) on which set_range(start, end - start) was called
    for every record.
    """
    # NOTE: the parameter name shadows the builtin 'list'; it is kept
    # because it is part of the function's keyword interface.
    by_chrom = {}
    previous_chrom = None
    active_bits = None
    for record in list:
        chrom = record[0]
        if chrom != previous_chrom:
            # Only hit the dictionary when the chromosome actually changes.
            if chrom not in by_chrom:
                by_chrom[chrom] = BinnedBitSet(MAX)
            previous_chrom = chrom
            active_bits = by_chrom[chrom]
        begin = int(record[1])
        stop = int(record[2])
        active_bits.set_range(begin, stop - begin)
    return by_chrom
def binned_bitsets_from_bed_file(f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={}):
    """
    Read a file into a dictionary of bitsets keyed by chromosome name.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each data line
    - 'strand_col' is accepted for interface compatibility but is not read
      by this function
    - 'upstream_pad' is subtracted from each start (clamped at 0) and
      'downstream_pad' is added to each end (clamped at the size of that
      chromosome's bitset)
    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size

    Lines that are empty, whitespace-only, start with "#" or start with
    "browser" are skipped. A "track" line is not data, but if it carries
    "offset=<digits>" that offset is added to the start and end of every
    following data line.

    When a start ends up greater than its end a warning is emitted, and the
    range is still passed to set_range unchanged.
    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    offset = 0
    for line in f:
        # "not line.strip()" also covers the empty string, unlike isspace().
        if line.startswith("#") or not line.strip():
            continue
        # Ignore browser lines completely
        if line.startswith("browser"):
            continue
        # Need to check track lines due to the offset
        if line.startswith("track"):
            m = re.search(r"offset=(\d+)", line)
            if m and m.group(1):
                offset = int(m.group(1))
            continue
        fields = line.split()
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet(lens.get(chrom, MAX))
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start = int(fields[start_col]) + offset
        end = int(fields[end_col]) + offset
        if upstream_pad:
            start = max(0, start - upstream_pad)
        if downstream_pad:
            # Clamp to the size of THIS chromosome's bitset. A size variable
            # set only when a chromosome is first seen would be stale when
            # the input returns to an earlier chromosome.
            end = min(last_bitset.size, end + downstream_pad)
        if start > end:
            warn("Interval start after end!")
        last_bitset.set_range(start, end - start)
    return bitsets
def bitset_complement(exons):
    """Return the gaps between the given intervals as (start, end) tuples.

    'exons' is an iterable of (start, end) pairs. The intervals are marked
    in a BinnedBitSet(MAX), the bitset is inverted, and the runs that are
    set after inversion are collected. Only gaps between the smallest start
    and the largest end are reported.

    Returns a list of (start, end) tuples; an empty input yields [].
    """
    # Materialise once: the input is traversed three times below, which
    # would exhaust a one-shot iterator.
    exons = list(exons)
    if not exons:
        # No intervals means no range to complement within.
        return []

    bits = BinnedBitSet(MAX)
    for start, end in exons:
        bits.set_range(start, end - start)
    bits.invert()

    # only complement within the range of the list
    range_start = min(a[0] for a in exons)
    range_end = max(a[1] for a in exons)

    introns = []
    end = range_start
    while True:
        start = bits.next_set(end)
        # next_set returning bits.size is treated as "no further set bit".
        if start == bits.size:
            break
        # Never let a gap run past the end of the last interval.
        end = min(bits.next_clear(start), range_end)
        if start != end:
            introns.append((start, end))
        if end == range_end:
            break
    return introns
def clone(bits):
    """Return a new BinnedBitSet of the same size with 'bits' OR-ed into it.

    Assuming a freshly constructed BinnedBitSet has no bits set, the result
    is an independent copy of 'bits'.
    """
    duplicate = BinnedBitSet(bits.size)
    duplicate.ior(bits)
    return duplicate
def list2bits(ex):
    """Mark every (start, end) pair of 'ex' in a new BinnedBitSet(MAX).

    For each pair set_range(start, end - start) is called on the bitset,
    which is then returned.
    """
    mask = BinnedBitSet(MAX)
    for interval in ex:
        begin, stop = interval
        mask.set_range(begin, stop - begin)
    return mask
def clone(bits):
    """Build a same-sized BinnedBitSet and merge 'bits' into it with ior.

    Presumably the new bitset starts out empty, which makes the returned
    object a copy of 'bits'.
    """
    copied = BinnedBitSet(bits.size)
    copied.ior(bits)
    return copied
def copybits(binnedbits):
    """Return a new BinnedBitSet sized like 'binnedbits' with its bits OR-ed in.

    The source bitset is only read: its size is taken and it is passed to
    ior() on the new bitset.
    """
    replica = BinnedBitSet(binnedbits.size)
    replica.ior(binnedbits)
    return replica