def __init__(self, entries, format = 'Limited', bigBed=False):
    """
    Create a BedFile instance.

    :param entries: an iterable of entries or a filename
    :param format: the format of the BED file
    :param bigBed: when true, entries is passed straight to
        bigbed.readBigBed instead of being parsed as text or iterated
    """
    self.format = format

    if bigBed:
        self.chroms = bigbed.readBigBed(entries)
        return

    if isinstance(entries, str):
        # A string is treated as a filename. A file that cannot be decoded
        # as text is retried through the bigBed reader.
        try:
            self.chroms = readBedFile(entries, format)
        except UnicodeDecodeError:
            self.chroms = bigbed.readBigBed(entries)
        return

    # Otherwise entries is an iterable of already-built entry objects.
    self.chroms = dict()

    def store(chrom, begin, end, record):
        # File the record under its chromosome, creating the tree on first use.
        chrom_tree = self.chroms.get(chrom)
        if not chrom_tree:
            chrom_tree = ival.IntervalTree()
            self.chroms[chrom] = chrom_tree
        chrom_tree.put(ival.Interval(begin, end), record)

    if format.lower().startswith('bedpe'):
        # Paired-end entries are indexed once per anchor.
        for entry in entries:
            store(entry.chrom1, entry.chromStart1, entry.chromEnd1, entry)
            store(entry.chrom2, entry.chromStart2, entry.chromEnd2, entry)
    else:
        for entry in entries:
            store(entry.chrom, entry.chromStart, entry.chromEnd, entry)
def poolBED(self, bedfile):
    """
    Add every entry yielded by bedfile to this instance's interval trees.

    :param bedfile: an iterable of entries exposing chrom, chromStart and chromEnd
    """
    for record in bedfile:
        chrom_name = record.chrom
        # Look up the tree for this chromosome; create it on first sight.
        chrom_tree = self.chroms.get(chrom_name)
        if not chrom_tree:
            chrom_tree = ival.IntervalTree()
            self.chroms[chrom_name] = chrom_tree
        span = ival.Interval(record.chromStart, record.chromEnd)
        chrom_tree.put(span, record)
def readBigBed(filename):
    """
    Read a bigBed file whose autoSql description mentions narrowPeak or
    broadPeak, and index its entries by chromosome.

    :param filename: path handed to pyBigWig.open
    :return: dict mapping chromosome name to an ival.IntervalTree of
        bed.BedEntry objects; empty when the file is not a bigBed or is not
        a narrowPeak/broadPeak bigBed (a message is printed in the latter case)
    :raises RuntimeError: when a RuntimeError occurs while processing an entry
    """
    bb = pyBigWig.open(filename)
    chroms = dict()
    try:
        if bb.isBigBed():
            # Fetch the autoSql text once. Tolerate bytes, str, or a missing
            # description so that an untyped bigBed is reported rather than
            # crashing on .decode().
            sql = bb.SQL()
            if isinstance(sql, bytes):
                sql = sql.decode('utf8')
            sql = (sql or '').lower()

            if 'narrowpeak' in sql or 'broadpeak' in sql:
                for chrom in bb.chroms():
                    # NOTE(review): entries() is believed to return None for a
                    # chromosome without entries - confirm against pyBigWig docs.
                    entries = bb.entries(chrom, 0, bb.chroms(chrom)) or ()
                    for entry in entries:
                        try:
                            # entry is (start, end, rest-of-line string)
                            words = entry[2].strip().split()
                            chromStart = int(entry[0])
                            chromEnd = int(entry[1])
                            bed_entry = bed.BedEntry(chrom, chromStart, chromEnd)
                            options = dict(name=words[0],
                                           score=int(words[1]),
                                           strand=words[2],
                                           signalValue=float(words[3]),
                                           pValue=float(words[4]),
                                           qValue=float(words[5]))
                            if len(words) >= 7:  # narrowpeaks carry a peak offset
                                options['peak'] = int(words[6])
                            bed_entry.addOption(**options)
                            # check if the chromosome has been seen before
                            tree = chroms.get(chrom)
                            if not tree:
                                tree = ival.IntervalTree()
                                chroms[chrom] = tree
                            # put the entry in the interval tree for the appropriate chromosome
                            iv = ival.Interval(bed_entry.chromStart, bed_entry.chromEnd)
                            tree.put(iv, bed_entry)
                        except RuntimeError as e:
                            # RuntimeError has no .strerror attribute; format the
                            # exception itself and keep the original as the cause.
                            raise RuntimeError('Error in BIGBED file (%s)' % e) from e
            else:
                print("BigBed file not ENCODE narrowPeak or broadPeak")
    finally:
        # Always release the file handle, including on parse errors.
        bb.close()
    return chroms
def __contains__(self, item):
    """
    Membership test against the interval trees.

    For a BedEntry the result is a bool: whether an Interval built from the
    entry's chromStart/chromEnd is in the tree of its chromosome (False when
    the chromosome has no tree).

    For a BedPE the result is a two-element list of bools, one per anchor.

    NOTE(review): when invoked through the ``in`` operator Python coerces the
    return value to bool, so a non-empty list is always truthy; only direct
    calls to ``__contains__`` see the per-anchor list.

    Any other item type yields False.
    """
    if isinstance(item, BedEntry):
        chrom_tree = self.chroms.get(item.chrom)
        if chrom_tree is not None:
            return ival.Interval(item.chromStart, item.chromEnd) in chrom_tree
        return False

    if isinstance(item, BedPE):
        first_tree = self.chroms.get(item.chrom1)
        second_tree = self.chroms.get(item.chrom2)

        if first_tree is None:
            first_hit = False
        else:
            first_hit = ival.Interval(item.chromStart1, item.chromEnd1) in first_tree

        if second_tree is None:
            second_hit = False
        else:
            second_hit = ival.Interval(item.chromStart2, item.chromEnd2) in second_tree

        return [first_hit, second_hit]

    return False
def getClosest(self, item):
    """
    Find the stored values closest to the given item.

    :param item: a BedEntry or a BedPE
    :return: for a BedEntry, the ``values`` of the node returned by the
        chromosome tree's ``closest`` query, or None when the chromosome has
        no tree or no node is found; for a BedPE, a two-element list holding
        that result for each anchor; None for any other item type
    """
    def closest_values(tree, begin, end):
        # One lookup: None when there is no tree or no closest node.
        if tree is None:
            return None
        node = tree.closest(ival.Interval(begin, end))
        if node is None:
            return None
        return node.values

    if isinstance(item, BedEntry):
        return closest_values(self.chroms.get(item.chrom),
                              item.chromStart, item.chromEnd)

    if isinstance(item, BedPE):
        tree1 = self.chroms.get(item.chrom1)
        tree2 = self.chroms.get(item.chrom2)
        return [
            closest_values(tree1, item.chromStart1, item.chromEnd1),
            # The second anchor must be queried with its own coordinates;
            # previously the first anchor's start/end were reused here.
            closest_values(tree2, item.chromStart2, item.chromEnd2),
        ]

    return None
def getOverlap(self, item):
    """
    Collect the stored values whose intervals intersect the given item.

    :param item: a BedEntry or a BedPE
    :return: for a BedEntry, a list built by extending with the ``values`` of
        every node returned by the chromosome tree's ``isectall`` query, or
        None when the chromosome has no tree; for a BedPE, a two-element list
        with that result for each anchor; None for any other item type
    """
    def gather(tree, begin, end):
        # None signals "chromosome unknown"; an empty list means "no overlap".
        if tree is None:
            return None
        query = ival.Interval(begin, end)
        found = []
        for node in tree.isectall(query):
            found.extend(node.values)
        return found

    if isinstance(item, BedEntry):
        return gather(self.chroms.get(item.chrom), item.chromStart, item.chromEnd)

    if isinstance(item, BedPE):
        first_tree = self.chroms.get(item.chrom1)
        second_tree = self.chroms.get(item.chrom2)
        first_hits = gather(first_tree, item.chromStart1, item.chromEnd1)
        second_hits = gather(second_tree, item.chromStart2, item.chromEnd2)
        return [first_hits, second_hits]

    return None
def readBedFile(filename, format = 'Limited'):
    """
    Read a BED file.

    :param filename: path of the text file to read
    :param format: specifies the format of the file (matched case-insensitively
        by prefix):
        "Limited", e.g.
            chr22 1000 5000
            chr22 2000 6000
        "Optional", e.g.
            track name=pairedReads description="Clone Paired Reads" useScore=1
            chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
            chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
            ...
            (also handles the Limited + score, and BED6 format)
        "Peaks", e.g.
            chr1 569780 569930 . 0 . 19 6.07811 -1 -1
            chr1 713300 713450 . 0 . 54 49.1167 -1 -1
        "Strand", e.g.
            chr4 185772359 185772424 -
            chr18 20513381 20513401 +
            also supports a 5th label field
            chr5 20611949 20611949 + ENSG00000251629_20611949
            chr3 42187863 42187863 - ENSG00000234562_42187863
        "Summit", e.g.
            # d = 130
            chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
            chr1 8250 8671 422 286 46 145.84 11.68 0.51
            chr1 36382 36984 603 405 46 315.23 27.05 0.24
        "CCAT", e.g.
            chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
            chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
        "Cropped", e.g.
            chr1 851602 10
            chr1 921184 18
            chr1 931838 9
        "BedPE", e.g.
            chrom1 chromStart1 chromEnd1 chrom2 chromStart2 chromEnd2 + any number of additional fields
            chr1 85617 86100 chr1 120030 125039
            chr2 73891 74871 chr5 12709 12990
        The code also recognises the prefixes "bed6", "bed12", "rp", "idr",
        "2idr", "tss" and "mspc".

        Independently of ``format``, a line starting with '=' switches the
        reader to a dashed-table layout (flag named ``sissrs``) and a line
        starting with 'Position' switches it to a ``chrom:centre`` layout
        (flag named ``gem``).
    :return: dict mapping chromosome name to an ival.IntervalTree of entries
    :raises RuntimeError: when a RuntimeError occurs on a data row after the
        single tolerated header row has already been used up
    """
    row = 0
    acceptHeaderRows = 1
    sissrs = False
    gem = False
    start = False
    chroms = dict()

    def _put(chrom, iv, entry):
        # File the entry in the interval tree of its chromosome,
        # creating the tree the first time the chromosome is seen.
        tree = chroms.get(chrom)
        if not tree:
            tree = ival.IntervalTree()
            chroms[chrom] = tree
        tree.put(iv, entry)

    # The context manager closes the file even when parsing raises.
    with open(filename) as f:
        for line in f:
            row += 1
            words = line.strip().split()
            if len(words) == 0:
                continue  # ignore empty lines
            first = words[0]
            if first.startswith('='):
                sissrs = True
                continue
            if first.startswith('Position'):
                gem = True
                continue

            if sissrs:
                if first.startswith('-'):
                    # the dashed rule marks the start of the data table
                    start = True
                    continue
                elif start:
                    chrom = words[0]
                    chromStart = int(words[1])
                    chromEnd = int(words[2])
                    entry = BedEntry(chrom, chromStart, chromEnd)
                    entry.addOption(signalValue=int(words[3]),
                                    name=" ",
                                    score=int(words[3]),
                                    strand='.',
                                    pValue=float(-1),
                                    qValue=float(-1),
                                    peak=int(-1))
                    _put(chrom, ival.Interval(entry.chromStart, entry.chromEnd), entry)
                else:
                    continue
            elif gem:
                chrom, centre = words[0].split(':')
                centre = int(centre)
                # the region is the centre position extended by 50 on each side
                chromStart = centre - 50
                chromEnd = centre + 50
                entry = BedEntry(chrom, chromStart, chromEnd)
                entry.addOption(signalValue=float(words[1]),
                                name=" ",
                                score=float(words[7]),
                                strand=words[13],
                                peak=centre,
                                pValue=float(words[6]),
                                qValue=float(words[5]))
                _put(chrom, ival.Interval(entry.chromStart, entry.chromEnd), entry)
            else:
                if first.startswith('#'):
                    continue  # comment
                if first.startswith('browser'):
                    continue  # ignore
                if first.startswith('track'):
                    continue  # ignore
                if words[1].startswith('start'):
                    continue  # ignore column-name header
                try:
                    # Lower-case the format once per row instead of once per branch.
                    fmt = format.lower()
                    if fmt.startswith('bedpe'):
                        chrom1 = words[0]
                        chromStart1 = int(words[1])
                        chromEnd1 = int(words[2])
                        chrom2 = words[3]
                        chromStart2 = int(words[4])
                        chromEnd2 = int(words[5])
                        entry = BedPE(chrom1, chromStart1, chromEnd1,
                                      chrom2, chromStart2, chromEnd2)
                        if len(words) == 8:
                            entry.addOption(PETs=int(words[6]),
                                            pValue=float(words[7]))
                        if len(words) == 13:
                            entry.addOption(name1=words[6],
                                            name2=words[7],
                                            depth1=int(words[8]),
                                            depth2=int(words[9]),
                                            PETs=int(words[10]),
                                            pValue=float(words[11]),
                                            fdr=float(words[12]))
                        # Build both intervals before inserting, then index the
                        # entry under each anchor (the same tree when both
                        # anchors are on one chromosome).
                        iv1 = ival.Interval(entry.chromStart1, entry.chromEnd1)
                        iv2 = ival.Interval(entry.chromStart2, entry.chromEnd2)
                        _put(chrom1, iv1, entry)
                        _put(chrom2, iv2, entry)
                    else:
                        chrom = words[0]
                        if fmt.startswith('ccat'):
                            chromStart = int(words[2])
                            chromEnd = int(words[3])
                        else:  # all other standard BED formats
                            try:
                                chromStart = int(words[1])
                                chromEnd = int(words[2])
                            except ValueError:
                                print(words)
                                continue
                        entry = BedEntry(chrom, chromStart, chromEnd)
                        if fmt.startswith('opt'):
                            if len(words) >= 9:
                                entry.addOption(name=words[3],
                                                score=float(words[4]),
                                                strand=words[5],
                                                thickStart=int(words[6]),
                                                thickEnd=int(words[7]),
                                                itemRgb=words[8])
                            elif len(words) >= 6:
                                entry.addOption(name=words[3],
                                                score=float(words[4]),
                                                strand=words[5])
                            elif len(words) >= 5:
                                entry.addOption(name=words[3],
                                                score=float(words[4]))
                            elif len(words) >= 4:
                                entry.addOption(name=words[3])
                            else:
                                # NOTE(review): only reached with fewer than 4
                                # fields, where words[3] raises IndexError; the
                                # intended "Limited + score" handling is unclear.
                                entry.addOption(name='.',
                                                score=int(words[3]),
                                                strand='.')
                        elif fmt.startswith('bed6'):
                            entry.addOption(name=words[3],
                                            score=float(words[4]),
                                            strand=words[5])
                        elif fmt.startswith('strand'):
                            if len(words) >= 4:  # properly formatted
                                entry.addOption(strand=words[3])
                            if len(words) >= 5:
                                entry.addOption(name=words[4])
                        elif fmt.startswith('peak'):
                            if len(words) >= 10:  # narrowpeaks
                                entry.addOption(name=words[3],
                                                score=int(words[4]),
                                                strand=words[5],
                                                signalValue=float(words[6]),
                                                pValue=float(words[7]),
                                                qValue=float(words[8]),
                                                peak=int(words[9]))
                            else:  # broadpeaks
                                entry.addOption(name=words[3],
                                                score=int(words[4]),
                                                strand=words[5],
                                                signalValue=float(words[6]),
                                                pValue=float(words[7]),
                                                qValue=float(words[8]))
                        elif fmt.startswith('rp'):
                            entry.addOption(name=words[3],
                                            score=int(words[4]),
                                            strand=words[5],
                                            signalValue=float(words[6]),
                                            pValue=float(words[7]),
                                            qValue=float(words[8]),
                                            rank=[float(r) for r in words[9].split(",")])
                        elif fmt.startswith('idr'):
                            entry.addOption(name=words[3],
                                            score=int(words[4]),
                                            strand=words[5],
                                            signalValue=float(words[6]),
                                            pValue=float(words[7]),
                                            qValue=float(words[8]))
                        elif fmt.startswith('2idr'):  # For IDR input with actual IDR values
                            entry.addOption(name=words[3],
                                            pValue=float(words[9]),
                                            qValue=float(words[10]))
                        elif fmt.startswith('summit'):
                            if len(words) >= 9:
                                entry.addOption(summit=int(words[4]),
                                                tags=int(words[5]),
                                                pValue=float(words[6]),
                                                fold=float(words[7]),
                                                fdr=float(words[8]))
                            else:
                                entry.addOption(summit=int(words[4]),
                                                tags=int(words[5]),
                                                pValue=float(words[6]),
                                                fold=float(words[7]))
                        elif fmt.startswith('ccat'):
                            entry.addOption(summit=int(words[1]) - entry.chromStart,
                                            tags=int(words[4]),
                                            bg=int(words[5]),
                                            zscore=float(words[6]),
                                            fdr=float(words[7]),
                                            name='.',
                                            score=int(words[4]),
                                            strand='.')
                        elif fmt.startswith('crop'):
                            entry.addOption(score=int(words[2]),
                                            name='.',
                                            strand='.')
                            entry.chromEnd = entry.chromStart + 1
                        elif fmt.startswith('bed12'):
                            entry.addOption(name=words[3],
                                            score=float(words[4]),
                                            strand=words[5],
                                            thickStart=int(words[6]),
                                            thickEnd=int(words[7]),
                                            itemRgb=words[8],
                                            blockCount=int(words[9]),
                                            blockSizes=words[10],
                                            blockStarts=words[11])
                        elif fmt.startswith('tss'):
                            # The format is lower-cased before matching, so the
                            # prefix must be lower-case too ('TSS' never matched).
                            entry.addOption(name=str(words[3]),
                                            gene=str(words[4]),
                                            strand=words[5])
                        elif fmt.startswith('mspc'):
                            entry.addOption(name=str(words[3]),
                                            signalValue=float(words[4]))
                        # put the entry in the interval tree for the appropriate chromosome
                        _put(chrom, ival.Interval(entry.chromStart, entry.chromEnd), entry)
                except RuntimeError as e:
                    if not acceptHeaderRows:
                        # RuntimeError has no .strerror attribute; format the
                        # exception itself and keep the original as the cause.
                        raise RuntimeError('Error in BED file at row %d (%s)' % (row, e)) from e
                    else:
                        # count down the number of header rows that can occur
                        acceptHeaderRows -= 1
    return chroms
def getInterval(self):
    """Return an ival.Interval built from this entry's chromStart and chromEnd."""
    begin = self.chromStart
    end = self.chromEnd
    return ival.Interval(begin, end)
def getInterval(self):
    """
    Return a two-element list of ival.Interval objects: the first built from
    chromStart1/chromEnd1, the second from chromStart2/chromEnd2.
    """
    first_anchor = ival.Interval(self.chromStart1, self.chromEnd1)
    second_anchor = ival.Interval(self.chromStart2, self.chromEnd2)
    return [first_anchor, second_anchor]