def count(self, bed):
    '''Recompute the per-track overlap statistics for *bed*.

    For each track in ``self.tracks`` the index entry for ``bed.contig``
    is queried with ``find(bed.start, bed.end)`` and the first two fields
    of every hit are kept as a ``(start, end)`` pair.  A ``KeyError``
    raised during that lookup (e.g. no index entry for the track or
    contig) is treated as "no hits".

    ``self.data`` is replaced by a list with one tuple per track:
    ``(number_of_hits, overlap)`` where *overlap* is whatever
    ``Intervals.calculateOverlap`` returns for the bed interval against
    the combined hits.
    '''
    per_track = []
    for track in self.tracks:
        try:
            hits = self.index[track][bed.contig].find(bed.start, bed.end)
            spans = [(hit[0], hit[1]) for hit in hits]
        except KeyError:
            spans = []

        n_hits = len(spans)
        query = [(bed.start, bed.end)]
        covered = Intervals.calculateOverlap(query, Intervals.combine(spans))
        per_track.append((n_hits, covered))

    self.data = per_track
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """Iterate over merged adjacent bed entries.

    The input must be sorted by contig and start position.  Entries are
    grouped into chunks of neighbouring intervals and each chunk is
    collapsed into merged entries.  Input entries are modified in place
    and re-used as output.

    Arguments
    ---------
    iterator : iterable
        Bed entries providing ``contig``, ``start``, ``end``, ``name``
        (and ``strand`` if *stranded* is set).
    max_distance : int
        *max_distance* > 0 permits merging of intervals that are not
        directly adjacent.
    by_name : bool
        If true, only entries with the same name are merged.
    min_intervals : int
        Merged entries built from fewer than this many input intervals
        are skipped.
    remove_inconsistent : bool
        If true, chunks of overlapping intervals whose names are
        inconsistent are removed.
    resolve_blocks : bool
        If true, entries within a chunk are only merged if their block
        intervals (``toIntervals()``) overlap; the entries must support
        ``toIntervals()``, ``fromIntervals()`` and item access to
        ``"score"``.
    stranded : bool
        If true, chunks are tracked separately for each strand.

    Yields
    ------
    bed
        Merged entries.  The score gives the number of intervals that
        have been merged.

    Raises
    ------
    ValueError
        If both *remove_inconsistent* and *by_name* are set.  As this is
        a generator, the error is raised when iteration starts.
    """

    if remove_inconsistent and by_name:
        # was ``assert ValueError(...)``, which can never fail because an
        # exception instance is always truthy
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(entries):
        """Yield lists of neighbouring entries that belong together."""
        entries = iter(entries)
        try:
            last = next(entries)
        except StopIteration:
            # empty input: nothing to merge.  Letting StopIteration escape
            # a generator is turned into a RuntimeError (PEP 479).
            return

        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        strand = last.strand if stranded else "."
        max_end[strand] = last.end
        to_join[strand] = [last]
        # seed the name so that *by_name* also applies between the first
        # and the second entry
        last_name[strand] = last.name

        for bed in entries:
            strand = bed.strand if stranded else "."

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:
                # new contig: flush the pending chunks of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] and
                   last_name[strand] != bed.name)):
                # gap too large or name changed: close the current chunk
                if to_join[strand]:
                    yield to_join[strand]
                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        # flush whatever is left, in strand order
        for strand in sorted(to_join):
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for chunk in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = {x.name for x in chunk}
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in chunk:
                bed["score"] = 1

            # repeat until a full pass does not merge anything
            merged = True
            while merged:
                joined = []
                merged = False

                while chunk:
                    bed1, rest = chunk[0], chunk[1:]
                    intervals1 = bed1.toIntervals()
                    not_joined = []
                    for bed2 in rest:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            # refresh intervals1 so that blocks absorbed
                            # from an earlier bed2 are not dropped when a
                            # later entry is merged into bed1
                            intervals1 = Intervals.combine(
                                intervals1 + intervals2)
                            bed1.fromIntervals(intervals1)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    chunk = not_joined

                chunk = joined

            chunk = sorted(chunk, key=lambda x: int(x.start))

            # keep only those entries that were created from the merge of
            # at least the minimum number of intervals
            for bed in chunk:
                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:
            if len(chunk) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # re-use the first entry as the merged interval
            a = chunk[0]
            a.end = max(entry.end for entry in chunk)
            a.score = len(chunk)
            yield a
            c.output += 1

    E.info(str(c))