def test_length():
    """Check Interval.length() against hand-computed values."""
    # Each entry: (positional arguments for Interval, expected length).
    cases = [
        ((0, 0), 0),
        ((0, 3), 3),
        ((-1, 1, 'data'), 2),
        ((0.1, 3), 2.9),
    ]
    for ctor_args, expected in cases:
        assert Interval(*ctor_args).length() == expected
def test_length():
    """Exercise Interval.length() on a handful of representative intervals."""
    # Zero-width interval.
    empty = Interval(0, 0)
    assert empty.length() == 0

    # Plain integer bounds.
    plain = Interval(0, 3)
    assert plain.length() == 3

    # Negative start, with a data payload attached.
    with_data = Interval(-1, 1, 'data')
    assert with_data.length() == 2

    # Floating-point start.
    fractional = Interval(0.1, 3)
    assert fractional.length() == 2.9
def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    """Return the length of ``interval`` minus the part overlapped by ``itree``.

    The intervals of ``itree`` that overlap ``interval`` are copied into a
    fresh tree and merged, so that regions hit by several intervals are only
    counted once. The summed length of their intersections with ``interval``
    is then subtracted from ``interval.length()``.
    """
    merged = IntervalTree(itree.overlap(interval))
    merged.merge_overlaps()

    covered = 0
    for hit in merged:
        covered += intersection(interval, hit).length()

    return interval.length() - covered
def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    """Return the total length of ``interval`` that intersects ``itree``.

    An interval whose length is zero or negative yields 0 without querying
    the tree. Otherwise the overlapping intervals are copied into a fresh
    tree and merged before their intersections with ``interval`` are summed,
    so regions hit by several intervals are only counted once.
    """
    if interval.length() <= 0:
        return 0

    merged = IntervalTree(itree.overlap(interval))
    merged.merge_overlaps()

    return sum(intersect(interval, hit).length() for hit in merged)
def filter_by_interval(
    itree: IntervalTree,
    interval: Interval,
    min_length: int,
    prop_coverage: float
) -> bool:
    """Decide whether ``interval`` should be filtered out given ``itree``.

    The interval is filtered (``True`` is returned) when either

    * the part of it that does not intersect ``itree`` is shorter than
      ``min_length``, or
    * the proportion of it that intersects ``itree`` is at least
      ``prop_coverage``.

    Parameters
    ----------
    itree
        Tree of intervals to compare against. ``None`` is accepted and is
        treated as "nothing to intersect with".
    interval
        The candidate interval.
    min_length
        Minimum length of the non-intersecting remainder for the interval to
        be kept.
    prop_coverage
        Intersecting proportion at or above which the interval is filtered.

    Returns
    -------
    bool
        ``True`` if the interval should be filtered out, ``False`` otherwise.
    """
    length = interval.length()

    if itree is None:
        # Without a tree there is nothing to intersect, so only the length
        # criterion applies. Previously a short interval fell through to
        # total_intersection(None, ...) and failed on ``None.overlap``.
        return length < min_length

    len_intersect = total_intersection(itree, interval)

    # Guard against division by zero for empty (or inverted) intervals.
    prop_intersect = len_intersect / length if length > 0 else 0

    lt_min_length = (length - len_intersect) < min_length
    gt_max_cov = prop_intersect >= prop_coverage
    return lt_min_length or gt_max_cov
def main():
    """Command-line entry point.

    Reads FASTA records from ``args.infile``, parses each record id into a
    (genome, seqid, start, end) interval, keeps one interval tree per
    (genome, seqid) pair, and writes the records whose intervals remain in
    the trees to ``args.outfile``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    regex = re.compile(
        r"^>?(?P<genome>[^:\s]+):(?P<seqid>[^:\s]+)"
        r":(?P<start>\d+)-(?P<end>\d+)"
    )

    itree = defaultdict(IntervalTree)
    seqs = {s.id: s for s in SeqIO.parse(args.infile, format="fasta")}

    for id_ in seqs:
        genome, seqid, start, end = parse_id_as_interval(id_, regex)
        interval = Interval(start, end, data=id_)
        tree = itree[(genome, seqid)]

        # Exactly this interval is already present: nothing to do.
        if interval in tree:
            continue

        envelops = tree.envelop(interval)
        overlaps = tree.overlap(interval)

        # If the interval completely overlaps one already in the tree,
        # replace it. This would be covered by overlaps, but should be faster.
        # NOTE(review): remove_overlap presumably drops every interval that
        # overlaps [start, end), not only the enveloped ones - confirm that
        # this is intended.
        if envelops:
            tree.remove_overlap(start, end)
            tree.add(interval)
            continue

        # If it doesn't overlap anything in the tree at all, just add it.
        if not overlaps:
            tree.add(interval)
            continue

        # The interval partially overlaps one, or is completely contained by
        # an interval in the tree, so we interrogate further.
        to_remove = []
        for existing in overlaps:
            cov_match = coverage(existing, interval) > args.coverage
            if not cov_match:
                continue

            if existing.length() < interval.length():
                # The coverage is above the threshold and the interval
                # already in the tree is the shorter one: flag it for
                # replacement.
                to_remove.append(existing)
            else:
                # The new interval was the shorter of the two: discard it.
                break
        else:
            # We reached all the way through without discarding the new
            # interval.
            for stale in to_remove:
                tree.remove(stale)
            tree.add(interval)

    # NOTE(review): SeqIO.write is called once per (genome, seqid) group with
    # the same args.outfile; if that is a path rather than an open handle,
    # each call would presumably overwrite the previous one - confirm against
    # cli().
    for subitree in itree.values():
        SeqIO.write(
            (seqs[i.data] for i in subitree),
            args.outfile,
            format="fasta"
        )