Ejemplo n.º 1
0
def test_length():
    """Check ``Interval.length()`` for a handful of representative intervals.

    Each case pairs the constructor arguments with the length the interval
    is expected to report: a degenerate interval, a plain integer interval,
    one carrying a data payload, and one with a float endpoint.
    """
    cases = [
        ((0, 0), 0),
        ((0, 3), 3),
        ((-1, 1, 'data'), 2),
        ((0.1, 3), 2.9),
    ]

    for ctor_args, expected in cases:
        assert Interval(*ctor_args).length() == expected
Ejemplo n.º 2
0
def test_length():
    """Verify that ``Interval.length()`` reports the expected span."""
    # Begin and end coincide.
    assert Interval(0, 0).length() == 0

    # Plain integer endpoints.
    assert Interval(0, 3).length() == 3

    # Same check with a data payload attached to the interval.
    assert Interval(-1, 1, 'data').length() == 2

    # Float begin endpoint.
    assert Interval(0.1, 3).length() == 2.9
Ejemplo n.º 3
0
def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    """Return the length of ``interval`` minus its overlap with ``itree``.

    The intervals of ``itree`` that overlap ``interval`` are copied into a
    fresh tree and merged, so that each stretch is only counted once when
    the intersection lengths are summed.
    """
    merged = IntervalTree(itree.overlap(interval))
    merged.merge_overlaps()

    covered = 0
    for piece in merged:
        covered += intersection(interval, piece).length()

    return interval.length() - covered
Ejemplo n.º 4
0
def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    """Return the summed length of ``interval``'s intersections with ``itree``.

    An interval whose length is zero or negative yields 0 without
    consulting the tree. Otherwise the overlapping intervals are copied into
    a new tree and merged before their intersections with ``interval`` are
    measured and added up.
    """
    if interval.length() <= 0:
        return 0

    merged = IntervalTree(itree.overlap(interval))
    merged.merge_overlaps()

    return sum(intersect(interval, piece).length() for piece in merged)
Ejemplo n.º 5
0
def filter_by_interval(
    itree: IntervalTree,
    interval: Interval,
    min_length: int,
    prop_coverage: float
) -> bool:
    """Decide whether ``interval`` should be filtered out.

    Args:
        itree: Tree of intervals to compare against, or ``None`` when there
            is nothing to compare against.
        interval: The candidate interval.
        min_length: Minimum length that must remain once the part
            intersecting ``itree`` is subtracted.
        prop_coverage: Proportion of ``interval`` intersecting ``itree`` at
            or above which the interval is filtered.

    Returns:
        ``True`` if the un-intersected remainder of ``interval`` is shorter
        than ``min_length`` or the intersecting proportion is at least
        ``prop_coverage``; ``False`` otherwise. When ``itree`` is ``None``
        only the length criterion is applied.
    """
    length = interval.length()

    # Without a tree there is nothing to intersect with, so only the length
    # criterion applies. Handling this up front avoids passing ``None`` on
    # to ``total_intersection`` for intervals shorter than ``min_length``.
    if itree is None:
        return length < min_length

    len_intersect = total_intersection(itree, interval)

    # Guard against division by zero for empty (or inverted) intervals.
    prop_intersect = len_intersect / length if length > 0 else 0

    lt_min_length = (length - len_intersect) < min_length
    gt_max_cov = prop_intersect >= prop_coverage
    return lt_min_length or gt_max_cov
Ejemplo n.º 6
0
def main():
    """Select a reduced set of FASTA records based on interval overlap.

    Records are read from ``args.infile`` and keyed by their id. Each id is
    handed to ``parse_id_as_interval`` together with a pattern of the form
    ``genome:seqid:start-end``; the resulting interval is stored in one
    ``IntervalTree`` per ``(genome, seqid)`` pair. A new interval replaces,
    joins, or is rejected in favour of the intervals already in the tree
    depending on how it overlaps them and on ``args.coverage``. The records
    belonging to the surviving intervals are written to ``args.outfile`` in
    FASTA format.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    regex = re.compile(r"^>?(?P<genome>[^:\s]+):(?P<seqid>[^:\s]+)"
                       r":(?P<start>\d+)-(?P<end>\d+)")

    itree = defaultdict(IntervalTree)
    seqs = {s.id: s for s in SeqIO.parse(args.infile, format="fasta")}

    for id_ in seqs:
        genome, seqid, start, end = parse_id_as_interval(id_, regex)
        interval = Interval(start, end, data=id_)
        tree = itree[(genome, seqid)]

        if interval in tree:
            continue

        envelops = tree.envelop(interval)
        overlaps = tree.overlap(interval)

        # If the interval completely overlaps one already in the tree
        # replace it. This would be covered by overlaps, but should be faster.
        # NOTE(review): remove_overlap(start, end) presumably also drops
        # intervals that only partially overlap the range, not just the
        # enveloped ones -- confirm that this is intended.
        if envelops:
            tree.remove_overlap(start, end)
            tree.add(interval)

        # If the interval partially overlaps one, or is completely contained
        # by an interval in the tree, we interrogate further.
        elif overlaps:
            to_remove = []
            add_to_tree = True

            for existing in overlaps:
                cov_match = coverage(existing, interval) > args.coverage

                # If the coverage of the shorter interval is above a threshold
                # and the interval already in the tree is the shorter one,
                # we flag it for replacement.
                if cov_match and existing.length() < interval.length():
                    to_remove.append(existing)

                # If the new interval was the shorter of the intervals.
                elif cov_match:
                    add_to_tree = False
                    break

            # We reached all the way through without discarding the new
            # interval.
            if add_to_tree:
                for existing in to_remove:
                    tree.remove(existing)

                tree.add(interval)

        # If it doesn't overlap the sequence at all.
        else:
            tree.add(interval)

    # Write every surviving record in a single call. Calling SeqIO.write once
    # per tree would overwrite earlier output whenever args.outfile is a
    # filename rather than an already-open handle.
    SeqIO.write(
        (seqs[i.data] for subitree in itree.values() for i in subitree),
        args.outfile,
        format="fasta"
    )