Example #1
def uniq(args):
    """
    %prog uniq bedfile

    Remove overlapping features, keeping the higher-scoring ones.
    """
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    uniqbedfile = bedfile.split(".")[0] + ".uniq.bed"
    bed = Bed(bedfile)
    if not need_update(bedfile, uniqbedfile):
        return uniqbedfile

    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i)
              for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [bed[x.id] for x in selected]

    newbed = Bed()
    newbed.extend(selected)
    newbed.print_to_file(uniqbedfile, sorted=True)
    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
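
The selection above hinges on range_chain, which keeps the non-overlapping subset of weighted intervals with the best total score. A minimal sketch of that behavior, assuming jcvi is installed (coordinates and scores below are made up):

from jcvi.utils.range import Range, range_chain

ranges = [
    Range("chr1", 100, 200, 50.0, 0),  # overlaps the next feature
    Range("chr1", 150, 250, 80.0, 1),  # higher score, expected winner
    Range("chr1", 300, 400, 10.0, 2),  # disjoint, kept either way
]
selected, score = range_chain(ranges)
print([r.id for r in selected], score)  # expected: [1, 2] 90.0
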
Example #2
def uniq(args):
    """
    %prog uniq bedfile

    Remove overlapping features, keeping the higher-scoring ones.
    """
    p = OptionParser(uniq.__doc__)
    p.add_option("--slen",
                 default=False,
                 action="store_true",
                 help="Use sequence length as score [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    uniqbedfile = bedfile.split(".")[0] + ".uniq.bed"
    bed = Bed(bedfile)

    if opts.slen:
        ranges = [Range(x.seqid, x.start, x.end, x.end - x.start, i)
                  for i, x in enumerate(bed)]
    else:
        ranges = [Range(x.seqid, x.start, x.end, float(x.score), i)
                  for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [x.id for x in selected]
    selected_ids = set(selected)
    selected = [bed[x] for x in selected]
    notselected = [x for i, x in enumerate(bed) if i not in selected_ids]

    newbed = Bed()
    newbed.extend(selected)
    newbed.print_to_file(uniqbedfile, sorted=True)

    if notselected:
        leftoverfile = bedfile.split(".")[0] + ".leftover.bed"
        leftoverbed = Bed()
        leftoverbed.extend(notselected)
        leftoverbed.print_to_file(leftoverfile, sorted=True)

    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
Example #3
def select_bed(bed):
    """
    Return a non-overlapping set of ranges, choosing high-scoring blocks over
    low-scoring ones when they conflict.
    """
    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i) for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [bed[x.id] for x in selected]

    return selected
Example #4
def get_piles(allgenes):
    """
    Before running uniq, we need to compute all the piles. A pile is a set of
    redundant, overlapping features we want to get rid of. Input is a list of
    GffLine features; output is a generator of distinct "piles", each a list
    of features.
    """
    from jcvi.utils.range import Range, range_piles

    ranges = [Range(a.seqid, a.start, a.end, 0, i)
              for i, a in enumerate(allgenes)]

    for pile in range_piles(ranges):
        yield [allgenes[x] for x in pile]
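
get_piles defers the grouping to range_piles: overlapping ranges fall into the same pile, and each pile is yielded as a list of ids. A small sketch, assuming jcvi is installed (ranges are made up):

from jcvi.utils.range import Range, range_piles

ranges = [
    Range("chr1", 100, 200, 0, 0),
    Range("chr1", 150, 250, 0, 1),  # overlaps id 0 => same pile
    Range("chr1", 300, 400, 0, 2),  # disjoint => its own pile
]
for pile in range_piles(ranges):
    print(pile)  # expected: [0, 1], then [2]
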
Example #5
def get_range(q, s, t, i, order, block_pairs, clip=10):
    pairs = get_best_pair(q, s, t)
    score = len(pairs)
    block_pairs[i].update(pairs)

    q = [order[x][0] for x in q]
    q.sort()
    qmin = q[0]
    qmax = q[-1]
    if qmax - qmin >= 2 * clip:
        qmin += clip // 2  # integer division so coordinates stay integral
        qmax -= clip // 2

    return Range("0", qmin, qmax, score=score, id=i)
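
A tiny worked example of the symmetric trim above (numbers are made up): only ranges spanning at least 2 * clip get trimmed, by clip // 2 on each end.

qmin, qmax, clip = 100, 180, 10
if qmax - qmin >= 2 * clip:  # 80 >= 20, so trim both ends
    qmin += clip // 2        # 100 -> 105
    qmax -= clip // 2        # 180 -> 175
print(qmin, qmax)  # 105 175
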
Example #6
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm that retains the best non-overlapping
    subset within each pile, rather than the single best model. The scoring
    function is also different: rather than relying on score or span alone, we
    optimize for the subset with the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping
    gene_register = {}
    for g in gff:
        if g.type != "mRNA":
            continue
        aed = float(g.attributes["_AED"][0])
        gene_register[g.parent] = (1 - aed) * sizes[g.accn]

    allgenes = import_feats(gffile)
    g = get_piles(allgenes)

    bestids = set()
    for group in g:
        ranges = [Range(x.seqid, x.start, x.end,
                        gene_register[x.accn], x.accn) for x in group]
        selected_chain, score = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    fw = open("removed.ids", "w")
    print("\n".join(sorted(removed)), file=fw)
    fw.close()
    populate_children(opts.outfile, bestids, gffile, "gene")
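
A worked example of the scoring rule from the docstring (values are hypothetical): lower AED and longer CDS both raise the score, so the chained subset favors long, well-supported models.

aed, cds_length = 0.25, 1200  # hypothetical 1,200 bp CDS with AED 0.25
score = (1 - aed) * cds_length
print(score)  # 900.0 -- same score as a 900 bp model with a perfect AED of 0
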
Example #7
def BlastOrCoordsLine(filename, filter="ref", dialect="blast", clip=0):
    allowed_filters = ("ref", "query")
    REF, QUERY = range(len(allowed_filters))

    allowed_dialects = ("blast", "coords")
    BLAST, COORDS = range(len(allowed_dialects))

    assert filter in allowed_filters
    filter = allowed_filters.index(filter)

    assert dialect in allowed_dialects
    dialect = allowed_dialects.index(dialect)

    fp = open(filename)
    for i, row in enumerate(fp):
        if row[0] == '#':
            continue
        if dialect == BLAST:
            b = BlastLine(row)
            if filter == QUERY:
                query, start, end = b.query, b.qstart, b.qstop
            else:
                query, start, end = b.subject, b.sstart, b.sstop
        else:
            try:
                b = CoordsLine(row)
            except AssertionError:
                continue

            if filter == QUERY:
                query, start, end = b.query, b.start2, b.end2
            else:
                query, start, end = b.ref, b.start1, b.end1

        if start > end:
            start, end = end, start

        if clip:
            # clip cannot be more than 5% of the range
            r = end - start + 1
            cc = min(.05 * r, clip)
            start = start + cc
            end = end - cc

        yield Range(query, start, end, b.score, i)
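
The clipping branch above shaves cc off each end, where cc is capped at 5% of the span. A small worked example (numbers are made up):

start, end, clip = 1000, 1100, 10
r = end - start + 1          # span = 101
cc = min(.05 * r, clip)      # 5% of 101 = 5.05, smaller than clip = 10
print(start + cc, end - cc)  # 1005.05 1094.95
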
Example #8
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PADs. This is the method described in the
    Tang et al. (2010) PNAS paper. The anchorfile defines a list of synteny
    blocks, based on which the genome on one or both axes can be chopped into
    pieces and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option(
        "--minsize", default=10, type="int", help="Only segment using blocks >= size"
    )
    p.add_option(
        "--path", default="~/scratch/bin", help="Path to the CLUSTER 3.0 binary"
    )

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for block in ac.iter_blocks(minsize=minsize):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1

        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print("\t".join(header), file=fw)
    for i in range(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join(row), file=fw)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
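
The matrix handed to CLUSTER 3.0 is a plain tab-separated file: a corner cell "o", the subject PAD names across the top, then one row per query PAD with the log-probability values formatted to one decimal. A hypothetical 2x2 illustration of the layout the writing loop above produces (PAD names are made up; fields are tab-separated):

o        s_pad_1  s_pad_2
q_pad_1  1.2      0.3
q_pad_2  0.0      2.1
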
Example #9
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth",
                 default=3,
                 type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan",
                 default=30,
                 type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions with read depth above the outlier cutoff
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored)  UB={1:.2f}".format(
                lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that have sequencing gaps on their flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
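
Note how the final chaining step recovers its score from the feature name itself: earlier, each candidate was written as "{ies_id:05d}-r{count}", so the read support can be parsed back out. A quick sketch of that round trip:

ies_id, count = 42, 7                        # hypothetical values
accn = "{0:05d}-r{1}".format(ies_id, count)  # "00042-r7", as written above
depth = int(accn.rsplit("r")[-1])            # parse the support back out
print(accn, depth)                           # 00042-r7 7
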
Example #10
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Each subsequent
    column is a separate 'track'.
    """
    from jcvi.utils.range import Range, range_chain

    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter",
                 default=100,
                 type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        if q[0] not in order:
            q, s = s, q

        pairs = dict(zip(q, s))
        block_pairs[i] = pairs

        q = [order[x] for x in q]
        q.sort()
        ranges.append(Range("0", q[0], q[-1], score=len(q), id=i))

    tracks = []
    print("Chain started: {0} blocks".format(len(ranges)), file=sys.stderr)
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remaining..".format(len(ranges))
        else:
            msg += " done!"

        print(msg, file=sys.stderr)
        iteration += 1

    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        sep = "" if ascii else "\t"
        print("\t".join((id, sep.join(atoms))))
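
The while loop above is an iterative peeling scheme: extract the best-scoring chain, drop its blocks from the pool, and repeat until nothing remains or --iter is reached. A compact sketch, assuming jcvi is installed (ranges are made up):

from jcvi.utils.range import Range, range_chain

ranges = [
    Range("0", 0, 10, 11, 0),
    Range("0", 5, 20, 16, 1),   # overlaps both neighbors
    Range("0", 15, 30, 16, 2),
]
tracks = []
while ranges:
    selected, _ = range_chain(ranges)
    tracks.append([x.id for x in selected])
    chosen = set(x.id for x in selected)
    ranges = [x for x in ranges if x.id not in chosen]
print(tracks)  # expected: [[0, 2], [1]]
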
Example #11
import pytest

from jcvi.utils.range import Range, range_closest


@pytest.mark.parametrize(
    "input,expected",
    [("chr1:1000-1", Range(seqid="chr1", start=1, end=1000, score=0, id=0))],
)
def test_range_parse(input, expected):
    from jcvi.utils.range import range_parse

    assert range_parse(input) == expected


@pytest.mark.parametrize("a,b,expected", [((30, 45), (55, 65), None),
                                          ((48, 65), (45, 55), [48, 55])])
def test_range_intersect(a, b, expected):
    from jcvi.utils.range import range_intersect

    assert range_intersect(a, b) == expected


@pytest.mark.parametrize(
    "a,b,ratio,expected",
    [
        (("1", 30, 45), ("1", 41, 55), False, 5),
        (("1", 21, 45), ("1", 41, 75), True, 0.2),
        (("1", 30, 45), ("1", 15, 55), False, 16),
        (("1", 30, 45), ("1", 15, 55), True, 1.0),
        (("1", 30, 45), ("1", 57, 68), False, 0),
    ],
)
def test_range_overlap(a, b, ratio, expected):
    from jcvi.utils.range import range_overlap

    assert range_overlap(a, b, ratio) == expected