Example #1
0
def uniq(args):
    """
    %prog uniq bedfile > newbedfile

    Remove overlapping features with higher scores.
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so its wording is part of the program output.
    parser = OptionParser(uniq.__doc__)
    opts, args = parser.parse_args(args)

    # Exactly one positional argument (the BED file) is accepted.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    # Output name: the text before the first "." plus ".uniq.bed".
    prefix = bedfile.split(".")[0]
    uniqbedfile = prefix + ".uniq.bed"
    bed = Bed(bedfile)
    # Skip the work when need_update() says no update is required.
    if not need_update(bedfile, uniqbedfile):
        return uniqbedfile

    # One Range per feature; the feature's position in `bed` is stored as the
    # Range id so chained ranges can be mapped back to their BED records.
    ranges = []
    for idx, feat in enumerate(bed):
        ranges.append(
            Range(feat.seqid, feat.start, feat.end, float(feat.score), idx)
        )
    chain, score = range_chain(ranges)
    kept = [bed[r.id] for r in chain]

    newbed = Bed()
    newbed.extend(kept)
    newbed.print_to_file(uniqbedfile, sorted=True)
    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
Example #2
0
def uniq(args):
    """
    %prog uniq bedfile > newbedfile

    Remove overlapping features with higher scores.
    """
    # NOTE: the docstring is reused as the command-line usage message.
    optparser = OptionParser(uniq.__doc__)
    opts, args = optparser.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so the exit status is `not None` (True).
        sys.exit(not optparser.print_help())

    (bedfile,) = args
    uniqbedfile = "{0}.uniq.bed".format(bedfile.split(".")[0])
    bed = Bed(bedfile)
    if need_update(bedfile, uniqbedfile):
        # Build the chaining input: seqid/start/end, the numeric score, and
        # the record index (used afterwards to look the record up in `bed`).
        candidates = [
            Range(rec.seqid, rec.start, rec.end, float(rec.score), n)
            for n, rec in enumerate(bed)
        ]
        chained, total_score = range_chain(candidates)
        survivors = [bed[c.id] for c in chained]

        newbed = Bed()
        newbed.extend(survivors)
        newbed.print_to_file(uniqbedfile, sorted=True)
        logging.debug(
            "Imported: {0}, Exported: {1}".format(len(bed), len(newbed))
        )

    return uniqbedfile
Example #3
0
def get_segments(ranges, extra, minsegment=40):
    """
    Chain `ranges` with range_chain() and cut the axis at the start of every
    range in the selected chain. Picture the projection of the synteny blocks
    onto one axis:

    1=====10......20====30....35====~~

    The docstring of the original describes the result as the blocks
    [1, 20), [20, 35) (an arbitrary right extension rule). `extra` supplies
    additional (left, right) end breaks for chromosomes.

    This is a generator of (left, right) tuples. A cut at a chained range
    start is skipped when it lies closer than `minsegment` to the current
    left boundary.
    """
    from jcvi.utils.range import range_chain, LEFT, RIGHT

    NUL = 2  # tag for a break that comes from a chained range start
    chain, _ = range_chain(ranges)

    # Tagged breakpoints, ordered by position and then by tag.
    breaks = sorted(
        [(r.start, NUL) for r in chain]
        + [(e[0], LEFT) for e in extra]
        + [(e[1], RIGHT) for e in extra]
    )

    left = 0
    for pos, tag in breaks:
        # A LEFT break only resets the left boundary.
        if tag == LEFT:
            left = pos
        if tag == RIGHT:
            # A RIGHT break closes the current segment at `pos`.
            yield left, pos
        elif tag == NUL:
            if pos - left >= minsegment:
                # Close the segment just before the range start and open a
                # new one at the range start.
                yield left, pos - 1
                left = pos
Example #4
0
def get_segments(ranges, extra, minsegment=40):
    """
    Segment an axis using the best-scoring chain of `ranges`.

    The ranges are chained with range_chain(); each selected range start
    becomes a candidate cut point. For a projection such as

    1=====10......20====30....35====~~

    the original documentation gives the blocks [1, 20), [20, 35), using an
    arbitrary right extension rule. `extra` holds additional (left, right)
    end breaks for chromosomes.

    Yields (left, right) tuples; candidate cuts closer than `minsegment` to
    the current left boundary are ignored.
    """
    from jcvi.utils.range import range_chain, LEFT, RIGHT

    NUL = 2  # marks a cut point derived from a chained range
    best_chain, _score = range_chain(ranges)

    cut_points = []
    for rng in best_chain:
        cut_points.append((rng.start, NUL))
    for item in extra:
        cut_points.append((item[0], LEFT))
    for item in extra:
        cut_points.append((item[1], RIGHT))
    cut_points.sort()

    boundary = 0
    for coord, kind in cut_points:
        if kind == LEFT:
            boundary = coord

        if kind == RIGHT:
            yield boundary, coord
        elif kind == NUL:
            too_short = coord - boundary < minsegment
            if not too_short:
                yield boundary, coord - 1
                boundary = coord
Example #5
0
def select_bed(bed):
    """
    Pick the subset of `bed` records chosen by range_chain(), so that high
    scoring blocks win over low scoring alignments when they conflict.

    Each record is converted to a Range carrying its float score and its
    position in `bed` as the id; the records behind the chained ranges are
    returned as a list.
    """
    candidates = []
    for idx, rec in enumerate(bed):
        candidates.append(
            Range(rec.seqid, rec.start, rec.end, float(rec.score), idx)
        )

    chain, _ = range_chain(candidates)
    return [bed[member.id] for member in chain]
Example #6
0
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # The docstring above is also the usage text passed to OptionParser, so it
    # is kept verbatim.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score every mRNA as (1 - AED) * size and file it under its parent id.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    # Within each pile keep the ids selected by range_chain().
    bestids = set()
    for pile in piles:
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn) for x in pile
        ]
        selected_chain, score = range_chain(ranges)
        bestids.update(x.id for x in selected_chain)

    # Everything not retained is listed, sorted, in `removed.ids`. The
    # context manager guarantees the handle is closed even if the write fails.
    removed = set(x.accn for x in allgenes) - bestids
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)

    populate_children(opts.outfile, bestids, gffile, "gene")
Example #7
0
File: qc.py Project: arvin580/jcvi
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # The docstring above is also the usage text passed to OptionParser, so it
    # is kept verbatim.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score every mRNA as (1 - AED) * size and file it under its parent id.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    # Within each pile keep the ids selected by range_chain().
    bestids = set()
    for pile in piles:
        ranges = [to_range(x, score=gene_register[x.accn], id=x.accn)
                  for x in pile]
        selected_chain, score = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    # Everything not retained is listed, sorted, in `removed.ids`.
    # fw.write() replaces the Python-2-only `print >> fw` statement and emits
    # the same bytes (joined ids plus a trailing newline) on Python 2 and 3;
    # the context manager closes the handle even if the write fails.
    removed = set(x.accn for x in allgenes) - bestids
    with open("removed.ids", "w") as fw:
        fw.write("\n".join(sorted(removed)) + "\n")

    populate_children(opts.outfile, bestids, gffile, "gene")
Example #8
0
File: bed.py Project: radaniba/jcvi
def uniq(args):
    """
    %prog uniq bedfile

    Remove overlapping features with higher scores.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(uniq.__doc__)
    parser.add_option("--slen", default=False, action="store_true",
                      help="Use sequence length as score [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    # Both output names share the text before the first "." of the input.
    prefix = bedfile.split(".")[0]
    uniqbedfile = prefix + ".uniq.bed"
    bed = Bed(bedfile)

    # Score each feature either by end - start (--slen) or by its score
    # column; the feature index is carried along as the Range id.
    ranges = []
    for idx, feat in enumerate(bed):
        if opts.slen:
            weight = feat.end - feat.start
        else:
            weight = float(feat.score)
        ranges.append(Range(feat.seqid, feat.start, feat.end, weight, idx))

    chain, score = range_chain(ranges)
    chain_ids = [r.id for r in chain]
    chain_id_set = set(chain_ids)
    kept = [bed[i] for i in chain_ids]
    leftovers = [feat for idx, feat in enumerate(bed)
                 if idx not in chain_id_set]

    newbed = Bed()
    newbed.extend(kept)
    newbed.print_to_file(uniqbedfile, sorted=True)

    # Features that did not make it into the chain go to a separate file,
    # which is only written when there is at least one such feature.
    if leftovers:
        leftoverfile = prefix + ".leftover.bed"
        leftoverbed = Bed()
        leftoverbed.extend(leftovers)
        leftoverbed.print_to_file(leftoverfile, sorted=True)

    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
Example #9
0
File: bed.py Project: yangjl/jcvi
def uniq(args):
    """
    %prog uniq bedfile

    Remove overlapping features with higher scores.
    """
    # NOTE: the docstring is reused as the command-line usage message.
    optparser = OptionParser(uniq.__doc__)
    optparser.add_option(
        "--slen",
        action="store_true",
        default=False,
        help="Use sequence length as score [default: %default]",
    )
    opts, args = optparser.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so the exit status is `not None` (True).
        sys.exit(not optparser.print_help())

    (bedfile,) = args
    stem = bedfile.split(".")[0]
    uniqbedfile = stem + ".uniq.bed"
    bed = Bed(bedfile)

    # Choose the scoring rule once: span (--slen) or the score column.
    if opts.slen:
        score_of = lambda rec: rec.end - rec.start
    else:
        score_of = lambda rec: float(rec.score)

    candidates = [
        Range(rec.seqid, rec.start, rec.end, score_of(rec), n)
        for n, rec in enumerate(bed)
    ]
    chained, total_score = range_chain(candidates)

    # Range ids are indices into `bed`.
    picked_order = [c.id for c in chained]
    picked = set(picked_order)
    survivors = [bed[n] for n in picked_order]
    rejected = [rec for n, rec in enumerate(bed) if n not in picked]

    newbed = Bed()
    newbed.extend(survivors)
    newbed.print_to_file(uniqbedfile, sorted=True)

    # Only write the leftover file when something was actually rejected.
    if rejected:
        leftoverbed = Bed()
        leftoverbed.extend(rejected)
        leftoverbed.print_to_file(stem + ".leftover.bed", sorted=True)

    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
Example #10
0
def test_range_chain(ranges, expected):
    """Check that range_chain(ranges) compares equal to `expected`."""
    from jcvi.utils.range import range_chain

    outcome = range_chain(ranges)
    assert outcome == expected
Example #11
0
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    # The docstring above is also passed to OptionParser as the usage text.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter",
                 default=100,
                 type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="Output symbols rather than gene names [default: %default]")
    p.add_option(
        "--Nm",
        default=10,
        type="int",
        help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids",
                 action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandems genes in output acoording to PATH-TO-TANDEM_FILE, "\
                 "cannot be used with --ascii")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Two positional arguments are required: the bed file and the anchor file.
    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    # With --trackids, the ids of each chain are logged to `<outfile>.tracks`.
    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    # With --mergetandem, map every member of a cluster line to the
    # ";"-joined cluster. Lines are split on any whitespace.
    # NOTE(review): `file()` is a Python 2 builtin and the handle is not
    # explicitly closed here.
    if mergetandem:
        assert not ascii
        tandems = {}
        for row in file(mergetandem):
            row = row.split()
            s = ";".join(row)
            for atom in row:
                tandems[atom] = s

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        # Unzip the block into three parallel tuples.
        q, s, t = zip(*ib)
        # Make `q` the side whose first member appears in the bed order.
        if q[0] not in order:
            q, s = s, q

        # get_range() receives block_pairs; presumably it records the pairs of
        # this block there (it is read back below) -- confirm in get_range().
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        # Both sides are in the bed order: also add the range seen from the
        # other side.
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    fw = must_open(ofile, "w")

    # Repeatedly chain the remaining ranges; each chain becomes one track and
    # its members are removed before the next round. Stops after opts.iter
    # rounds or when no ranges remain.
    tracks = []
    print >> sys.stderr, "Chain started: {0} blocks".format(len(ranges))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            print >> fwlog, ",".join(str(x) for x in sorted(selected))

        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        print >> sys.stderr, msg
        iteration += 1

    # For every bed record, collect one entry per track: the partner found in
    # the first block of that track that has a pair for this record, or ".".
    mbed = []
    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            # In --ascii mode a hit is reduced to the symbol "x".
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((id, atoms))

    # One output line per bed record: the id, a tab, then the track entries
    # (joined without separator in --ascii mode, tab-separated otherwise).
    for id, atoms in mbed:
        sep = "" if ascii else "\t"
        if mergetandem:
            # Replace entries that belong to a tandem cluster by the cluster.
            for i, atom in enumerate(atoms):
                atoms[i] = tandems.get(atom, atom)
        print >> fw, "\t".join((id, sep.join(atoms)))

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
Example #12
0
def supermap(blast_file, filter="intersection", dialect="blast", clip=0):
    """
    Chain the hits of `blast_file` and write the selected lines to a
    `.supermap` file, which is then passed to jcvi.formats.blast.sort().

    blast_file: path of the input file; hit ids are treated as line numbers
        in this file.
    filter: one of "ref", "query", "intersection" or "union". Selects whether
        the hits are chained on the ref side, the query side, or both (with
        the two id sets intersected or united).
    dialect, clip: forwarded unchanged to BlastOrCoordsLine.

    Returns the path of the supermap file (`blast_file` + ".supermap" for
    "intersection", otherwise `blast_file` + "." + filter + ".supermap").

    Raises ValueError for an unknown `filter`; the assertion on a non-empty
    selection is kept from the original.
    """
    # Fail fast on a bad filter. Previously an unknown value ran both
    # chainings and then died with a NameError on `selected_idx`.
    if filter not in ("ref", "query", "intersection", "union"):
        raise ValueError(
            "Unknown filter `{0}`, expected one of: "
            "ref, query, intersection, union".format(filter)
        )

    # filter by query
    if filter != "ref":
        logging.debug("filter by query")
        ranges = list(BlastOrCoordsLine(blast_file, filter="query",
            dialect=dialect, clip=clip))

        query_selected, query_score = range_chain(ranges)
        query_idx = set(x.id for x in query_selected)

    # filter by ref
    if filter != "query":
        logging.debug("filter by ref")
        ranges = list(BlastOrCoordsLine(blast_file, filter="ref",
            dialect=dialect, clip=clip))

        ref_selected, ref_score = range_chain(ranges)
        ref_idx = set(x.id for x in ref_selected)

    if filter == "ref":
        selected_idx = ref_idx
    elif filter == "query":
        selected_idx = query_idx
    elif filter == "intersection":
        logging.debug("perform intersection")
        selected_idx = ref_idx & query_idx
    else:  # "union" -- the only remaining validated value
        logging.debug("perform union")
        selected_idx = ref_idx | query_idx

    assert len(selected_idx) != 0

    tag = "" if filter == "intersection" else "." + filter
    supermapfile = blast_file + tag + ".supermap"

    # selected_idx is in fact the lineno in the BLAST file. Walk the file
    # once, copying the selected lines in order and stopping after the last.
    # next() replaces the Python-2-only `.next()` method, and both handles
    # are closed by the context managers even when an error occurs.
    pending = iter(sorted(selected_idx))
    target = next(pending)
    with open(blast_file) as fp, open(supermapfile, "w") as fw:
        for i, row in enumerate(fp):
            if i < target:
                continue
            fw.write(row.rstrip() + "\n")
            target = next(pending, None)
            if target is None:
                break

    logging.debug("Write output file to `{0}`".format(supermapfile))

    from jcvi.formats.blast import sort
    ofilter = "ref" if filter == "ref" else "query"
    args = [supermapfile, "--" + ofilter]
    if dialect == "coords":
        args += ["--coords"]

    sort(args)

    return supermapfile
Example #13
0
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.
    """
    from jcvi.utils.range import Range, range_chain

    # The docstring above is also passed to OptionParser as the usage text.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter",
                 default=100,
                 type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    # One Range per anchor block, spanning the smallest to the largest
    # `order` value of its members and scored by the number of members.
    # The block index is used as the Range id.
    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        # Make `q` the side whose first member appears in the bed order.
        if q[0] not in order:
            q, s = s, q

        block_pairs[i] = dict(zip(q, s))

        positions = sorted(order[x] for x in q)
        ranges.append(
            Range("0", positions[0], positions[-1], score=len(positions), id=i)
        )

    # Repeatedly chain the remaining ranges; each chain becomes one track and
    # its members are removed before the next round. sys.stderr.write()
    # replaces the Python-2-only `print >>` statement with identical output.
    tracks = []
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Hoisted out of the per-record loop: neither the id list of each track
    # nor the separator depends on the bed record.
    track_id_lists = [[x.id for x in track] for track in tracks]
    sep = "" if ascii else "\t"

    for b in bed:
        accn = b.accn
        atoms = []
        for track_ids in track_id_lists:
            # Default to "." so an empty track cannot leave `anchor` unset or
            # carry over the value from the previous track.
            anchor = "."
            for tid in track_ids:
                anchor = block_pairs[tid].get(accn, ".")
                if anchor != ".":
                    break
            # In --ascii mode a hit is reduced to the symbol "x".
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        sys.stdout.write("\t".join((accn, sep.join(atoms))) + "\n")
Example #14
0
File: ies.py Project: Hensonmw/jcvi
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # The docstring above is also passed to OptionParser as the usage text.
    # The function is a chain of file-producing stages; each stage is guarded
    # by need_update() on its input and output files.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # BAM input: derive the bed name (".sorted." collapsed to ".", ".bam" to
    # ".bed") and create it with bamToBed, keeping the first four columns.
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    # `pf` is the common prefix of all files produced below. A bed that is
    # already named *.sorted.bed is used as is, otherwise sort() is called.
    # NOTE(review): sort() is presumably expected to write `pf`.sorted.bed --
    # confirm against its implementation.
    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Records are grouped by accn; for each group the ranges returned by
        # range_interleave() are written, named `<accn>-d`.
        # NOTE(review): range_interleave() presumably yields the gaps between
        # the ranges of one read -- confirm.
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                # Skip spans shorter than --minspan.
                if end - start + 1 < opts.minspan:
                    continue
                print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, accn + '-d'))
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            # The name encodes a running id and the occurrence count. The id
            # only advances for entries that pass --mindepth.
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name))
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        # Only the upper bound is applied; the lower bound is just logged.
        lb, ub = outlier_cutoff(all_scores)
        logging.debug("Bounds for depths: LB={0:.2f} (ignored)  UB={1:.2f}".format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print >> fw, b
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            # Each record is written twice with modified coordinates: once
            # for its first `flank` positions and once for its last `flank`
            # positions, both clamped to the record itself.
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print >> fw, b
            b.start, b.end = max(start, end - flank + 1), end
            print >> fw, b
        fw.close()

        # Names (column 4) of flanks that intersect the gaps file.
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        # NOTE(review): "-v" presumably makes some() keep the records whose
        # ids are NOT listed -- confirm.
        some([validbedfile, intersectidsfile, "-v",
                "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # The score of each range is the integer after the last "r" in the
        # record name (the count written by the counting stage above).
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i) \
                        for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                        format(iscore, len(iranges)))
        ies_id = 1
        # Chained ranges are renumbered from 1; the last column is the span.
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name, span))
            ies_id += 1
        fw.close()
Example #15
0
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    # The docstring above is also passed to OptionParser as the usage text.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    p.add_option("--Nm", default=10, type="int",
                 help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids", action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandems genes in output acoording to PATH-TO-TANDEM_FILE, "\
                 "cannot be used with --ascii")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Two positional arguments are required: the bed file and the anchor file.
    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    # With --trackids, the ids of each chain are logged to `<outfile>.tracks`.
    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    # With --mergetandem, map every member of a cluster line to the
    # ";"-joined cluster. Lines are split on any whitespace.
    # NOTE(review): `file()` is a Python 2 builtin and the handle is not
    # explicitly closed here.
    if mergetandem:
        assert not ascii
        tandems = {}
        for row in file(mergetandem):
            row = row.split()
            s = ";".join(row)
            for atom in row:
                tandems[atom] = s

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        # Unzip the block into three parallel tuples.
        q, s, t = zip(*ib)
        # Make `q` the side whose first member appears in the bed order.
        if q[0] not in order:
            q, s = s, q

        # get_range() receives block_pairs; presumably it records the pairs of
        # this block there (it is read back below) -- confirm in get_range().
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        # Both sides are in the bed order: also add the range seen from the
        # other side.
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    fw = must_open(ofile, "w")

    # Repeatedly chain the remaining ranges; each chain becomes one track and
    # its members are removed before the next round. Stops after opts.iter
    # rounds or when no ranges remain.
    tracks = []
    print >> sys.stderr, "Chain started: {0} blocks".format(len(ranges))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            print >> fwlog, ",".join(str(x) for x in sorted(selected))

        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        print >> sys.stderr, msg
        iteration += 1

    # For every bed record, collect one entry per track: the partner found in
    # the first block of that track that has a pair for this record, or ".".
    mbed = []
    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            # In --ascii mode a hit is reduced to the symbol "x".
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((id, atoms))

    # One output line per bed record: the id, a tab, then the track entries
    # (joined without separator in --ascii mode, tab-separated otherwise).
    for id, atoms in mbed:
        sep = "" if ascii else "\t"
        if mergetandem:
            # Replace entries that belong to a tandem cluster by the cluster.
            for i, atom in enumerate(atoms):
                atoms[i] = tandems.get(atom, atom)
        print >> fw, "\t".join((id, sep.join(atoms)))

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
Example #16
0
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.
    """
    from jcvi.utils.range import Range, range_chain

    # The docstring above is also passed to OptionParser as the usage text.
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    # One Range per anchor block, spanning the smallest to the largest
    # `order` value of its members and scored by the number of members.
    # The block index is used as the Range id.
    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        # Make `q` the side whose first member appears in the bed order.
        if q[0] not in order:
            q, s = s, q

        block_pairs[i] = dict(zip(q, s))

        positions = sorted(order[x] for x in q)
        ranges.append(
            Range("0", positions[0], positions[-1], score=len(positions), id=i)
        )

    # Repeatedly chain the remaining ranges; each chain becomes one track and
    # its members are removed before the next round. sys.stderr.write()
    # replaces the Python-2-only `print >>` statement with identical output.
    tracks = []
    sys.stderr.write("Chain started: {0} blocks\n".format(len(ranges)))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        sys.stderr.write(msg + "\n")
        iteration += 1

    # Hoisted out of the per-record loop: neither the id list of each track
    # nor the separator depends on the bed record.
    track_id_lists = [[x.id for x in track] for track in tracks]
    sep = "" if ascii else "\t"

    for b in bed:
        accn = b.accn
        atoms = []
        for track_ids in track_id_lists:
            # Default to "." so an empty track cannot leave `anchor` unset or
            # carry over the value from the previous track.
            anchor = "."
            for tid in track_ids:
                anchor = block_pairs[tid].get(accn, ".")
                if anchor != ".":
                    break
            # In --ascii mode a hit is reduced to the symbol "x".
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        sys.stdout.write("\t".join((accn, sep.join(atoms))) + "\n")
Example #17
0
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it must stay in `%prog ...` form.
    #
    # The function is a make-like pipeline: every stage writes a file named
    # after the input prefix `pf` and is skipped when `need_update` reports
    # that the output is already current.
    #   <pf>.sorted.bed   -> reads sorted with `-u --accn`
    #   <pf>.d.bed        -> per-read intervals returned by range_interleave
    #   <pf>.uniq.bed     -> distinct intervals seen >= --mindepth times
    #   <pf>.depth.bed    -> output of `depth` on the reads and the candidates
    #   <pf>.valid.bed    -> candidates whose score is not above the outlier UB
    #   <pf>.selected.bed -> candidates whose flanks do not hit gapsbedfile
    #   <pf>.ies.bed      -> subset selected by range_chain
    # Nothing is returned; all results are written to those files.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth",
                 default=3,
                 type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan",
                 default=30,
                 type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args

    # A BAM input is first converted to a 4-column BED next to it.
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        # Already sorted: strip both suffixes to get the prefix.
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        with open(ibedfile, "w") as fw:
            logging.debug("Write deletions to `{0}`.".format(ibedfile))
            # groupby only merges consecutive records, hence the accn sort.
            for accn, bb in groupby(bed, key=lambda x: x.accn):
                branges = [(x.seqid, x.start, x.end) for x in bb]
                for seqid, start, end in range_interleave(branges):
                    if end - start + 1 < opts.minspan:
                        continue
                    # `start - 1` mirrors the original output convention.
                    print("\t".join(str(x) for x in
                                    (seqid, start - 1, end, accn + '-d')),
                          file=fw)

    # Uniqify the candidate deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        with open(countbedfile, "w") as fw:
            logging.debug("Write counts to `{0}`.".format(countbedfile))
            registry = Counter((x.seqid, x.start, x.end) for x in bed)
            ies_id = 1
            for (seqid, start, end), count in registry.items():
                if count < opts.mindepth:
                    continue
                # The name carries the supporting read count after "-r";
                # the chaining stage below parses it back out as the score.
                ies_name = "{0:05d}-r{1}".format(ies_id, count)
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, ies_name)), file=fw)
                ies_id += 1
        # The file must be closed (flushed) before it is sorted.
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored)  UB={1:.2f}".format(
                lb, ub))
        # Open the output only once the cutoff is known, so a failure above
        # does not leave an empty (but "up to date") file behind.
        with open(validbedfile, "w") as fw:
            for b in bed:
                if float(b.score) > ub:
                    continue
                print(b, file=fw)

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        bed = Bed(validbedfile)
        flank = 100
        with open(flanksbedfile, "w") as fw:
            logging.debug(
                "Write deletion flanks to `{0}`.".format(flanksbedfile))
            for b in bed:
                start, end = b.start, b.end
                # Left flank, clipped so it never extends past the feature.
                b.start, b.end = start, min(start + flank - 1, end)
                print(b, file=fw)
                # Right flank, clipped the same way.
                b.start, b.end = max(start, end - flank + 1), end
                print(b, file=fw)

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score each candidate by the count embedded after the last "r" of
        # its name (see the "-r<count>" naming above).
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".
                      format(iscore, len(iranges)))
        with open(iesbedfile, "w") as fw:
            for ies_id, (seqid, start, end, score, _) in \
                    enumerate(iranges, 1):
                ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
                span = end - start + 1
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, ies_name, span)),
                      file=fw)
Example #18
0
def supermap(blast_file, filter="intersection", dialect="blast", clip=0):
    """
    Keep only the alignment lines of `blast_file` selected by range chaining
    and write them to a `.supermap` file.

    Lines are chained on the query axis, the reference axis, or both, and
    the two selections are combined according to `filter`:

    - "query": lines selected on the query axis only
    - "ref": lines selected on the reference axis only
    - "intersection": lines selected on both axes (default)
    - "union": lines selected on either axis

    `dialect` and `clip` are forwarded unchanged to BlastOrCoordsLine. The
    output is `blast_file + ".supermap"` for "intersection" and
    `blast_file + "." + filter + ".supermap"` otherwise; it is passed to
    jcvi.formats.blast.sort before returning.

    Returns the output file name. Raises ValueError for an unsupported
    `filter` and AssertionError when no line is selected.
    """
    if filter not in ("ref", "query", "intersection", "union"):
        # Fail before doing any work; this used to surface only later as an
        # unbound `selected_idx`.
        raise ValueError("Unknown filter `{0}`".format(filter))

    # filter by query
    if filter != "ref":
        logging.debug("filter by query")
        ranges = list(
            BlastOrCoordsLine(blast_file,
                              filter="query",
                              dialect=dialect,
                              clip=clip))

        query_selected, _ = range_chain(ranges)
        query_idx = set(x.id for x in query_selected)

    # filter by ref
    if filter != "query":
        logging.debug("filter by ref")
        ranges = list(
            BlastOrCoordsLine(blast_file,
                              filter="ref",
                              dialect=dialect,
                              clip=clip))

        ref_selected, _ = range_chain(ranges)
        ref_idx = set(x.id for x in ref_selected)

    if filter == "ref":
        selected_idx = ref_idx

    elif filter == "query":
        selected_idx = query_idx

    elif filter == "intersection":
        logging.debug("perform intersection")
        selected_idx = ref_idx & query_idx

    else:  # "union"
        logging.debug("perform union")
        selected_idx = ref_idx | query_idx

    # Kept as an assert so callers still see AssertionError on an empty
    # selection.
    assert len(selected_idx) != 0

    if filter == "intersection":
        tag = ""
    else:
        tag = "." + filter
    supermapfile = blast_file + tag + ".supermap"

    # selected_idx is in fact the lineno in the BLAST file, so walk the file
    # and the sorted line numbers in lockstep.
    selected_idx = iter(sorted(selected_idx))
    selected = next(selected_idx)
    with open(blast_file) as fp, open(supermapfile, "w") as fw:
        for i, row in enumerate(fp):
            if i < selected:
                continue
            print(row.rstrip(), file=fw)
            try:
                selected = next(selected_idx)
            except StopIteration:
                # Every selected line has been written.
                break

    logging.debug("Write output file to `{0}`".format(supermapfile))

    from jcvi.formats.blast import sort
    ofilter = "ref" if filter == "ref" else "query"
    args = [supermapfile, "--" + ofilter]
    if dialect == "coords":
        args += ["--coords"]

    sort(args)

    return supermapfile