Ejemplo n.º 1
0
def main(blastfile, p, opts):
    """Run the forward (and, unless self-comparing, mirrored) batch queries.

    Results go to a freshly created ``synteny`` table in the SQLite file
    named by ``opts.sqlite`` when that option is set, otherwise to the file
    opened from ``opts.outfile``.

    Args:
        blastfile: BLAST file passed to ``check_beds`` and ``read_blast``.
        p: option parser, forwarded to ``check_beds``.
        opts: parsed options; ``sqlite``, ``strip_names`` and ``outfile`` are
            read here and the whole object is forwarded to ``batch_query``.
    """
    sqlite = opts.sqlite
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)
    filtered_blast = read_blast(blastfile, qorder, sorder,
                                is_self=is_self, ostrip=opts.strip_names)
    all_data = [(b.qi, b.si) for b in filtered_blast]

    conn = c = fw = None
    try:
        if sqlite:
            conn = sqlite3.connect(sqlite)
            c = conn.cursor()
            c.execute("drop table if exists synteny")
            c.execute("create table synteny (query text, anchor text, "
                      "gray varchar(1), score integer, dr integer, "
                      "orientation varchar(1), qnote text, snote text)")
        else:
            fw = must_open(opts.outfile, "w")

        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=False)
        if qbed.filename == sbed.filename:
            logging.debug("Self comparisons, mirror ignored")
        else:
            batch_query(qbed, sbed, all_data, opts, fw=fw, c=c,
                        transpose=True)

        if sqlite:
            c.execute("create index q on synteny (query)")
            conn.commit()
    finally:
        # Release whichever sink was opened, even when a query raised.
        # Closing the cursor alone does not close the connection.
        if c is not None:
            c.close()
        if conn is not None:
            conn.close()
        if fw is not None:
            fw.close()
Ejemplo n.º 2
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above doubles as the command-line usage text, so the
    # implementation notes live in comments.
    #
    # args: command-line arguments; exactly one positional (the anchors file).
    # Output: one BED line per query feature that has an anchor, written to
    # opts.outfile via Bed.print_to_file(sorted=True).
    # Raises ValueError when get_number() cannot extract a number from the
    # subject seqid.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    # query accession -> list of anchored subject accessions
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        # NOTE(review): only the LAST subject paired with this query survives
        # the loop below; the code after it runs once per query. Behaviour is
        # kept as-is -- confirm whether one BED line per pair was intended.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale
        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError(
                "`{0}` is on `{1}` with no number to extract".format(
                    saccn, sseqid))
        bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                            "{0}:{1}".format(newsseqid, sstart)))
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Ejemplo n.º 3
0
Archivo: hic.py Proyecto: xuanblo/jcvi
def movieframe(args):
    """
    %prog movieframe tour test.clm contigs.ref.anchors

    Draw heatmap and synteny in the same plot.
    """
    # args: command-line arguments; three positionals are required -- a
    # comma-separated tour, a CLM file and an anchors file.
    # Output: a single image saved through savefig().
    parser = OptionParser(movieframe.__doc__)
    parser.add_option("--label", help="Figure title")
    parser.set_beds()
    parser.set_outfile(outfile=None)
    opts, args, iopts = parser.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=120)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    tour_arg, clmfile, anchorsfile = args
    contigs = tour_arg.split(",")

    # Fall back to a name derived from the image format / file name.
    if opts.outfile:
        image_name = opts.outfile
    else:
        image_name = "movieframe." + iopts.format
    if opts.label:
        label = opts.label
    else:
        label = op.basename(image_name).rsplit(".", 1)[0]

    clm = CLMFile(clmfile)
    totalbins, bins, breaks = make_bins(contigs, clm.tig_to_size)
    matrix = read_clm(clm, totalbins, bins)

    # Axes are created in the same order as before: canvas, heatmap,
    # dot plot, dot plot canvas.
    fig = plt.figure(1, (iopts.w, iopts.h))
    canvas = fig.add_axes([0, 0, 1, 1])
    heatmap_ax = fig.add_axes([.05, .1, .4, .8])
    dotplot_ax = fig.add_axes([.55, .1, .4, .8])
    dotplot_canvas = fig.add_axes([.5, 0, .5, 1])

    # Left half: contact heatmap.
    plot_heatmap(heatmap_ax, matrix, breaks, iopts)

    # Right half: synteny dot plot, with the BED files left unsorted.
    qbed, sbed, _qorder, _sorder, _is_self = check_beds(
        anchorsfile, parser, opts, sorted=False)
    dotplot(anchorsfile, qbed, sbed, fig, dotplot_canvas, dotplot_ax,
            sep=False, title="")

    text_style = dict(ha="center", va="center")
    canvas.text(.5, .98, clm.name, color="g", **text_style)
    canvas.text(.5, .95, label, color="darkslategray", **text_style)
    normalize_axes(canvas)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Ejemplo n.º 4
0
def offdiag(args):
    """
    %prog offdiag diploid.napus.1x1.lifted.anchors

    Find gene pairs that are off diagnoal. "Off diagonal" are the pairs that are
    not on the orthologous chromosomes. For example, napus chrA01 and brapa A01.
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional (the anchors file).
    # Output: a tab-separated table on stdout, one row per off-diagonal pair.
    p = OptionParser(offdiag.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    pf = "-".join(anchorsfile.split(".")[:2])
    header = "Block-id|Napus|Diploid|Napus-chr|Diploid-chr|RBH?".split("|")
    # `with` guarantees the anchors file is closed, including on errors.
    with open(anchorsfile) as fp:
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print("\t".join(header))
        i = -1  # index of the current block; bumped on every '#' line
        for row in fp:
            if row[0] == '#':
                i += 1
                continue
            q, s, score = row.split()
            # A score ending in 'L' is reported as a non-RBH pair.
            rbh = 'no' if score[-1] == 'L' else 'yes'
            qi, qq = qorder[q]
            si, ss = sorder[s]
            oqseqid = qseqid = qq.seqid
            osseqid = sseqid = ss.seqid
            # Normalise both seqids to a three-character chromosome code
            # so they can be compared directly.
            sseqid = sseqid.split("_")[0][-3:]
            if qseqid[0] == 'A':
                qseqid = qseqid[-3:]       # A09 => A09
            elif qseqid[0] == 'C':
                qseqid = 'C0' + qseqid[-1]  # C9 => C09
            else:
                continue
            if qseqid == sseqid or sseqid[-2:] == 'nn':
                continue
            block_id = pf + "-block-{0}".format(i)
            print("\t".join((block_id, q, s, oqseqid, osseqid, rbh)))
Ejemplo n.º 5
0
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional (the anchors file).
    # Output: `<anchorfile minus last extension>.mergechrom.anchors`, with one
    # "###"-headed block per (query chromosome, subject chromosome) pair.
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"

    qchrom_dic = dict((b.accn, b.seqid) for b in qbed)
    schrom_dic = dict((b.accn, b.seqid) for b in sbed)

    # Group block indices by the chromosome pair of each block's first anchor.
    block_dic = dict()
    blocks = af.blocks
    for i, block in enumerate(blocks):
        q, s, score = block[0]
        k = "%s_%s" % (qchrom_dic[q], schrom_dic[s])
        block_dic.setdefault(k, []).append(i)

    # The output is opened only after every lookup succeeded, so a bad
    # accession no longer leaves a truncated file behind, and `with` closes
    # the handle even if a write fails.
    with open(newanchorfile, "w") as fw:
        for idxs in block_dic.values():
            print("#" * 3, file=fw)
            for i in idxs:
                for q, s, score in blocks[i]:
                    print("\t".join((q, s, str(score))), file=fw)

    print("%d blocks merged to %d" % (len(blocks), len(block_dic)))
Ejemplo n.º 6
0
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional (the anchors file).
    # Output: `<anchorfile minus last extension>.collinear.anchors`.
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"

    # `with` closes the output even when a lookup or get_collinear() raises.
    with open(newanchorfile, "w") as fw:
        for block in af.blocks:
            print("#" * 3, file=fw)

            # Translate accessions into BED order indices for the chainer.
            iblock = []
            for q, s, score in block:
                qi, q = qorder[q]
                si, s = sorder[s]
                # NOTE(review): long() is kept on purpose -- presumably it
                # tolerates scores carrying a trailing 'L' under Python 2;
                # confirm before replacing it with a plain int().
                score = int(long(score))
                iblock.append([qi, si, score])

            # Map the surviving indices back to accessions.
            for qi, si, score in get_collinear(iblock):
                q = qbed[qi].accn
                s = sbed[si].accn
                print("\t".join((q, s, str(score))), file=fw)
Ejemplo n.º 7
0
def dotplot_main(args):
    """Parse the command line and render a dot plot for one anchors file.

    Args:
        args: command-line arguments; exactly one positional is required.
            A file ending in ``.ks`` is first converted to
            ``<file>.anchors`` (only when ``need_update`` says so) and the
            converted file is plotted.

    The image is written to ``opts.outfile`` or, by default, to the anchors
    file name with its extension replaced by ``opts.format``.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
            help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
            help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
            help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
            help="genome names for labeling axes in the form of qname_sname, " \
            "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
            help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
            help="Do not render labels with size smaller than")
    p.add_option("--colormap",
            help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
            help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
            help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
            help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
            help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Default colormap caption for Ks input unless the user gave one.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            # One shared set: both axes show the same seqids.
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the resolved `cmaptext`, not `opts.cmaptext`: the latter dropped
    # the Ks default computed above.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    image_name = opts.outfile or \
            (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Ejemplo n.º 8
0
def _classify_hits(proxy, hits, as_region):
    # Return (tag, region): tag is "S" and region is the first hit that
    # overlaps `proxy`; otherwise tag is "NS" and region is the first hit.
    tag, best = "NS", hits[0]
    for hit in hits:
        if range_overlap(proxy, as_region(hit)):
            tag, best = "S", hit
            break
    return tag, as_region(best)


def _report_loss(blocksfile, genomicblast, p, opts):
    # Compute proxy regions for unmatched genes, tag them against the
    # genomic hits and print one line per query gene.
    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    genetrack = {}
    proxytrack = {}
    # `with` closes the blocks file, including on a malformed row.
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Runs of consecutive rows sharing the same second column.
    data = [(key, list(rows))
            for key, rows in groupby(blocks, key=lambda x: x[-1])]

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # Only interior runs of '.' have a flanking gene on both sides.
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        as_region = lambda b: (b.seqid, b.start, b.end)
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue
            tag, hsp = _classify_hits(proxytrack[query], bb, as_region)
            proxytrack[query] = hsp
            tags[query] = tag
    else:
        blast = Blast(genomicblast)
        as_region = lambda b: (b.subject, b.sstart, b.sstop)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue
            tag, hsp = _classify_hits(proxytrack[query], bb, as_region)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            # "NF" marks genes that have a proxy region but no genomic hit.
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print("\t".join((b.seqid, accn, target_region)))


def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; one or two positionals (blocks file and an
    # optional genomic BLAST/BED file).
    # Output: tab-separated lines (seqid, accession, target region) on stdout.
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic hits supplied: stand in with an empty placeholder file.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    try:
        _report_loss(blocksfile, genomicblast, p, opts)
    finally:
        # Remove the placeholder even when the analysis raised.
        if emptyblast:
            sh("rm -f {0}".format(genomicblast))
Ejemplo n.º 9
0
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; two positionals (BLAST file, CDT file).
    # Returns: list of (query partition, subject partition, score) tuples whose
    # score reaches -log(1e-30); each one is also printed to stdout.
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option(
        "--cutoff",
        default=.3,
        type="float",
        help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    # member name -> index of the partition that contains it
    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update((x, i) for x in part)
    for i, part in enumerate(sparts):
        sid.update((x, i) for x in part)

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    # Kept separate from the clustering `cutoff` above: this one is the
    # significance threshold on the -log scale.
    score_cutoff = -log(pvalue_cutoff)

    significant = []
    # range() instead of xrange(): the latter does not exist on Python 3.
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < score_cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".\
                    format(len(significant), pvalue_cutoff))

    return significant
Ejemplo n.º 10
0
def blastfilter_main(blast_file, p, opts):
    """Filter a BLAST file and write the survivors to ``<blast_file>.filtered``.

    Hits are visited by descending score; self hits, hits whose names are
    missing from the BED orders and repeated (query, subject) pairs are
    dropped. The optional cscore and local-duplicate (tandem) filters then
    run when ``opts.cscore`` / ``opts.tandem_Nmax`` are set.

    Args:
        blast_file: path of the BLAST file to filter.
        p: option parser, forwarded to ``check_beds``.
        opts: parsed options; reads ``tandem_Nmax``, ``cscore``,
            ``strip_names``, ``tandems_only``, ``qbed`` and ``sbed``.
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # `with` closes the handle used only for counting lines.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0  # shared by query and subject warnings; capped at 100
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                    qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                    sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # Blasts are sorted by score, so the first pair seen is the best one.
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only written with --tandems_only; the
        # handles are closed as soon as write_localdups() is done with them.
        qdups_fh = sdups_fh = None
        try:
            if opts.tandems_only:
                qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w")

            if is_self:
                for s in standems:
                    qtandems.join(*s)
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_to_mother = qdups_to_mother
            else:
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                if opts.tandems_only:
                    sdups_fh = open(
                        op.splitext(opts.sbed)[0] + ".localdups", "w")
                sdups_to_mother = write_localdups(standems, sbed, sdups_fh)
        finally:
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            #sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
Ejemplo n.º 11
0
            help="genome names for labeling axes in the form of qname_sname, " \
            "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
            help="Maximum number of data points to plot [default: %default]")
    p.add_option("--ignore", type="float", default=.005,
            help="Do not render labels for chr less than portion of genome [default: %default]")
    p.add_option("--palette",
            help="Two column file, block id to color mapping [default: %default]")
    opts, args, iopts = p.set_image_options(sys.argv[1:], figsize="8x8", dpi=90)

    if len(args) != 1:
        sys.exit(not p.print_help())

    synteny = opts.synteny
    vmin, vmax = opts.vmin, opts.vmax
    cmap_text = opts.cmap
    genomenames = opts.genomenames
    sample_number = opts.sample_number
    palette = opts.palette
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    image_name = op.splitext(anchorfile)[0] + "." + opts.format
    dotplot_main(anchorfile, qbed, sbed, image_name, iopts, vmin=0, vmax=1,
            is_self=is_self, synteny=synteny, cmap_text=cmap_text, \
            genomenames=genomenames, sample_number=sample_number,
            ignore=opts.ignore, palette=palette)
Ejemplo n.º 12
0
Archivo: quota.py Proyecto: rrane/jcvi
def main(args):
    """Screen synteny blocks with the quota mapping procedure.

    Args:
        args: command-line arguments; exactly one positional (the anchors /
            cluster file).

    The selected block ids are written, comma-separated, to
    ``<input minus last extension>.<qa>x<qb>``. With ``--screen`` a new
    anchors file is produced through ``jcvi.compara.synteny.screen``.

    Raises:
        Exception: when ``--self`` is given with unequal quota numbers.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option("--quota", default="1:1",
            help="`quota mapping` procedure -- screen blocks to constrain mapping"\
                    " (useful for orthology), "\
                    "put in the format like (#subgenomes expected for genome X):"\
                    "(#subgenomes expected for genome Y) "\
                    "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
            help="distance cutoff to tolerate two blocks that are "\
                    "slightly overlapping (cutoff for `quota mapping`) "\
                    "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match",
            action="store_true", default=False,
            help="you might turn this on when screening paralogous blocks, "\
                 "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver", default="SCIP", choices=supported_solvers,
            help="use MIP solver [default: %default]")
    p.add_option("--verbose", action="store_true",
            default=False, help="show verbose solver output")

    p.add_option("--screen", default=False, action="store_true",
            help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    # An empty or missing quota is rejected here with the same message as a
    # malformed one, instead of leaving `quota`, `qa` and `qb` undefined
    # further down.
    try:
        qa, qb = opts.quota.split(":")
        qa, qb = int(qa), int(qb)
    except (AttributeError, ValueError):
        sys.stderr.write(
            "quota string should be the form x:x (2:4, 1:3, etc.)\n")
        sys.exit(1)

    if opts.self_match and qa != qb:
        raise Exception("when comparing genome to itself, "
                        "quota must be the same number "
                        "(like 1:1, 2:2) you have %s" % opts.quota)
    quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir,
            Nmax=opts.Nmax, self_match=self_match,
            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    try:
        # Same bytes as the former `print >> fw, ...`: the ids plus a newline.
        fw.write(",".join(str(x) for x in selected_ids) + "\n")
    finally:
        fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
Ejemplo n.º 13
0
def blastfilter_main(blast_file, p, opts):
    """Filter a BLAST file and write the survivors to ``<blast_file>.filtered``.

    Hits are visited by descending score; self hits, hits whose names are
    missing from the BED orders and repeated (query, subject) pairs are
    dropped. The optional exclude, cscore and local-duplicate (tandem)
    filters then run when the matching option is set.

    Args:
        blast_file: path of the BLAST file to filter.
        p: option parser, forwarded to ``check_beds``.
        opts: parsed options; reads ``tandem_Nmax``, ``cscore``, ``exclude``,
            ``strip_names``, ``tandems_only``, ``qbed`` and ``sbed``.
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore
    exclude = opts.exclude

    # `with` closes the handle used only for counting lines.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != "#")
    logging.debug(
        "Load BLAST file `{}` (total {} lines)".format(blast_file, total_lines)
    )
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0  # shared by query and subject warnings; capped at 100
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{} not in {}".format(query, qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{} not in {}".format(subject, sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # Blasts are sorted by score, so the first pair seen is the best one.
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = [str(k) for k in key]

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if exclude:
        before_filter = len(filtered_blasts)
        logging.debug("running excluded pairs (--exclude `{}`) ..".format(exclude))
        filtered_blasts = list(filter_exclude(filtered_blasts, exclude=exclude))
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    if tandem_Nmax:
        logging.debug(
            "running the local dups filter (tandem_Nmax={}) ..".format(tandem_Nmax)
        )

        qtandems = tandem_grouper(filtered_blasts, flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(filtered_blasts, flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only written with --tandems_only; the
        # handles are closed as soon as write_localdups() is done with them.
        qdups_fh = sdups_fh = None
        try:
            if opts.tandems_only:
                qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w")

            if is_self:
                for s in standems:
                    qtandems.join(*s)
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_to_mother = qdups_to_mother
            else:
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                if opts.tandems_only:
                    sdups_fh = open(
                        op.splitext(opts.sbed)[0] + ".localdups", "w"
                    )
                sdups_to_mother = write_localdups(standems, sbed, sdups_fh)
        finally:
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            # sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(
            filter_tandem(filtered_blasts, qdups_to_mother, sdups_to_mother)
        )
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
Ejemplo n.º 14
0
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    # The docstring above doubles as the command-line usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; two positionals (BLAST file, CDT file).
    # Returns: list of (query partition, subject partition, score) tuples whose
    # score reaches -log(1e-30); each one is also printed to stdout.
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option("--cutoff", default=.3, type="float",
                 help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    # member name -> index of the partition that contains it
    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update((x, i) for x in part)
    for i, part in enumerate(sparts):
        sid.update((x, i) for x in part)

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    # Kept separate from the clustering `cutoff` above: this one is the
    # significance threshold on the -log scale.
    score_cutoff = - log(pvalue_cutoff)

    significant = []
    # range() instead of xrange(): the latter does not exist on Python 3.
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < score_cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".\
                    format(len(significant), pvalue_cutoff))

    return significant
Ejemplo n.º 15
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # The docstring above is passed to OptionParser as the usage text, so
    # maintainer notes are kept in comments.
    #
    # args: command-line tokens; exactly two positionals (blastfile,
    #       anchorfile) are required, otherwise help is printed and we exit.
    # Side effects: writes <qpf>.pad.bed, <spf>.pad.bed and
    #       <qpf>.<spf>.logmp.txt, then invokes the external `cluster` binary
    #       when need_update() says the .cdt output must be (re)built.
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize",
                 default=10,
                 type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path",
                 default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # One Range per synteny block on each axis, spanning the smallest to the
    # largest gene order index seen in the block; ids start at 1.
    for block_id, block in enumerate(ac.iter_blocks(minsize=minsize), 1):
        # zip() is an iterator on Python 3 and cannot be sliced directly.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)

        qranges.append(Range("0", minq, maxq, maxq - minq, block_id))
        sranges.append(Range("0", mins, maxs, maxs - mins, block_id))

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    _, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    _, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, _ = logmp.shape

    # Tab-delimited matrix: header row of subject PAD names, then one row per
    # query PAD with scores formatted to one decimal place.
    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        for i in range(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Ejemplo n.º 16
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # NOTE: the docstring is reused as the OptionParser usage string; keep
    # implementation notes in comments so the help output does not change.
    #
    # args: list of command-line tokens (blastfile and anchorfile positionals
    #       plus options); a wrong positional count prints help and exits.
    # Writes the two PAD BED files and the log-score matrix, then runs the
    # external CLUSTER 3.0 binary if need_update() reports the .cdt is needed.
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    block_id = 0  # renamed from `id`, which shadowed the builtin
    for block in ac.iter_blocks(minsize=minsize):
        # Materialize zip() first: on Python 3 it is an iterator and slicing
        # it directly raises TypeError.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_id += 1

        qr = Range("0", minq, maxq, maxq - minq, block_id)
        sr = Range("0", mins, maxs, maxs - mins, block_id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    # Only the PAD names are needed here; the counts are discarded.
    qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)[1]
    spadnames = write_PAD_bed(spadfile, spf, spads, sbed)[1]

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    nrows = logmp.shape[0]

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    # Context manager so the matrix is flushed and closed even if a row
    # fails to format, before the external tool reads it.
    with open(matrixfile, "w") as fw:
        print("\t".join(["o"] + spadnames), file=fw)
        for i in range(nrows):  # xrange is unavailable on Python 3
            cells = ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join([qpadnames[i]] + cells), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Ejemplo n.º 17
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring doubles as the OptionParser usage text, so the notes for
    # maintainers are written as comments.
    #
    # args: command-line tokens; exactly one positional (the anchors file).
    # Output: one BED line per query feature that has an anchor partner,
    #         written through Bed.print_to_file() to opts.outfile.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    parser = OptionParser(bed.__doc__)
    parser.add_option("--switch", default=False, action="store_true",
                      help="Switch reference and aligned map elements")
    parser.add_option("--scale", type="float",
                      help="Scale the aligned map distance by factor")
    parser.set_beds()
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]
    switch, scale = opts.switch, opts.scale

    # Query accession -> list of partner accessions, in file order.
    partners = defaultdict(list)
    for query, subject, _ in AnchorFile(anchorsfile).iter_pairs():
        partners[query].append(subject)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)
    out = Bed()
    for qline in qbed:
        qseqid, qstart, qend, qaccn = (qline.seqid, qline.start,
                                       qline.end, qline.accn)
        if qaccn not in partners:
            continue

        # NOTE(review): every partner is looked up in sorder, but only the
        # last one survives the loop and is written out -- confirm whether
        # one BED line per partner was intended.
        for name in partners[qaccn]:
            _, sline = sorder[name]
        sseqid, sstart, send, saccn = (sline.seqid, sline.start,
                                       sline.end, sline.accn)

        if switch:
            (qseqid, qstart, qend, qaccn,
             sseqid, sstart, send, saccn) = (sseqid, sstart, send, saccn,
                                             qseqid, qstart, qend, qaccn)
        if scale:
            sstart = sstart / scale

        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            raise ValueError(
                "`{0}` is on `{1}` with no number to extract".format(
                    saccn, sseqid))

        fields = (qseqid, qstart - 1, qend,
                  "{0}:{1}".format(newsseqid, sstart))
        out.add("\t".join(map(str, fields)))

    out.print_to_file(filename=opts.outfile, sorted=True)
Ejemplo n.º 18
0
def dotplot_main(args):
    """
    Command-line entry point: parse options, load the BED files and draw a
    dot plot for one anchors (or .ks) file.

    args: list of command-line tokens; exactly one positional (the anchor
    file) is required, otherwise the help is printed and the process exits.
    The figure is saved to opts.outfile, or to the anchor file's base name
    plus the chosen image format when no outfile is given.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option(
        "--synteny",
        default=False,
        action="store_true",
        help="Run a fast synteny scan and display blocks",
    )
    p.add_option("--cmaptext",
                 help="Draw colormap box on the bottom-left corner")
    p.add_option(
        "--vmin",
        dest="vmin",
        type="float",
        default=0,
        help="Minimum value in the colormap",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        type="float",
        default=2,
        help="Maximum value in the colormap",
    )
    p.add_option(
        "--nmax",
        dest="sample_number",
        type="int",
        default=10000,
        help="Maximum number of data points to plot",
    )
    p.add_option(
        "--minfont",
        type="int",
        default=4,
        help="Do not render labels with size smaller than",
    )
    p.add_option("--colormap",
                 help="Two column file, block id to color mapping")
    p.add_option(
        "--colororientation",
        action="store_true",
        default=False,
        help="Color the blocks based on orientation, similar to mummerplot",
    )
    p.add_option(
        "--nosort",
        default=False,
        action="store_true",
        help="Do not sort the seqids along the axes",
    )
    p.add_option("--nosep",
                 default=False,
                 action="store_true",
                 help="Do not add contig lines")
    p.add_option("--title", help="Title of the dot plot")
    p.set_dotplot_opts()
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args,
                                            figsize="9x9",
                                            style="dark",
                                            dpi=90,
                                            cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile,
                                                     p,
                                                     opts,
                                                     sorted=(not opts.nosort))

    # An explicit palette file wins over orientation-based colouring.
    palette = opts.colormap
    if palette:
        palette = Palette(palettefile=palette)
    elif opts.colororientation:
        palette = Palette.from_block_orientation(anchorfile, qbed, sbed)

    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Default colormap label for Ks input when --cmaptext is not given.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    if opts.skipempty:
        # Keep only the seqids that actually carry at least one anchor.
        ac = AnchorFile(anchorfile)
        if is_self:
            # Deliberately the same set object: both axes share seqids.
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            _, q = qorder[q]
            _, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the dot plot

    dotplot(
        anchorfile,
        qbed,
        sbed,
        fig,
        root,
        ax,
        vmin=opts.vmin,
        vmax=opts.vmax,
        is_self=is_self,
        synteny=opts.synteny,
        # Pass the resolved label; opts.cmaptext would discard the Ks
        # default computed above.
        cmap_text=cmaptext,
        cmap=iopts.cmap,
        genomenames=opts.genomenames,
        sample_number=opts.sample_number,
        minfont=opts.minfont,
        palette=palette,
        sep=(not opts.nosep),
        sepcolor=set1[int(opts.theme)],
        title=opts.title,
        stdpf=(not opts.nostdpf),
        chpf=(not opts.nochpf),
    )

    image_name = opts.outfile or (op.splitext(anchorfile)[0] + "." +
                                  opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Ejemplo n.º 19
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # The docstring is used verbatim as the OptionParser usage text, so the
    # implementation notes are kept in comments.
    #
    # args: command-line tokens; three positionals are required (ploidy file,
    #       anchors file, blast file), otherwise help is printed and we exit.
    # Side effects: runs cscore() to produce <blast prefix>.cscore and writes
    #       <blast prefix>.weights with one "a<TAB>b<TAB>weight" line per
    #       retained pair.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # First whitespace-delimited token of each ploidy line -> its line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on exit, including on a malformed row.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Non-anchor pairs are kept only as RBH hits passing both
                # cutoffs, and only when RBH hits are enabled.
                if norbh or c < cs or pct < pctid:
                    continue
                # Floor division keeps the weight an integer on Python 3 too.
                c //= 10  # This severely penalizes RBH against synteny

            # fw.write works on both Python 2 and 3 (unlike `print >> fw`).
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Ejemplo n.º 20
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: this docstring is the OptionParser usage string; maintainer
    # documentation therefore lives in comments.
    #
    # args: command-line tokens with three positionals (ploidy file, anchors
    #       file, blast file); any other count prints help and exits.
    # Side effects: calls cscore() to write <blast prefix>.cscore, then
    #       writes <blast prefix>.weights (tab-separated a, b, weight).
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true", help="Disable RBH hits")
    p.add_option(
        "--pctid", default=0, type="int", help="Percent id cutoff for RBH hits"
    )
    p.add_option("--cscore", default=90, type="int", help="C-score cutoff for RBH hits")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Index each ploidy line by its first whitespace-delimited token.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # The cscore handle used to be left open; `with` closes both files.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # `/=` is true division on Python 3 and would write floats
                # such as "9.5"; floor division keeps integer weights.
                c //= 10  # This severely penalizes RBH against synteny

            print("\t".join((a, b, str(c))), file=fw)
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Ejemplo n.º 21
0
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring is the OptionParser usage text; notes go in comments.
    #
    # args: command-line tokens; exactly one positional (the anchors file).
    # Output: for each block whose two chromosomes form a target pair, two
    #         BED lines (one per side) are added and the sorted result is
    #         emitted through Bed.print_to_file().
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1), (1, 6), (1, 8), (1, 13),
        (2, 4),
        (3, 12), (3, 14),
        (5, 6), (5, 8),
        (7, 9), (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers that may be used as the name of a painted region.
    naming_chromosomes = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Return the ordered (low, high) chromosome-number pair when it is a
        # target pair, else None. Skipped when neither name contains "chr".
        if not ("chr" in achr or "chr" in bchr):
            return None
        first, second = get_number(achr), get_number(bchr)
        pair = (second, first) if first > second else (first, second)
        return pair if pair in target_pairs else None

    def build_bedline(astart, aend, target_pair):
        # Name the region after the first member of the pair that is one of
        # the naming chromosomes.
        candidates = [str(x) for x in target_pair if x in naming_chromosomes]
        fields = (astart.seqid, astart.start, aend.end, candidates[0])
        return "\t".join(map(str, fields))

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    outbed = Bed()
    for block in AnchorFile(anchorsfile).blocks:
        qnames, snames, _ = zip(*block)
        qhits = [qorder[name] for name in qnames]
        shits = [sorder[name] for name in snames]
        # min()/max() compare the order entries themselves; element [1] of
        # the winning entry is the feature used for coordinates.
        astart, aend = min(qhits)[1], max(qhits)[1]
        bstart, bend = min(shits)[1], max(shits)[1]

        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))

    outbed.print_to_file(sorted=True)
Ejemplo n.º 22
0
def main(args):
    """
    Command-line entry point for quota-based screening of synteny blocks.

    args: list of command-line tokens; exactly one positional (the anchors
    file) is required, otherwise the help is printed and the process exits.

    Writes the selected block ids, comma-separated on one line, to
    "<anchors prefix>.<qa>x<qb>"; with --screen it additionally calls
    screen() to produce "<anchors prefix>.<qa>x<qb>.anchors".

    Exits with status 1 when --quota is not of the form "x:y" with integer
    x and y. Raises ValueError when --self is given with unequal quotas.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option("--quota", default="1:1",
            help="`quota mapping` procedure -- screen blocks to constrain mapping"
                    " (useful for orthology), "
                    "put in the format like (#subgenomes expected for genome X):"
                    "(#subgenomes expected for genome Y) "
                    "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
            help="distance cutoff to tolerate two blocks that are "
                    "slightly overlapping (cutoff for `quota mapping`) "
                    "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match",
            action="store_true", default=False,
            help="you might turn this on when screening paralogous blocks, "
                 "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver",
                 default="SCIP",
                 choices=supported_solvers,
                 help="use MIP solver [default: %default]")
    p.set_verbose(help="Show verbose solver output")

    p.add_option("--screen",
                 default=False,
                 action="store_true",
                 help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    # An empty or missing quota is rejected here as well; previously it
    # skipped validation and failed later with a NameError.
    try:
        qa, qb = opts.quota.split(":")
        qa, qb = int(qa), int(qb)
    except (AttributeError, ValueError):
        sys.stderr.write(
            "quota string should be the form x:x (2:4, 1:3, etc.)\n")
        sys.exit(1)

    self_match = opts.self_match
    if self_match and qa != qb:
        # ValueError is still caught by callers handling Exception.
        raise ValueError("when comparing genome to itself, "
                         "quota must be the same number "
                         "(like 1:1, 2:2) you have %s" % opts.quota)
    quota = (qa, qb)

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir,
            Nmax=opts.Nmax, self_match=self_match,
            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    # fw.write works on both Python 2 and 3 (unlike `print >> fw`).
    fw.write(",".join(str(x) for x in selected_ids) + "\n")
    fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        # Forward the BED locations only when both were given explicitly.
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)