Example #1
def main(args):

    p = OptionParser(__doc__)
    p.set_beds()
    p.set_stripnames()
    p.add_option(
        "--tandems_only",
        dest="tandems_only",
        action="store_true",
        default=False,
        help="only calculate tandems, write .localdup file and exit.",
    )
    p.add_option("--tandem_Nmax", type="int", default=10, help="merge tandem genes within distance [default: %default]")
    p.add_option(
        "--cscore",
        type="float",
        default=0.7,
        help="retain hits that have good bitscore. a value of 0.5 means "
        "keep all values that are 50% or greater of the best hit. "
        "higher is more stringent [default: %default]",
    )

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    blastfilter_main(blastfile, p, opts)
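All of the entry points in this collection take an argv-style list of strings. A minimal driver sketch for calling one of them from a script, assuming this main() is defined in the module being run (the module name and the input file my.blast are hypothetical):

if __name__ == "__main__":
    import sys

    # e.g. `python <module>.py my.blast --cscore=0.7 --tandems_only`
    # where my.blast is a placeholder BLAST tabular file
    main(sys.argv[1:])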
Example #2
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header", default=False, action="store_true",
                 help="First line is header [default: %default]")
    p.add_option("--write_blast", default=False, action="store_true",
                 help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)
    fw = open("pairs", "w")
    for a, b, h in bk.iter_all_pairs():
        print >> fw, "\t".join((a, b))
    fw.close()

    if opts.write_blast:
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    fw = open("tracks", "w")
    for g, col in bk.iter_gene_col():
        print >> fw, "\t".join(str(x) for x in (g, col))
    fw.close()
Example #3
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale
        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            raise ValueError, "`{0}` is on `{1}` with no number to extract".\
                                format(saccn, sseqid)
        bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                            "{0}:{1}".format(newsseqid, sstart)))
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
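These subcommands can also be driven programmatically by passing the argument list directly. A hedged sketch, assuming p.set_beds() registers --qbed/--sbed and p.set_outfile() registers --outfile as used by check_beds above (all file names below are hypothetical):

# Convert a hypothetical a.b.anchors to BED, swapping reference and aligned elements
bed([
    "a.b.anchors",
    "--qbed=a.bed",
    "--sbed=b.bed",
    "--switch",            # swap reference and aligned map elements
    "--outfile=a.b.bed",   # written sorted via Bed.print_to_file
])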
Example #4
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    print >> sys.stderr, "Genome {0} depths:".format(qgenome)
    ds, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        return

    print >> sys.stderr, "Genome {0} depths:".format(sgenome)
    ds, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))
Example #5
def layout(args):
    """
    %prog layout query.subject.simple query.seqids subject.seqids

    Compute optimal seqids order in a second genome, based on seqids on one
    genome, given the pairwise blocks in .simple format.
    """
    from jcvi.algorithms.ec import GA_setup, GA_run

    p = OptionParser(layout.__doc__)
    p.set_beds()
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    simplefile, qseqids, sseqids = args
    qbed, sbed, qorder, sorder, is_self = check_beds(simplefile, p, opts)

    qseqids = qseqids.strip().split(",")
    sseqids = sseqids.strip().split(",")
    qseqids_ii = dict((s, i) for i, s in enumerate(qseqids))
    sseqids_ii = dict((s, i) for i, s in enumerate(sseqids))

    blocks = SimpleFile(simplefile).blocks
    scores = defaultdict(int)
    for a, b, c, d, score, orientation, hl in blocks:
        qi, q = qorder[a]
        si, s = sorder[c]
        qseqid, sseqid = q.seqid, s.seqid
        if sseqid not in sseqids:
            continue
        scores[sseqids_ii[sseqid], qseqid] += score

    data = []
    for (a, b), score in sorted(scores.items()):
        if b not in qseqids_ii:
            continue
        data.append((qseqids_ii[b], score))

    tour = range(len(qseqids))
    toolbox = GA_setup(tour)
    toolbox.register("evaluate", colinear_evaluate_weights, data=data)
    tour, fitness = GA_run(toolbox, ngen=100, npop=100, cpus=opts.cpus)
    tour = [qseqids[x] for x in tour]

    print ",".join(tour)
Example #6
def mtdotplots(args):
    """
    %prog mtdotplots Mt3.5 Mt4.0 medicago.medicago.lifted.1x1.anchors

    Plot Mt3.5 and Mt4.0 side-by-side. This is essentially combined from two
    graphics.dotplot() function calls as panel A and B.
    """
    from jcvi.graphics.dotplot import check_beds, dotplot

    p = OptionParser(mtdotplots.__doc__)
    p.set_beds()
    opts, args, iopts = p.set_image_options(args, figsize="16x8", dpi=90)

    if len(args) != 3:
        sys.exit(not p.print_help())

    a, b, ac = args
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    r1 = fig.add_axes([0, 0, .5, 1])
    r2 = fig.add_axes([.5, 0, .5, 1])
    a1 = fig.add_axes([.05, .1, .4, .8])
    a2 = fig.add_axes([.55, .1, .4, .8])

    anchorfile = op.join(a, ac)
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    dotplot(anchorfile, qbed, sbed, fig, r1, a1, is_self=is_self,
            genomenames="Mt3.5_Mt3.5")

    opts.qbed = opts.sbed = None
    anchorfile = op.join(b, ac)
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    dotplot(anchorfile, qbed, sbed, fig, r2, a2, is_self=is_self,
            genomenames="Mt4.0_Mt4.0")

    root.text(.03, .95, "A", ha="center", va="center", size=36)
    root.text(.53, .95, "B", ha="center", va="center", size=36)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "mtdotplots"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #7
File: hic.py Project: xuanblo/jcvi
def movieframe(args):
    """
    %prog movieframe tour test.clm contigs.ref.anchors

    Draw heatmap and synteny in the same plot.
    """
    p = OptionParser(movieframe.__doc__)
    p.add_option("--label", help="Figure title")
    p.set_beds()
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
                                            style="white", cmap="coolwarm",
                                            format="png", dpi=120)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tour, clmfile, anchorsfile = args
    tour = tour.split(",")
    image_name = opts.outfile or ("movieframe." + iopts.format)
    label = opts.label or op.basename(image_name).rsplit(".", 1)[0]

    clm = CLMFile(clmfile)
    totalbins, bins, breaks = make_bins(tour, clm.tig_to_size)
    M = read_clm(clm, totalbins, bins)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])        # whole canvas
    ax1 = fig.add_axes([.05, .1, .4, .8])    # heatmap
    ax2 = fig.add_axes([.55, .1, .4, .8])    # dot plot
    ax2_root = fig.add_axes([.5, 0, .5, 1])  # dot plot canvas

    # Left axis: heatmap
    plot_heatmap(ax1, M, breaks, iopts)

    # Right axis: synteny
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts,
                                                     sorted=False)
    dotplot(anchorsfile, qbed, sbed, fig, ax2_root, ax2, sep=False, title="")

    root.text(.5, .98, clm.name, color="g", ha="center", va="center")
    root.text(.5, .95, label, color="darkslategray", ha="center", va="center")
    normalize_axes(root)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #8
def offdiag(args):
    """
    %prog offdiag diploid.napus.1x1.lifted.anchors

    Find gene pairs that are off diagonal. "Off diagonal" pairs are those that are
    not on the orthologous chromosomes. For example, napus chrA01 and brapa A01.
    """
    p = OptionParser(offdiag.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    fp = open(anchorsfile)
    pf = "-".join(anchorsfile.split(".")[:2])
    header = "Block-id|Napus|Diploid|Napus-chr|Diploid-chr|RBH?".split("|")
    print "\t".join(header)
    i = -1
    for row in fp:
        if row[0] == '#':
            i += 1
            continue
        q, s, score = row.split()
        rbh = 'no' if score[-1] == 'L' else 'yes'
        qi, qq = qorder[q]
        si, ss = sorder[s]
        oqseqid = qseqid = qq.seqid
        osseqid = sseqid = ss.seqid
        sseqid = sseqid.split("_")[0][-3:]
        if qseqid[0] == 'A':
            qseqid = qseqid[-3:]       # A09 => A09
        elif qseqid[0] == 'C':
            qseqid = 'C0' + qseqid[-1]  # C9 => C09
        else:
            continue
        if qseqid == sseqid or sseqid[-2:] == 'nn':
            continue
        block_id = pf + "-block-{0}".format(i)
        print "\t".join((block_id, q, s, oqseqid, osseqid, rbh))
Example #9
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, using dynamic programming in a
    procedure similar to DAGchainer.
    """
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"
    fw = open(newanchorfile, "w")

    blocks = af.blocks
    for block in blocks:
        print >> fw, "#" * 3
        iblock = []
        for q, s, score in block:
            qi, q = qorder[q]
            si, s = sorder[s]
            score = int(long(score))
            iblock.append([qi, si, score])

        block = get_collinear(iblock)

        for q, s, score in block:
            q = qbed[q].accn
            s = sbed[s].accn
            print >> fw, "\t".join((q, s, str(score)))

    fw.close()
Example #10
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, using dynamic programming in a
    procedure similar to DAGchainer.
    """
    p = OptionParser(collinear.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".collinear.anchors"
    fw = open(newanchorfile, "w")

    blocks = af.blocks
    for block in blocks:
        print("#" * 3, file=fw)
        iblock = []
        for q, s, score in block:
            qi, q = qorder[q]
            si, s = sorder[s]
            score = int(score.rstrip("L"))  # py3-safe equivalent of int(long(score)); some anchors carry a trailing "L" flag
            iblock.append([qi, si, score])

        block = get_collinear(iblock)

        for q, s, score in block:
            q = qbed[q].accn
            s = sbed[s].accn
            print("\t".join((q, s, str(score))), file=fw)

    fw.close()
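Both collinear variants above write the same .collinear.anchors layout: blocks delimited by lines of "###", with one tab-separated query/subject/score triple per line. A minimal stand-alone reader sketch of that layout, shown only to make the format explicit (the helper name read_anchor_blocks is hypothetical; the real code uses AnchorFile):

def read_anchor_blocks(path):
    """Read blocks separated by '###' lines; each data line is
    query<TAB>subject<TAB>score."""
    blocks, current = [], []
    with open(path) as fp:
        for row in fp:
            row = row.strip()
            if not row:
                continue
            if row.startswith("#"):      # block separator
                if current:
                    blocks.append(current)
                    current = []
                continue
            q, s, score = row.split()[:3]
            current.append((q, s, score))
    if current:
        blocks.append(current)
    return blocks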
Example #11
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"
    fw = open(newanchorfile, "w")

    qchrom_dic = dict((b.accn,b.seqid) for b in qbed)
    schrom_dic = dict((b.accn,b.seqid) for b in sbed)
    block_dic = dict()
    blocks = af.blocks
    for (i,block) in enumerate(blocks):
        q, s, score = block[0]
        qchrom, schrom = qchrom_dic[q], schrom_dic[s]
        k = "%s_%s" % (qchrom, schrom)
        if k not in block_dic: block_dic[k] = []
        block_dic[k].append(i)

    for (k, idxs) in block_dic.items():
        print("#" * 3, file=fw)
        for i in idxs:
            for q, s, score in blocks[i]:
                print("\t".join((q, s, str(score))), file=fw)

    fw.close()
    print("%d blocks merged to %d" % (len(blocks), len(block_dic.keys())))
Example #12
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"
    fw = open(newanchorfile, "w")

    qchrom_dic = dict((b.accn, b.seqid) for b in qbed)
    schrom_dic = dict((b.accn, b.seqid) for b in sbed)
    block_dic = dict()
    blocks = af.blocks
    for (i, block) in enumerate(blocks):
        q, s, score = block[0]
        qchrom, schrom = qchrom_dic[q], schrom_dic[s]
        k = "%s_%s" % (qchrom, schrom)
        if k not in block_dic: block_dic[k] = []
        block_dic[k].append(i)

    for (k, idxs) in block_dic.items():
        print("#" * 3, file=fw)
        for i in idxs:
            for q, s, score in blocks[i]:
                print("\t".join((q, s, str(score))), file=fw)

    fw.close()
    print("%d blocks merged to %d" % (len(blocks), len(block_dic.keys())))
Example #13
def main(args):

    p = OptionParser(__doc__)
    p.set_beds()
    p.set_stripnames()
    p.add_option("--tandems_only", dest="tandems_only",
            action="store_true", default=False,
            help="only calculate tandems, write .localdup file and exit.")
    p.add_option("--tandem_Nmax", type="int", default=10,
            help="merge tandem genes within distance [default: %default]")
    p.add_option("--cscore", type="float", default=.7,
            help="retain hits that have good bitscore. a value of 0.5 means "
                 "keep all values that are 50% or greater of the best hit. "
                 "higher is more stringent [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    blastfilter_main(blastfile, p, opts)
Example #14
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header",
                 default=False,
                 action="store_true",
                 help="First line is header [default: %default]")
    p.add_option(
        "--write_blast",
        default=False,
        action="store_true",
        help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)
    fw = open("pairs", "w")
    for a, b, h in bk.iter_all_pairs():
        print >> fw, "\t".join((a, b))
    fw.close()

    if opts.write_blast:
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    fw = open("tracks", "w")
    for g, col in bk.iter_gene_col():
        print >> fw, "\t".join(str(x) for x in (g, col))
    fw.close()
Example #15
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for block in ac.iter_blocks(minsize=minsize):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1

        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print("\t".join(header), file=fw)
    for i in range(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join(row), file=fw)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Example #16
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option("--cutoff", default=.3, type="float",
                 help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    cutoff = - log(pvalue_cutoff)

    significant = []
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".\
                    format(len(significant), pvalue_cutoff))

    return significant
Example #17
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    p = OptionParser(movie.__doc__)
    p.add_option("--frames",
                 default=500,
                 type="int",
                 help="Only plot every N frames")
    p.add_option("--engine",
                 default="ffmpeg",
                 choices=("ffmpeg", "gifsicle"),
                 help="Movie engine, output MP4 or GIF")
    p.set_beds()
    opts, args, iopts = p.set_image_options(args,
                                            figsize="16x8",
                                            style="white",
                                            cmap="coolwarm",
                                            format="png",
                                            dpi=300)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tourfile, clmfile, lastfile = args
    tourfile = op.abspath(tourfile)
    clmfile = op.abspath(clmfile)
    lastfile = op.abspath(lastfile)
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
                prepare_synteny(tourfile, lastfile, odir, p, opts)

    args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        padi = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile have the serial number in
        # their names, otherwise parallelization may fail
        a, b = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = a + "_" + padi + "." + b
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        qb = Bed()
        for contig, o in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            bedlines = contig_to_beds[contig][:]
            if o == '-':
                bedlines.reverse()
            for x in bedlines:
                qb.append(x)

        a, b = op.basename(qbedfile).split(".", 1)
        ibedfile = a + "_" + padi + "." + b
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = padi + "." + iopts.format

        tour = ",".join(tour)
        args.append([[
            tour, clmfile, ianchorsfile, "--outfile", image_name, "--label",
            label
        ]])

    Jobs(movieframe, args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
Example #18
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1    GeneA2    GeneB1    GeneB2   +/-      score

    Optional additional columns:
    orderA1   orderA2   orderB1   orderB2  sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true", \
                help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                  "SizeA|SizeB|Size|Block"

    fws = open(simplefile, "w")
    if header:
        print >> fws, "\t".join(h.split("|"))

    atotalbase = btotalbase = 0
    for i, block in enumerate(blocks):

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)

        astarti, aendi = min(ia), max(ia)
        bstarti, bendi = min(ib), max(ib)
        astart, aend = min(a)[1].accn, max(a)[1].accn
        bstart, bend = min(b)[1].accn, max(b)[1].accn

        sizeA = len(set(ia))
        sizeB = len(set(ib))
        size = len(block)

        slope, intercept = np.polyfit(ia, ib, 1)
        orientation = "+" if slope >= 0 else '-'
        aspan = aendi - astarti + 1
        bspan = bendi - bstarti + 1
        score = int((aspan * bspan)**.5)
        score = str(score)
        block_id = pf + "-block-{0}".format(i)

        if coords:

            aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
            bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
            abase = aendbase - astartbase + 1
            bbase = bendbase - bstartbase + 1
            atotalbase += abase
            btotalbase += bbase

            # Write dual lines
            aargs = [
                block_id, aseqid, astartbase, aendbase, abase, astart, aend,
                aspan, "+"
            ]
            bargs = [
                block_id, bseqid, bstartbase, bendbase, bbase, bstart, bend,
                bspan, orientation
            ]

            if bed:
                bbed.append(BedLine("\t".join(str(x) for x in \
                           (bseqid, bstartbase - 1, bendbase,
                           "{}:{}-{}".format(aseqid, astartbase, aendbase),
                           size, orientation))))

            for args in (aargs, bargs):
                print >> fws, "\t".join(str(x) for x in args)
            continue

        args = [astart, aend, bstart, bend, score, orientation]
        if additional:
            args += [
                astarti, aendi, bstarti, bendi, sizeA, sizeB, size, block_id
            ]
        print >> fws, "\t".join(str(x) for x in args)

    fws.close()
    logging.debug("A total of {0} blocks written to `{1}`.".format(
        i + 1, simplefile))

    if coords:
        print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2))
        print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2))
        print >> sys.stderr, "Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / min(atotalbase, btotalbase))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
Example #19
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option(
        "--cutoff",
        default=0.3,
        type="float",
        help="The clustering cutoff to call similar",
    )

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    cutoff = -log(pvalue_cutoff)

    significant = []
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug(
        "Collected {0} PAR comparisons significant at (P < {1}).".format(
            len(significant), pvalue_cutoff
        )
    )

    return significant
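For reference, the significance test above compares the log-scale scores against -log(1e-30); assuming log here is math.log (natural log), the threshold works out as follows:

from math import log

pvalue_cutoff = 1e-30
cutoff = -log(pvalue_cutoff)  # about 69.08 in natural-log units
# so any cell with logmp[i, j] >= 69.08 corresponds to P <= 1e-30 and is reported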
Example #20
        logging.debug("Self comparisons, mirror ignored")
    else:
        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
    else:
        fw.close()


if __name__ == '__main__':

    p = OptionParser(__doc__)
    p.set_beds()
    p.set_stripnames()
    p.set_outfile()

    coge_group = OptionGroup(p, "CoGe-specific options")
    coge_group.add_option("--sqlite", help="Write sqlite database")
    coge_group.add_option("--qnote",
                          default="null",
                          help="Query dataset group id")
    coge_group.add_option("--snote",
                          default="null",
                          help="Subject dataset group id")

    params_group = OptionGroup(p, "Synteny parameters")
    params_group.add_option("--window",
                            type="int",
Example #21
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    fp = open(ploidy)
    genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))
    fp.close()

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    fp = open(cscorefile)
    fw = open(weightsfile, "w")
    npairs = 0
    for row in fp:
        a, b, c, pct = row.split()
        c, pct = float(c), float(pct)
        c = int(c * 100)
        if (a, b) not in pairs:
            if norbh:
                continue
            if c < cs:
                continue
            if pct < pctid:
                continue
            c /= 10  # This severely penalizes RBH against synteny

        print >> fw, "\t".join((a, b, str(c)))
        npairs += 1
    fw.close()

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Example #22
def main(args):
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option(
        "--quota",
        default="1:1",
        help="`quota mapping` procedure -- screen blocks to constrain mapping"
        " (useful for orthology), "
        "put in the format like (#subgenomes expected for genome X):"
        "(#subgenomes expected for genome Y) "
        "[default: %default]")
    p.add_option("--Nm",
                 dest="Nmax",
                 type="int",
                 default=10,
                 help="distance cutoff to tolerate two blocks that are "
                 "slightly overlapping (cutoff for `quota mapping`) "
                 "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option(
        "--self",
        dest="self_match",
        action="store_true",
        default=False,
        help="you might turn this on when screening paralogous blocks, "
        "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver",
                 default="SCIP",
                 choices=supported_solvers,
                 help="use MIP solver [default: %default]")
    p.set_verbose(help="Show verbose solver output")

    p.add_option("--screen",
                 default=False,
                 action="store_true",
                 help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    if opts.quota:
        try:
            qa, qb = opts.quota.split(":")
            qa, qb = int(qa), int(qb)
        except ValueError:
            print("quota string should be the form x:x (2:4, 1:3, etc.)",
                  file=sys.stderr)
            sys.exit(1)

        if opts.self_match and qa != qb:
            raise Exception("when comparing genome to itself, "
                            "quota must be the same number "
                            "(like 1:1, 2:2) you have %s" % opts.quota)
        quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters,
                            quota,
                            work_dir=work_dir,
                            Nmax=opts.Nmax,
                            self_match=self_match,
                            solver=opts.solver,
                            verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    print(",".join(str(x) for x in selected_ids), file=fw)
    fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
Example #23
File: hic.py Project: xuanblo/jcvi
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    p = OptionParser(movie.__doc__)
    p.add_option("--frames", default=500, type="int",
                 help="Only plot every N frames")
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
                 help="Movie engine, output MP4 or GIF")
    p.set_beds()
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
                                            style="white", cmap="coolwarm",
                                            format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tourfile, clmfile, lastfile = args
    tourfile = op.abspath(tourfile)
    clmfile = op.abspath(clmfile)
    lastfile = op.abspath(lastfile)
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, p, opts)

    args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        padi = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile have the serial number in
        # their names, otherwise parallelization may fail
        a, b = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = a + "_" + padi + "." + b
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        qb = Bed()
        for contig, o in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            bedlines = contig_to_beds[contig][:]
            if o == '-':
                bedlines.reverse()
            for x in bedlines:
                qb.append(x)

        a, b = op.basename(qbedfile).split(".", 1)
        ibedfile = a + "_" + padi + "." + b
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = padi + "." + iopts.format

        tour = ",".join(tour)
        args.append([[tour, clmfile, ianchorsfile,
                    "--outfile", image_name, "--label", label]])

    Jobs(movieframe, args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
Example #24
File: quota.py Project: rrane/jcvi
def main(args):
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option("--quota", default="1:1",
            help="`quota mapping` procedure -- screen blocks to constrain mapping"\
                    " (useful for orthology), "\
                    "put in the format like (#subgenomes expected for genome X):"\
                    "(#subgenomes expected for genome Y) "\
                    "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
            help="distance cutoff to tolerate two blocks that are "\
                    "slightly overlapping (cutoff for `quota mapping`) "\
                    "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match",
            action="store_true", default=False,
            help="you might turn this on when screening paralogous blocks, "\
                 "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver", default="SCIP", choices=supported_solvers,
            help="use MIP solver [default: %default]")
    p.add_option("--verbose", action="store_true",
            default=False, help="show verbose solver output")

    p.add_option("--screen", default=False, action="store_true",
            help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    if opts.quota:
        try:
            qa, qb = opts.quota.split(":")
            qa, qb = int(qa), int(qb)
        except:
            print >> sys.stderr, "quota string should be the form x:x (2:4, 1:3, etc.)"
            sys.exit(1)

        if opts.self_match and qa != qb:
            raise Exception, "when comparing genome to itself, " \
                    "quota must be the same number " \
                    "(like 1:1, 2:2) you have %s" % opts.quota
        quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir, \
            Nmax=opts.Nmax, self_match=self_match, \
            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    print >> fw, ",".join(str(x) for x in selected_ids)
    fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
Example #25
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist",
                 default=20,
                 type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist",
                 default=20000,
                 type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    fp = open(blocksfile)
    genetrack = {}
    proxytrack = {}
    for row in fp:
        a, b = row.split()
        genetrack[a] = b
        blocks.append((a, b))

    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print "\t".join((b.seqid, accn, target_region))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
Example #26
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    p = OptionParser(ancestral.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1),
        (1, 6),
        (1, 8),
        (1, 13),
        (2, 4),
        (3, 12),
        (3, 14),
        (5, 6),
        (5, 8),
        (7, 9),
        (7, 11),
        (9, 10),
        (10, 11),
    }

    def get_target(achr, bchr):
        if "chr" not in achr and "chr" not in bchr:
            return None
        achr, bchr = get_number(achr), get_number(bchr)
        if achr > bchr:
            achr, bchr = bchr, achr
        if (achr, bchr) in target_pairs:
            return achr, bchr
        return None

    def build_bedline(astart, aend, target_pair):
        # target_name = "{:02d}-{:02d}".format(*target_pair)
        target_name = [
            str(x) for x in target_pair if x in (1, 2, 3, 5, 7, 10)
        ][0]
        return "\t".join(
            str(x)
            for x in (astart.seqid, astart.start, aend.end, target_name))

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    ac = AnchorFile(anchorsfile)
    blocks = ac.blocks
    outbed = Bed()
    for i, block in enumerate(blocks):
        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        astart, aend = min(a)[1], max(a)[1]
        bstart, bend = min(b)[1], max(b)[1]
        # Now convert to BED lines with new accn
        achr, bchr = astart.seqid, bstart.seqid
        target = get_target(achr, bchr)
        if target is None:
            continue
        outbed.add(build_bedline(astart, aend, target))
        outbed.add(build_bedline(bstart, bend, target))
    outbed.print_to_file(sorted=True)
Example #27
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    fp = open(blocksfile)
    genetrack = {}
    proxytrack = {}
    for row in fp:
        a, b = row.split()
        genetrack[a] = b
        blocks.append((a, b))

    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print "\t".join((b.seqid, accn, target_region))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
Example #28
def dotplot_main(args):
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
            help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
            help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
            help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
            help="genome names for labeling axes in the form of qname_sname, " \
            "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
            help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
            help="Do not render labels with size smaller than")
    p.add_option("--colormap",
            help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
            help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
            help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
            help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
            help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                sorted=(not opts.nosort))

    if opts.skipempty:
        ac = AnchorFile(anchorfile)
        if is_self:
            qseqids = sseqids = set()
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    image_name = opts.outfile or \
            (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
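
A hedged usage sketch: since dotplot_main() parses a plain argument list, it can be
driven directly from Python. The anchors and BED file names below are placeholders.

# Illustrative call only; replace the file names with real inputs.
dotplot_main(["grape.peach.anchors",
              "--qbed=grape.bed", "--sbed=peach.bed",
              "--skipempty", "--title=grape vs peach"])
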
Example #29
0
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths of the two genomes under comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print >> sys.stderr, "{}:".format(qtag)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        return

    stag = "Genome {0} depths".format(sgenome)
    print >> sys.stderr, "{}:".format(stag)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    xmax = opts.xmax or max(4, max(dsq.keys() + dss.keys()))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
                    .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    vs, pattern = title.split('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
                                 color="tomato", size=16)
    print >> sys.stderr, title

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
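
The multiplicity summarized above is, for each gene position, the number of block
ranges covering it; the percentages printed to stderr follow from that count. A
standalone sketch of the idea (illustrative only, not the jcvi.utils.range
implementation):

from collections import Counter

def depth_histogram(ranges, size):
    # ranges: inclusive (start, end) gene-index intervals; size: genes on the axis
    cov = [0] * size
    for start, end in ranges:
        for i in range(start, end + 1):
            cov[i] += 1
    # fraction of the genome at each depth, mirroring the stderr summary
    return {d: n / float(size) for d, n in Counter(cov).items()}
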
Example #30
0
        logging.debug("Self comparisons, mirror ignored")
    else:
        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
    else:
        fw.close()


if __name__ == '__main__':

    p = OptionParser(__doc__)
    p.set_beds()
    p.set_stripnames()
    p.set_outfile()

    coge_group = OptionGroup(p, "CoGe-specific options")
    coge_group.add_option("--sqlite", help="Write sqlite database")
    coge_group.add_option("--qnote", default="null",
                          help="Query dataset group id")
    coge_group.add_option("--snote", default="null",
                          help="Subject dataset group id")

    params_group = OptionGroup(p, "Synteny parameters")
    params_group.add_option("--window", type="int", default=40,
                            help="Synteny window size")
    params_group.add_option("--cutoff", type="float", default=.1,
                            help="Minimum number of anchors to call synteny")
Example #31
0
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1    GeneA2    GeneB1    GeneB2   +/-      score

    Optional additional columns:
    orderA1   orderA2   orderB1   orderB2  sizeA    sizeB   size    block_id

    With base coordinates (--coords):
    block_id  seqidA    startA    endA     bpSpanA  GeneA1   GeneA2  geneSpanA
    block_id  seqidB    startB    endB     bpSpanB  GeneB1   GeneB2  geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true", \
                help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                  "SizeA|SizeB|Size|Block"

    fws = open(simplefile, "w")
    if header:
        print >> fws, "\t".join(h.split("|"))

    atotalbase = btotalbase = 0
    for i, block in enumerate(blocks):

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)

        astarti, aendi = min(ia), max(ia)
        bstarti, bendi = min(ib), max(ib)
        astart, aend = min(a)[1].accn, max(a)[1].accn
        bstart, bend = min(b)[1].accn, max(b)[1].accn

        sizeA = len(set(ia))
        sizeB = len(set(ib))
        size = len(block)

        slope, intercept = np.polyfit(ia, ib, 1)
        orientation = "+" if slope >= 0 else '-'
        aspan = aendi - astarti + 1
        bspan = bendi - bstarti + 1
        score = int((aspan * bspan) ** .5)
        score = str(score)
        block_id = pf + "-block-{0}".format(i)

        if coords:

            aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
            bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
            abase = aendbase - astartbase + 1
            bbase = bendbase - bstartbase + 1
            atotalbase += abase
            btotalbase += bbase

            # Write dual lines
            aargs = [block_id, aseqid, astartbase, aendbase,
                     abase, astart, aend, aspan, "+"]
            bargs = [block_id, bseqid, bstartbase, bendbase,
                     bbase, bstart, bend, bspan, orientation]

            if bed:
                bbed.append(BedLine("\t".join(str(x) for x in \
                           (bseqid, bstartbase - 1, bendbase,
                           "{}:{}-{}".format(aseqid, astartbase, aendbase),
                           size, orientation))))

            for args in (aargs, bargs):
                print >> fws, "\t".join(str(x) for x in args)
            continue

        args = [astart, aend, bstart, bend, score, orientation]
        if additional:
            args += [astarti, aendi, bstarti, bendi,
                     sizeA, sizeB, size, block_id]
        print >> fws, "\t".join(str(x) for x in args)

    fws.close()
    logging.debug("A total of {0} blocks written to `{1}`.".format(i + 1, simplefile))

    if coords:
        print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2))
        print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2))
        print >> sys.stderr, "Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / min(atotalbase, btotalbase))

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
Example #32
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PADs. This is the method described in the Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axes can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option(
        "--minsize", default=10, type="int", help="Only segment using blocks >= size"
    )
    p.add_option(
        "--path", default="~/scratch/bin", help="Path to the CLUSTER 3.0 binary"
    )

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for block in ac.iter_blocks(minsize=minsize):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1

        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print("\t".join(header), file=fw)
    for i in range(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join(row), file=fw)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
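
Because the last step shells out to the CLUSTER 3.0 binary, --path must point at a
directory that actually contains the `cluster` executable. A hedged invocation
sketch (file names and the binary location are placeholders):

cluster(["grape.peach.blast", "grape.peach.anchors",
         "--qbed=grape.bed", "--sbed=peach.bed",
         "--minsize=15", "--path=/usr/local/bin"])
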
Example #33
0
def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line, e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with a span smaller than this.
    5. Option --minsize: remove blocks with fewer anchors than this.
    """
    p = OptionParser(screen.__doc__)
    p.set_beds()

    p.add_option("--ids", help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra", action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan", default=0, type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize", default=0, type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option("--simple", action="store_true",
                 help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    idsfile = opts.ids
    seqidsfile = opts.seqids
    seqpairsfile = opts.seqpairs
    minspan = opts.minspan
    minsize = opts.minsize
    osimple = opts.simple
    nointra = opts.nointra
    ids, seqids, seqpairs = None, None, None

    if idsfile:
        ids = SetFile(idsfile, delimiter=',')
        ids = set(int(x) for x in ids)
    if seqidsfile:
        seqids = SetFile(seqidsfile, delimiter=',')
    if seqpairsfile:
        fp = open(seqpairsfile)
        seqpairs = set()
        for row in fp:
            a, b = row.strip().split(",")
            seqpairs.add((a, b))
            seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    fw = open(newanchorfile, "w")

    for i, block in enumerate(blocks):
        if ids and i not in ids:
            continue

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)
        aspan = max(ia) - min(ia) + 1
        bspan = max(ib) - min(ib) + 1
        aseqid = oa[0].seqid
        bseqid = ob[0].seqid

        if seqids:
            if (aseqid not in seqids) or (bseqid not in seqids):
                continue

        if seqpairs:
            if (aseqid, bseqid) not in seqpairs:
                continue

        if nointra and aseqid == bseqid:
            continue

        if minsize:
            if len(block) < minsize:
                continue

        if minspan:
            if aspan < minspan or bspan < minspan:
                continue

        selected += 1
        print >> fw, "###"
        for line in block:
            print >> fw, "\t".join(line)

    fw.close()

    if osimple:
        simple([newanchorfile, "--noheader", \
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".\
                  format(len(blocks), selected))
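
For example, to keep only inter-chromosomal blocks with at least 30 anchors and
also write the .simple companion file, a call might look like this (file names
are placeholders):

screen(["grape.peach.anchors", "grape.peach.screened.anchors",
        "--qbed=grape.bed", "--sbed=peach.bed",
        "--nointra", "--minsize=30", "--simple"])
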
Example #34
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh",
                 action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid",
                 default=0,
                 type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore",
                 default=90,
                 type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    fp = open(ploidy)
    genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))
    fp.close()

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    fp = open(cscorefile)
    fw = open(weightsfile, "w")
    npairs = 0
    for row in fp:
        a, b, c, pct = row.split()
        c, pct = float(c), float(pct)
        c = int(c * 100)
        if (a, b) not in pairs:
            if norbh:
                continue
            if c < cs:
                continue
            if pct < pctid:
                continue
            c /= 10  # This severely penalizes RBH against synteny

        print >> fw, "\t".join((a, b, str(c)))
        npairs += 1
    fw.close()

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Example #35
0
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale
        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            raise ValueError(
                "`{0}` is on `{1}` with no number to extract".format(saccn, sseqid)
            )
        bedline = "\t".join(
            str(x)
            for x in (qseqid, qstart - 1, qend, "{0}:{1}".format(newsseqid, sstart))
        )
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
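
A minimal call, assuming p.set_outfile() exposes the usual --outfile option
(file names are placeholders):

bed(["grape.peach.anchors",
     "--qbed=grape.bed", "--sbed=peach.bed",
     "--outfile=grape.peach.map.bed"])
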
Example #36
0
def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line, e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with a span smaller than this.
    5. Option --minsize: remove blocks with fewer anchors than this.
    """
    p = OptionParser(screen.__doc__)
    p.set_beds()

    p.add_option("--ids",
                 help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra",
                 action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan",
                 default=0,
                 type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize",
                 default=0,
                 type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option(
        "--simple",
        action="store_true",
        help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    idsfile = opts.ids
    seqidsfile = opts.seqids
    seqpairsfile = opts.seqpairs
    minspan = opts.minspan
    minsize = opts.minsize
    osimple = opts.simple
    nointra = opts.nointra
    ids, seqids, seqpairs = None, None, None

    if idsfile:
        ids = SetFile(idsfile, delimiter=',')
        ids = set(int(x) for x in ids)
    if seqidsfile:
        seqids = SetFile(seqidsfile, delimiter=',')
    if seqpairsfile:
        fp = open(seqpairsfile)
        seqpairs = set()
        for row in fp:
            a, b = row.strip().split(",")
            seqpairs.add((a, b))
            seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    fw = open(newanchorfile, "w")

    for i, block in enumerate(blocks):
        if ids and i not in ids:
            continue

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)
        aspan = max(ia) - min(ia) + 1
        bspan = max(ib) - min(ib) + 1
        aseqid = oa[0].seqid
        bseqid = ob[0].seqid

        if seqids:
            if (aseqid not in seqids) or (bseqid not in seqids):
                continue

        if seqpairs:
            if (aseqid, bseqid) not in seqpairs:
                continue

        if nointra and aseqid == bseqid:
            continue

        if minsize:
            if len(block) < minsize:
                continue

        if minspan:
            if aspan < minspan or bspan < minspan:
                continue

        selected += 1
        print >> fw, "###"
        for line in block:
            print >> fw, "\t".join(line)

    fw.close()

    if osimple:
        simple([newanchorfile, "--noheader", \
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".\
                  format(len(blocks), selected))
Example #37
0
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths of the two genomes under comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    for ib in blocks:
        q, s, t = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    xmax = opts.xmax or max(4, max(list(dsq.keys()) + list(dss.keys())))
    if opts.quota:
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
                    .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    vs, pattern = title.split('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
                                 color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)