Example #1
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is passed to OptionParser, so it is kept
    # verbatim. Outline of the work done here:
    #   1. map the first token of every line of the ploidy file to its line
    #      index and call geneinfo() for the query and subject BED files;
    #   2. run cscore() on the BLAST file, writing `<prefix>.cscore`;
    #   3. write `<prefix>.weights` with one "a<TAB>b<TAB>weight" line per
    #      retained pair, where weight starts as int(C-score * 100).
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # `with` guarantees the handle is released even if a line fails to parse.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    # From here on `ploidy` is the parsed DictFile, no longer the file name.
    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on every exit path; previously the reader on
    # the .cscore file was never closed at all.
    with open(cscorefile) as fp:
        with open(weightsfile, "w") as fw:
            for row in fp:
                a, b, c, pct = row.split()
                c, pct = float(c), float(pct)
                c = int(c * 100)
                if (a, b) not in pairs:
                    # Pairs missing from the anchors file must pass the
                    # --norbh / --cscore / --pctid filters to be kept.
                    if norbh:
                        continue
                    if c < cs:
                        continue
                    if pct < pctid:
                        continue
                    # NOTE(review): `c` is an int, so this is floor division
                    # on Python 2 (without `from __future__ import division`)
                    # but true division on Python 3 -- confirm before porting.
                    c /= 10  # This severely penalizes RBH against synteny

                # Same bytes as `print >> fw, ...`, but valid on Python 2 and 3.
                fw.write("\t".join((a, b, str(c))) + "\n")
                npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Example #2
0
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter as blast_filter

    # The docstring is handed to OptionParser, so its wording is left as-is.
    p = OptionParser(ortholog.__doc__)
    p.add_option("--full", action="store_true", default=False,
                 help="Run in full mode, including blocks and RBH")
    p.add_option("--cscore", type="float", default=0.7,
                 help="C-score cutoff [default: %default]")
    p.add_option("--dist", type="int", default=20,
                 help="Extent of flanking regions to search")
    p.add_option("--quota", help="Quota align parameter")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species_a, species_b = args
    bed_a = species_a + ".bed"
    bed_b = species_b + ".bed"
    cds_a = species_a + ".cds"
    cds_b = species_b + ".cds"
    quota = opts.quota
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)

    # Output files are named after the text before the first "." of each
    # FASTA name, in both a.b and b.a orientation.
    stem_a = cds_a.split(".")[0]
    stem_b = cds_b.split(".")[0]
    ab = stem_a + "." + stem_b
    ba = stem_b + "." + stem_a

    last_file = ab + ".last"
    if need_update((cds_a, cds_b), last_file):
        run_last([cds_b, cds_a])

    if species_a == species_b:
        # Self comparison: carry on with whatever the filter call returns.
        last_file = blast_filter(
            [last_file, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
        )

    filtered = last_file + ".filtered"
    if need_update(last_file, filtered):
        run_blastfilter([last_file, cscore_flag])

    anchors = ab + ".anchors"

    if not opts.full:
        # Default mode: a single scan() with --liftover, then an optional
        # quota screen when --quota was given.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            liftover_flag = "--liftover={0}".format(last_file)
            scan([filtered, anchors, dist_flag, liftover_flag])
        if quota:
            quota_flag = "--quota={0}".format(quota)
            run_quota([lifted, quota_flag, "--screen"])
        return

    # Full mode: anchors -> 1x1 quota screen -> liftover -> blocks -> orthologs.
    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag])

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((last_file, one_to_one), lifted):
        liftover([last_file, one_to_one, dist_flag])

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bed, blocks in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bed, lifted, "--iter=1", "-o", blocks])

    rbh = ab + ".rbh"
    if need_update(last_file, rbh):
        cscore([last_file, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        for blocks, ortho in ((blocks_ab, ortho_ab), (blocks_ba, ortho_ba)):
            make_ortholog(blocks, rbh, ortho)
Example #3
0
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.last import main as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter as blast_filter

    # The docstring is handed to OptionParser, so its wording is left as-is.
    p = OptionParser(ortholog.__doc__)
    p.add_option("--full", action="store_true", default=False,
                 help="Run in full mode, including blocks and RBH")
    p.add_option("--cscore", type="float", default=0.7,
                 help="C-score cutoff [default: %default]")
    p.add_option("--dist", type="int", default=20,
                 help="Extent of flanking regions to search")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species_a, species_b = args
    bed_a = species_a + ".bed"
    bed_b = species_b + ".bed"
    cds_a = species_a + ".cds"
    cds_b = species_b + ".cds"
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)

    # Output files are named after the text before the first "." of each
    # FASTA name, in both a.b and b.a orientation.
    stem_a = cds_a.split(".")[0]
    stem_b = cds_b.split(".")[0]
    ab = stem_a + "." + stem_b
    ba = stem_b + "." + stem_a

    last_file = ab + ".last"
    if need_update((cds_a, cds_b), last_file):
        # The LAST output path is passed explicitly via -o.
        run_last([cds_b, cds_a, "-o", last_file])

    if species_a == species_b:
        # Self comparison: carry on with whatever the filter call returns.
        last_file = blast_filter(
            [last_file, "--hitlen=0", "--pctid=98", "--inverse"]
        )

    filtered = last_file + ".filtered"
    if need_update(last_file, filtered):
        run_blastfilter([last_file, cscore_flag])

    anchors = ab + ".anchors"

    if not opts.full:
        # Default mode stops after a single scan() call with --liftover.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            liftover_flag = "--liftover={0}".format(last_file)
            scan([filtered, anchors, dist_flag, liftover_flag])
        return

    # Full mode: anchors -> 1x1 quota screen -> liftover -> blocks -> orthologs.
    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag])

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((last_file, one_to_one), lifted):
        liftover([last_file, one_to_one, dist_flag])

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bed, blocks in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bed, lifted, "--iter=1", "-o", blocks])

    rbh = ab + ".rbh"
    if need_update(last_file, rbh):
        cscore([last_file, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        for blocks, ortho in ((blocks_ab, ortho_ab), (blocks_ba, ortho_ba)):
            make_ortholog(blocks, rbh, ortho)
Example #4
0
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is passed to OptionParser, so it is kept
    # verbatim. Outline of the work done here:
    #   1. map the first token of every line of the ploidy file to its line
    #      index and call geneinfo() for the query and subject BED files;
    #   2. run cscore() on the BLAST file, writing `<prefix>.cscore`;
    #   3. write `<prefix>.weights` with one "a<TAB>b<TAB>weight" line per
    #      retained pair, where weight starts as int(C-score * 100).
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh",
                 action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid",
                 default=0,
                 type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore",
                 default=90,
                 type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # `with` guarantees the handle is released even if a line fails to parse.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    # From here on `ploidy` is the parsed DictFile, no longer the file name.
    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Both handles are closed on every exit path; previously the reader on
    # the .cscore file was never closed at all.
    with open(cscorefile) as fp:
        with open(weightsfile, "w") as fw:
            for row in fp:
                a, b, c, pct = row.split()
                c, pct = float(c), float(pct)
                c = int(c * 100)
                if (a, b) not in pairs:
                    # Pairs missing from the anchors file must pass the
                    # --norbh / --cscore / --pctid filters to be kept.
                    if norbh:
                        continue
                    if c < cs:
                        continue
                    if pct < pctid:
                        continue
                    # NOTE(review): `c` is an int, so this is floor division
                    # on Python 2 (without `from __future__ import division`)
                    # but true division on Python 3 -- confirm before porting.
                    c /= 10  # This severely penalizes RBH against synteny

                # Same bytes as `print >> fw, ...`, but valid on Python 2 and 3.
                fw.write("\t".join((a, b, str(c))) + "\n")
                npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Example #5
0
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter as blast_filter

    # The docstring is handed to OptionParser, so its wording is left as-is.
    p = OptionParser(ortholog.__doc__)
    p.add_option(
        "--dbtype",
        choices=("nucl", "prot"),
        default="nucl",
        help="Molecule type of subject database",
    )
    p.add_option(
        "--full",
        action="store_true",
        default=False,
        help="Run in full 1x1 mode, including blocks and RBH",
    )
    p.add_option("--cscore", type="float", default=0.7, help="C-score cutoff")
    p.add_option(
        "--dist", type="int", default=20, help="Extent of flanking regions to search"
    )
    p.add_option(
        "-n",
        "--min_size",
        dest="n",
        default=4,
        type="int",
        help="minimum number of anchors in a cluster",
    )
    p.add_option("--quota", help="Quota align parameter")
    p.add_option(
        "--no_strip_names",
        action="store_true",
        default=False,
        help="Do not strip alternative splicing (e.g. At5g06540.1 -> At5g06540)",
    )
    p.set_cpus()
    p.set_dotplot_opts()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species_a, species_b = args
    dbtype = opts.dbtype
    # Nucleotide runs read <species>.cds, protein runs read <species>.pep.
    if dbtype == "nucl":
        suffix = ".cds"
    else:
        suffix = ".pep"
    bed_a, fasta_a = species_a + ".bed", species_a + suffix
    bed_b, fasta_b = species_b + ".bed", species_b + suffix
    quota = opts.quota
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)
    minsize_flag = "--min_size={}".format(opts.n)
    cpus_flag = "--cpus={}".format(opts.cpus)
    # Appended to every call that accepts --no_strip_names; empty otherwise.
    strip_flags = ["--no_strip_names"] if opts.no_strip_names else []

    # Output files are named after the text before the first "." of each
    # FASTA name, in both a.b and b.a orientation.
    stem_a = fasta_a.split(".")[0]
    stem_b = fasta_b.split(".")[0]
    ab = stem_a + "." + stem_b
    ba = stem_b + "." + stem_a

    last_file = ab + ".last"
    if need_update((fasta_a, fasta_b), last_file):
        run_last([fasta_b, fasta_a, cpus_flag], dbtype)

    if species_a == species_b:
        # Self comparison: switch to the filtered `.P98L0.inverse` file,
        # regenerating it only when it is out of date.
        self_last = last_file + ".P98L0.inverse"
        if need_update(last_file, self_last):
            blast_filter(
                [last_file, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
            )
        last_file = self_last

    filtered = last_file + ".filtered"
    if need_update(last_file, filtered):
        run_blastfilter([last_file, cscore_flag] + strip_flags)

    anchors = ab + ".anchors"
    pdf = ab + ".pdf"

    if not opts.full:
        # Default mode: scan with liftover, optional quota screen, dot plot.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            scan_args = [
                filtered,
                anchors,
                minsize_flag,
                dist_flag,
                "--liftover={0}".format(last_file),
            ]
            scan(scan_args + strip_flags)
        if quota:
            run_quota([lifted, "--quota={0}".format(quota), "--screen"])
        if need_update(anchors, pdf):
            from jcvi.graphics.dotplot import dotplot_main

            # Forward the dot plot options that were set on this command.
            plot_args = [anchors]
            for switch in ("nostdpf", "nochpf", "skipempty"):
                if getattr(opts, switch):
                    plot_args.append("--" + switch)
            for name in ("genomenames", "theme"):
                value = getattr(opts, name)
                if value:
                    plot_args.extend(["--" + name, value])
            dotplot_main(plot_args)
        return

    # Full mode: anchors -> 1x1 quota screen -> liftover -> blocks -> orthologs.
    # Note that --min_size is only forwarded to scan() in the default mode.
    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag] + strip_flags)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((last_file, one_to_one), lifted):
        liftover([last_file, one_to_one, dist_flag] + strip_flags)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bed, blocks in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bed, lifted, "--iter=1", "-o", blocks])

    rbh = ab + ".rbh"
    if need_update(last_file, rbh):
        cscore([last_file, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        for blocks, ortho in ((blocks_ab, ortho_ab), (blocks_ba, ortho_ba)):
            make_ortholog(blocks, rbh, ortho)
Example #6
0
def ortholog(args):
    """
    %prog ortholog a.bed a.cds b.bed b.cds

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.last import main as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore

    # The docstring is handed to OptionParser, so its wording is left as-is.
    p = OptionParser(ortholog.__doc__)
    p.add_option("--cscore", type="float", default=0.99,
                 help="C-score cutoff [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    bed_a, fasta_a, bed_b, fasta_b = args
    cscore_flag = "--cscore={0}".format(opts.cscore)
    # The two BED files are forwarded to every step that takes --qbed/--sbed.
    bed_flags = ["--qbed=" + bed_a, "--sbed=" + bed_b]

    # Output files are named after the text before the first "." of each
    # FASTA name, in both a.b and b.a orientation.
    stem_a = fasta_a.split(".")[0]
    stem_b = fasta_b.split(".")[0]
    ab = stem_a + "." + stem_b
    ba = stem_b + "." + stem_a

    last_file = ab + ".last"
    if need_update((fasta_a, fasta_b), last_file):
        run_last([fasta_b, fasta_a, "-o", last_file])

    filtered = last_file + ".filtered"
    if need_update(last_file, filtered):
        run_blastfilter([last_file, cscore_flag, "--tandem_Nmax=10"] + bed_flags)

    anchors = ab + ".anchors"
    # NOTE(review): the freshness check targets `.lifted.anchors` while scan()
    # is asked to write `.anchors` -- confirm scan() also produces the lifted
    # file, otherwise this step would rerun on every invocation.
    if need_update(filtered, ab + ".lifted.anchors"):
        scan([filtered, anchors] + bed_flags)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"] + bed_flags)

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((last_file, one_to_one), lifted):
        liftover([last_file, one_to_one] + bed_flags)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bed, blocks in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bed, lifted, "--iter=1", "-o", blocks])

    rbh = ab + ".rbh"
    if need_update(last_file, rbh):
        cscore([last_file, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        for blocks, ortho in ((blocks_ab, ortho_ab), (blocks_ba, ortho_ba)):
            make_ortholog(blocks, rbh, ortho)
Example #7
0
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter as blast_filter

    # The docstring is handed to OptionParser, so its wording is left as-is.
    p = OptionParser(ortholog.__doc__)
    p.add_option("--dbtype", choices=("nucl", "prot"), default="nucl",
                 help="Molecule type of subject database")
    p.add_option("--full", action="store_true", default=False,
                 help="Run in full mode, including blocks and RBH")
    p.add_option("--cscore", type="float", default=0.7,
                 help="C-score cutoff [default: %default]")
    p.add_option("--dist", type="int", default=20,
                 help="Extent of flanking regions to search")
    p.add_option("--quota", help="Quota align parameter")
    p.add_option("--nostdpf", action="store_true", default=False,
                 help="Do not standardize contig names")
    p.add_option("--no_strip_names", action="store_true", default=False,
                 help="Do not strip alternative splicing "
                 "(e.g. At5g06540.1 -> At5g06540)")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species_a, species_b = args
    dbtype = opts.dbtype
    # Nucleotide runs read <species>.cds, protein runs read <species>.pep.
    if dbtype == "nucl":
        suffix = ".cds"
    else:
        suffix = ".pep"
    bed_a, fasta_a = species_a + ".bed", species_a + suffix
    bed_b, fasta_b = species_b + ".bed", species_b + suffix
    quota = opts.quota
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)
    # Appended to every call that accepts --no_strip_names; empty otherwise.
    strip_flags = ["--no_strip_names"] if opts.no_strip_names else []

    # Output files are named after the text before the first "." of each
    # FASTA name, in both a.b and b.a orientation.
    stem_a = fasta_a.split(".")[0]
    stem_b = fasta_b.split(".")[0]
    ab = stem_a + "." + stem_b
    ba = stem_b + "." + stem_a

    last_file = ab + ".last"
    if need_update((fasta_a, fasta_b), last_file):
        run_last([fasta_b, fasta_a], dbtype)

    if species_a == species_b:
        # Self comparison: switch to the filtered `.P98L0.inverse` file,
        # regenerating it only when it is out of date.
        self_last = last_file + ".P98L0.inverse"
        if need_update(last_file, self_last):
            blast_filter(
                [last_file, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
            )
        last_file = self_last

    filtered = last_file + ".filtered"
    if need_update(last_file, filtered):
        run_blastfilter([last_file, cscore_flag] + strip_flags)

    anchors = ab + ".anchors"
    pdf = ab + ".pdf"

    if not opts.full:
        # Default mode: scan with liftover, optional quota screen, dot plot.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            liftover_flag = "--liftover={0}".format(last_file)
            scan([filtered, anchors, dist_flag, liftover_flag] + strip_flags)
        if quota:
            run_quota([lifted, "--quota={0}".format(quota), "--screen"])
        if need_update(anchors, pdf):
            from jcvi.graphics.dotplot import dotplot_main

            plot_args = [anchors]
            # --nostdpf on this command also turns on --skipempty for the plot.
            if opts.nostdpf:
                plot_args.extend(["--nostdpf", "--skipempty"])
            dotplot_main(plot_args)
        return

    # Full mode: anchors -> 1x1 quota screen -> liftover -> blocks -> orthologs.
    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag] + strip_flags)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((last_file, one_to_one), lifted):
        liftover([last_file, one_to_one, dist_flag] + strip_flags)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bed, blocks in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bed, lifted, "--iter=1", "-o", blocks])

    rbh = ab + ".rbh"
    if need_update(last_file, rbh):
        cscore([last_file, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        for blocks, ortho in ((blocks_ab, ortho_ab), (blocks_ba, ortho_ba)):
            make_ortholog(blocks, rbh, ortho)