Exemple #1
0
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but need some refinement.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output and kept verbatim.
    #
    # Reformat an AUGUSTUS GFF3: keep only gene/transcript/CDS features,
    # rename "transcript" to "mRNA", prefix IDs and Parents with the seqid,
    # and de-duplicate repeated IDs with a numeric suffix.
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the input GFF3) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    # How many times each prefixed ID has already been written.
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        # Prefix IDs with the sequence name.
        prefix = g.seqid + "_"
        pid = prefix + g.id
        # First occurrence keeps the plain ID; later ones become pid-1,
        # pid-2, ...  The membership test runs before the counter is touched
        # so the defaultdict does not auto-create the key.
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        # NOTE(review): features without a Parent rely on the attributes
        # container returning an empty list for a missing key - confirm.
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        # print() function form, matching the rest of the file; the old
        # `print >> fw, g` statement fails under print-function semantics.
        print(g, file=fw)
    fw.close()
Exemple #2
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output and kept verbatim.
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the GFF file) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    # The file is read twice: once through make_index() for the CDS extent
    # lookups, and once sequentially through Gff() to emit features in order.
    g = make_index(gffile)
    gff = Gff(gffile)
    # mRNA accession -> (start, end) computed for that mRNA, reused for its
    # non-CDS children.
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            # presumably the min start / max end over the gene's CDS - the
            # helper is defined elsewhere; verify.
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            # Any other non-CDS feature is trimmed to its parent mRNA's
            # extent; this requires the mRNA to appear earlier in the file.
            start, end = mRNA_register[c.parent]
            trim(c, start, end)
        # A feature whose start ends up past its end after trimming is
        # reported on stderr and dropped from the output.
        if c.start > c.end:
            print(cid, "destroyed [{0} > {1}]".format(c.start, c.end),
                  file=sys.stderr)
        else:
            print(c, file=fw)
    # The original never closed the output handle; close it like the sibling
    # commands do so buffered output is flushed deterministically.
    fw.close()
Exemple #3
0
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output and kept verbatim.
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    # Two positional arguments are required: the assemblies FASTA and GFF3.
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    # Step 1: run the PASA script unless its outputs are already current
    # (need_update is defined elsewhere; presumably a timestamp check).
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(
            opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    # Step 2: collect the first column of every line mentioning "complete".
    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    # Step 3: copy only the features of complete assemblies to a new GFF3.
    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    # The context manager closes the output even if parsing or writing fails.
    with open(completegff, "w") as fw:
        gff = Gff(transcodergenomegff)
        for g in gff:
            a = g.attributes
            # Use the Parent when present, otherwise the feature's own ID.
            # (Renamed from `id`, which shadowed the builtin.)
            if "Parent" in a:
                model_id = a["Parent"][0]
            else:
                model_id = a["ID"][0]
            # The assembly ID is the part before the first "|".
            asmbl_id = model_id.split("|")[0]
            if asmbl_id not in complete:
                continue
            # print() function form, matching the rest of the file.
            print(g, file=fw)
            if g.type == "gene":
                seen.add(model_id)

    logging.debug("A total of {0} complete models extracted to `{1}`.".\
                    format(len(seen), completegff))
Exemple #4
0
def yeasttruth(args):
    """
    %prog yeasttruth Pillars.tab *.gff

    Prepare pairs data for 14 yeasts.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output and kept verbatim.
    p = OptionParser(yeasttruth.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Needs the pillars table plus at least one GFF file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    pillars = args[0]
    gffiles = args[1:]
    # alias -> gene accession, over all GFF files.
    aliases = {}
    # Same mapping, restricted to files named Saccharomyces_cerevisiae*.
    pivot = {}
    for gffile in gffiles:
        is_pivot = op.basename(gffile).startswith("Saccharomyces_cerevisiae")
        gff = Gff(gffile)
        for g in gff:
            if g.type != "gene":
                continue
            for a in g.attributes["Alias"]:
                aliases[a] = g.accn
                if is_pivot:
                    pivot[a] = g.accn
    logging.debug("Aliases imported: {0}".format(len(aliases)))
    logging.debug("Pivot imported: {0}".format(len(pivot)))

    # Dump the alias table as a side product.
    with open("yeast.aliases", "w") as fw:
        for k, v in sorted(aliases.items()):
            print("\t".join((k, v)), file=fw)

    # For each pillars row, pair every pivot gene with every other known gene
    # on that row.  "---" marks an empty column.
    pairs = set()
    # The original never closed this handle; `with` closes it.
    with open(pillars) as fp:
        for row in fp:
            atoms = [x for x in row.split() if x != "---"]
            pps = [pivot[x] for x in atoms if x in pivot]
            atoms = [aliases[x] for x in atoms if x in aliases]
            # Loop variable renamed from `p`, which rebound the OptionParser.
            for pivot_gene in pps:
                for a in atoms:
                    if pivot_gene == a:
                        continue
                    # Sort within the pair so (a, b) and (b, a) collapse.
                    pairs.add(tuple(sorted((pivot_gene, a))))

    fw = must_open(opts.outfile, "w")
    for a, b in sorted(pairs):
        print("\t".join((a, b)), file=fw)
    fw.close()
Exemple #5
0
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """

    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is reproduced verbatim.
    parser = OptionParser(uniq.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    # Two positional arguments are required: the GFF file and the CDS FASTA.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    gffile, cdsfasta = args
    records = Gff(gffile)
    cds_sizes = Sizes(cdsfasta).mapping

    # Score each gene through its mRNA record: (1 - AED) * size of the mRNA's
    # entry in the FASTA.  A later mRNA of the same parent overwrites an
    # earlier one.
    scores = {}
    for rec in records:
        if rec.type == "mRNA":
            aed = float(rec.attributes["_AED"][0])
            scores[rec.parent] = (1 - aed) * cds_sizes[rec.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    # From every pile keep the members chosen by range_chain().
    bestids = set()
    for pile in piles:
        ranges = []
        for member in pile:
            ranges.append(
                to_range(member, score=scores[member.accn], id=member.accn)
            )
        chain, _ = range_chain(ranges)
        bestids.update(x.id for x in chain)

    # Everything not selected is listed in removed.ids, one accession per line.
    removed = {x.accn for x in allgenes} - bestids
    fw = open("removed.ids", "w")
    fw.write("\n".join(sorted(removed)) + "\n")
    fw.close()
    populate_children(opts.outfile, bestids, gffile, "gene")
Exemple #6
0
def get_cds_beds(gffile, noUTR=False):
    """Collect the mRNA and CDS bed lines from a GFF file.

    Args:
        gffile: path of the GFF file to read.
        noUTR: when true, shrink the mRNA bed line to span from the smallest
            CDS start to the largest CDS end.

    Returns:
        A ``(mrnabed, cdsbeds)`` tuple.  ``mrnabed`` is the bed line of the
        last mRNA feature in the file (``None`` if there is none) and
        ``cdsbeds`` lists the bed lines of all CDS features in file order.
    """
    from jcvi.formats.gff import Gff

    mrnabed = None
    cdsbeds = []
    gf = Gff(gffile)
    for g in gf:
        if g.type == "mRNA":
            # Each mRNA replaces the previous one; only the last is kept.
            mrnabed = g.bedline
        elif g.type == "CDS":
            cdsbeds.append(g.bedline)

    # Only trim when there is both an mRNA and at least one CDS.  Previously a
    # file without an mRNA raised AttributeError on None and a file without
    # CDS raised ValueError from min()/max() on an empty sequence.
    if noUTR and mrnabed is not None and cdsbeds:
        mrnabed.start = min(x.start for x in cdsbeds)
        mrnabed.end = max(x.end for x in cdsbeds)

    return mrnabed, cdsbeds
Exemple #7
0
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but need some refinement.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is user-facing output and kept verbatim.
    #
    # Filter an AUGUSTUS GFF3 down to gene/transcript/CDS features, rename
    # "transcript" to "mRNA", and write the result to standard output.
    p = OptionParser(augustus.__doc__)
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the input GFF3) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        # print() function form, matching the rest of the file; the bare
        # `print g` statement is a SyntaxError under Python 3.
        print(g)