Example #1
0
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset within each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # maintainer notes are kept in comments to leave the CLI help focused.
    #
    # args: list of command-line tokens; exactly two positionals are required
    #       (the GFF file and the CDS FASTA file).
    # Side effects: writes the IDs that were dropped to "removed.ids" in the
    # current directory, then passes the retained IDs, the GFF path and
    # opts.outfile to populate_children().

    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score table keyed by each mRNA's parent ID. If several mRNA records
    # share a parent, the last one read overwrites the earlier scores.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)

    # For every pile, keep the IDs of the chain selected by range_chain().
    bestids = set()
    for pile in get_piles(allgenes):
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn) for x in pile
        ]
        selected_chain, _ = range_chain(ranges)
        bestids.update(x.id for x in selected_chain)

    removed = {x.accn for x in allgenes} - bestids
    # Context manager guarantees the handle is closed even if the write fails.
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)
    populate_children(opts.outfile, bestids, gffile, "gene")
Example #2
0
File: qc.py Project: arvin580/jcvi
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset within each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # maintainer notes are kept in comments to leave the CLI help focused.
    #
    # args: list of command-line tokens; exactly two positionals are required
    #       (the GFF file and the CDS FASTA file).
    # Side effects: writes the IDs that were dropped to "removed.ids" in the
    # current directory, then passes the retained IDs, the GFF path and
    # opts.outfile to populate_children().

    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score table keyed by each mRNA's parent ID. If several mRNA records
    # share a parent, the last one read overwrites the earlier scores.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)

    # For every pile, keep the IDs of the chain selected by range_chain().
    bestids = set()
    for pile in get_piles(allgenes):
        ranges = [to_range(x, score=gene_register[x.accn], id=x.accn)
                  for x in pile]
        selected_chain, _ = range_chain(ranges)
        bestids.update(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    # file.write() with an explicit newline emits exactly what the old
    # `print >> fw, ...` statement did, but runs on Python 2 and Python 3;
    # the context manager closes the handle even if the write fails.
    with open("removed.ids", "w") as fw:
        fw.write("\n".join(sorted(removed)) + "\n")
    populate_children(opts.outfile, bestids, gffile, "gene")
Example #3
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # maintainer notes are kept in comments to leave the CLI help unchanged.
    #
    # args: list of command-line tokens; exactly one positional (the GFF file)
    #       is required. Optional inputs: --trim5 / --trim3 ID lists,
    #       --trimrange (tab-separated "id<TAB>start<TAB>stop" lines) and
    #       --refgff. Records are emitted through fprint() to opts.outfile.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        # Trailing space matters: the two literals are joined verbatim.
        help="File containing gene list for UTR trim back "
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    # With neither list supplied, the helpers are called with both=True.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    trimrange = {}
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for line in trf:
                fields = line.split("\t")
                # Kept as an assert so callers see the same exception type.
                assert len(fields) == 3, "Must specify (start, stop) coordinate range"
                feat_id, lo, hi = fields
                trimrange[feat_id] = (int(lo), int(hi))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(
            featuretype="gene", order_by=("seqid", "start"), level=1
        ):
            for c in feat:
                cid, ctype, cparent = (
                    c.id,
                    c.featuretype,
                    c.attributes.get("Parent", [None])[0],
                )

                if ctype == "gene":
                    start, end = get_cds_minmax(gff, cid)
                    trim(
                        c,
                        start,
                        end,
                        trim5=cid in trim5,
                        trim3=cid in trim3,
                        both=trim_both,
                    )
                    fprint(c, fw)
                    continue
                if ctype != "mRNA":
                    # Other feature types are emitted via their mRNA below.
                    continue

                utr_types, extras = [], set()
                # An mRNA inherits the trim flag from its parent; its own ID
                # is then recorded in the corresponding set.
                t5 = cid in trim5 or cparent in trim5
                if t5:
                    trim5.add(cid)
                t3 = cid in trim3 or cparent in trim3
                if t3:
                    trim3.add(cid)

                refc = None
                if refgff:
                    # NOTE(review): refc stays set when the CDS comparison
                    # fails or when the parent lookup raises, which skips the
                    # trim() call on this mRNA further down — confirm that
                    # this is intended.
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes["Parent"][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                # Collect the reference UTRs and the reference
                                # exons in each UTR region that belong to cid.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                        refc, featuretype=utr_type
                                    ):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                            region=utr, featuretype="exon"
                                        ):
                                            if exon.attributes["Parent"][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                # Computed once per mRNA instead of once per child exon.
                extra_exon_ranges = [
                    to_range(x) for x in extras if x.featuretype == "exon"
                ]
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Not emitted; the copies in `extras` are written instead.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        cc_range = to_range(cc)
                        if any(range_overlap(cc_range, r) for r in extra_exon_ranges):
                            # Overlaps an exon taken from the reference.
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    finally:
        fw.close()
Example #4
0
File: qc.py Project: arvin580/jcvi
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # maintainer notes are kept in comments to leave the CLI help unchanged.
    #
    # args: list of command-line tokens; exactly one positional (the GFF file)
    #       is required. Optional inputs: --trim5 / --trim3 ID lists,
    #       --trimrange (tab-separated "id<TAB>start<TAB>stop" lines) and
    #       --refgff. Records are emitted through fprint() to opts.outfile.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    # Trailing space matters: the two literals are joined verbatim.
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back "
                      "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # With neither list supplied, the helpers are called with both=True.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for line in trf:
                fields = line.split("\t")
                # Kept as an assert so callers see the same exception type.
                assert len(fields) == 3, \
                    "Must specify (start, stop) coordinate range"
                feat_id, lo, hi = fields
                trimrange[feat_id] = (int(lo), int(hi))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(featuretype="gene",
                                              order_by=("seqid", "start"),
                                              level=1):
            for c in feat:
                cid, ctype = c.id, c.featuretype
                cparent = c.attributes.get('Parent', [None])[0]

                if ctype == "gene":
                    start, end = get_cds_minmax(gff, cid)
                    trim(c, start, end, trim5=cid in trim5,
                         trim3=cid in trim3, both=trim_both)
                    fprint(c, fw)
                    continue
                if ctype != "mRNA":
                    # Other feature types are emitted via their mRNA below.
                    continue

                utr_types, extras = [], set()
                # An mRNA inherits the trim flag from its parent; its own ID
                # is then recorded in the corresponding set.
                t5 = cid in trim5 or cparent in trim5
                if t5:
                    trim5.add(cid)
                t3 = cid in trim3 or cparent in trim3
                if t3:
                    trim3.add(cid)

                refc = None
                if refgff:
                    # NOTE(review): refc stays set when the CDS comparison
                    # fails or when the parent lookup raises, which skips the
                    # trim() call on this mRNA further down — confirm that
                    # this is intended.
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append('five_prime_UTR')
                                if t3:
                                    utr_types.append('three_prime_UTR')
                                # Collect the reference UTRs and the reference
                                # exons in each UTR region that belong to cid.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                            refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                                region=utr,
                                                featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                # Computed once per mRNA instead of once per child exon.
                extra_exon_ranges = [to_range(x) for x in extras
                                     if x.featuretype == 'exon']
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Not emitted; the copies in `extras` are written instead.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        cc_range = to_range(cc)
                        if any(range_overlap(cc_range, r)
                               for r in extra_exon_ranges):
                            # Overlaps an exon taken from the reference.
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    finally:
        fw.close()