Example #1
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x)
                    for x in gaccn]
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
Example #2
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
Example #3
0
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)

            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start,
                                      retval="rank"), atg_name(end,
                                                               retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()
Example #4
0
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option("--extended_stride", default=False, action="store_true",
                 help="Toggle extended strides for gene numbering")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)

            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table, extended_stride=opts.extended_stride)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()