Beispiel #1
0
def annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits):
    """
    Annotate the features of one chromosome with their scored matches.

    For each feature in `chrbed`:

    * if its accession is not in the grouper `g`, or `opts.atg_name` is set
      and `chr` does not contain "chr", the feature is stored in `abedline`
      under its accession and nothing else happens;
    * otherwise the members of its group that match the module-level
      `new_id_pat` are collected; when `scores` has hits for the accession
      they are re-sorted in place, logged to stderr, and either forwarded to
      the group key in `scores` (more than one new id) or merged into the
      feature's accession;
    * finally the group key is added to `splits` (more than one new id) or
      the feature is stored in `abedline` under its possibly rewritten
      accession.

    `nbedline` is accepted for interface compatibility but is not used.
    `scores`, `abedline`, `splits` and the features themselves are mutated
    in place. Returns the tuple `(abedline, splits)`.
    """
    current_chr = number(chr)

    for feature in chrbed:
        accn = feature.accn
        if accn not in g or (opts.atg_name and "chr" not in chr):
            abedline[accn] = feature
            continue

        # Group members carrying a "new" identifier, in sorted order.
        fresh = sorted(a for a in g[accn] if re.search(new_id_pat, a))
        newgrp = ";".join(fresh)
        is_split = len(fresh) > 1

        if accn in scores:
            # Two stable passes: order by column 1 first, then by column 3
            # (as float) descending, so ties on the score keep column-1 order.
            ranked = sorted(scores[accn], key=lambda hit: hit[1])
            ranked.sort(key=lambda hit: float(hit[3]), reverse=True)
            scores[accn] = ranked

            merged = []
            print >> sys.stderr, accn
            for hit in scores[accn]:
                print >> sys.stderr, "\t" + ", ".join(str(field)
                        for field in hit[1:])
                if opts.atg_name:
                    hit_chr, hit_rank = atg_name(hit[1])
                    # Only keep hits that parse and sit on this chromosome.
                    if not (hit_chr and hit_chr == current_chr):
                        continue

                merged.append(hit[1])
                if is_split:
                    scores.setdefault(newgrp, []).append(hit)
                else:
                    merged.insert(0, accn)
                    feature.accn = ";".join(str(name) for name in merged)
                # Stop after the first accepted hit when several hits exist.
                if len(scores[accn]) > 1:
                    break

        if is_split:
            splits.add(newgrp)
        else:
            abedline[feature.accn] = feature

    return abedline, splits
Beispiel #2
0
def atg_name(name, retval="chr,rank", trimpad0=True):
    """
    Parse an identifier shaped like <prefix><chr><sep><rank>[.<iso>].

    `retval` is a comma-separated list of regex group names to return; the
    groups defined by the pattern are `locus`, `prefix`, `chr`, `sep`,
    `rank` and `iso`. When `trimpad0` is true, the `chr` and `rank` groups
    are passed through `number()` before being returned; all other groups
    are returned as matched strings (or None for an absent optional group).

    Returns a generator yielding one value per requested group, so callers
    can unpack it (`achr, arank = atg_name(...)`). If `name` is None, does
    not match the pattern, or its separator (case-insensitive) is not one of
    "g", "te", "trna", "s", the generator yields None for every requested
    group. Previously the latter two cases returned a bare None, which made
    tuple-unpacking callers raise TypeError.
    """
    atg_name_pat = re.compile(r"""
            ^(?P<locus>
                (?P<prefix>\D+)(?P<chr>[\d+CM])(?P<sep>\D+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    seps = ["g", "te", "trna", "s"]
    pad0s = ["chr", "rank"]
    groups = retval.split(",")

    m = atg_name_pat.match(name) if name is not None else None
    if m is None or m.group('sep').lower() not in seps:
        # Unparseable input: same shape as a successful parse, all None.
        return (None for _ in groups)

    # Build eagerly so an unknown group name fails at call time, not later
    # when the generator is consumed.
    retvals = []
    for grp in groups:
        val = m.group(grp)
        if trimpad0 and grp in pad0s:
            val = number(val)
        retvals.append(val)

    return (x for x in retvals)
Beispiel #3
0
    def allocate(self, info, chr, start_id, end_id, id_table):
        """
        Choose an id for every entry of `info` between two flanking ids.

        `info` is a non-empty sequence of features exposing `.start`, `.end`
        and `.accn`; `start_id` / `end_id` are the numeric ids flanking the
        region (exclusive) and must satisfy `end_id > start_id`.

        Steps, in order:
          1. list the ids strictly between the flanks that are not already
             in `self.black` for this chromosome;
          2. intersect the region with `self.gapfile` through an
             `intersectBed` shell pipeline and repeat each gap once per
             10 Kb of its score column, so gaps reserve ids too;
          3. let `Stride` pick a spacing over the available ids; if it finds
             none (`conf is None`), fall back to prefixed ids counted up from
             `start_id + 400000` in steps of 10, skipping blacklisted ones;
          4. record `id_table[accn] = name` for every entry (features and gap
             placeholders alike) and blacklist the ids that were used.

        Progress is logged to stderr. Mutates `id_table` and `self.black`;
        returns None.
        """
        region_start = info[0].start
        region_end = info[-1].end

        current_chr = number(chr)
        assert end_id > start_id, \
            "end ({0}) > start ({1})".format(end_id, start_id)

        spots = end_id - start_id - 1
        open_ranks = [cand for cand in xrange(start_id + 1, end_id)
                      if (current_chr, cand) not in self.black]

        log = ["chr{0} need {1} ids, has {2} spots ({3} available)".format(
            current_chr, len(info), spots, len(open_ranks))]

        left_gene = gene_name(current_chr, start_id)
        right_gene = gene_name(current_chr, end_id)
        log.append(" between {0} - {1}\n".format(left_gene, right_gene))

        assert region_end > region_start

        # Query the gap file for gaps overlapping the region (0-based start).
        region = "\t".join(
            str(field) for field in (chr, region_start - 1, region_end))
        cmd = "echo '{0}' |".format(region) + \
            " intersectBed -a {0} -b stdin".format(self.gapfile)
        gaps = [BedLine(row) for row in popen(cmd, debug=False)]
        ngaps = len(gaps)

        GeneDensity = 10000.  # assume 10Kb per gene
        placeholders = []
        for gap in gaps:
            reserved = int(round(int(gap.score) / GeneDensity))
            placeholders.extend([gap] * reserved)

        needed = sorted(info + placeholders, key=lambda entry: entry.start)

        log.append(
            "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".format(
                region_start, region_end, ngaps, len(needed)))

        stride = Stride(needed, open_ranks)
        conf = stride.conf
        log.append(" stride: {0}".format(conf))
        message = "".join(log)
        print >> sys.stderr, message

        nneeded = len(needed)
        if conf is not None:  # follow the best stride
            if start_id == 0:  # follow right flank at start of chr
                chosen = stride.available[- nneeded:]
            else:  # follow left flank otherwise
                chosen = stride.available[:nneeded]
        else:  # prefix rule - prepend version number for spills
            magic = 400000  # version 4
            firstdigit = 100000
            step = 10  # stride for the prefixed ids
            rank = start_id + magic
            if rank > magic + firstdigit:
                rank -= firstdigit
            chosen = []
            while len(chosen) != nneeded:
                rank += step
                if (current_chr, rank) not in self.black:  # skip blacklisted
                    chosen.append(rank)

        # Finally assign the ids
        assert len(needed) == len(chosen)
        for entry, rank in zip(needed, chosen):
            name = gene_name(current_chr, rank)
            print >> sys.stderr, "\t".join((str(entry), name))
            id_table[entry.accn] = name
            self.black.add((current_chr, rank))
        print >> sys.stderr
Beispiel #4
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--prefix", default="Medtr",
                 help="Genome prefix [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        if "chr" not in chr:
            continue

        current_chr = number(chr)
        ranks = []

        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        granks = set(gene_name(current_chr, x) for x in lranks) | \
                 set(gene_name(current_chr, x, sep="te") for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))