def annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits):
    """
    Fill `abedline` and `splits` for the BED lines of one chromosome.

    chr      -- sequence name; converted with `number` and tested for the
                substring "chr"
    chrbed   -- iterable of BED lines exposing a writable `.accn`
    g        -- grouping of accessions; supports `in` and `g[accn]`
    scores   -- dict mapping accession -> list of score records, where
                record[1] is the hit accession and record[3] a numeric score
    nbedline -- accepted for signature compatibility; not used here
    abedline -- dict updated in place (accession -> BED line)
    opts     -- options object; only `opts.atg_name` is read
    splits   -- set updated in place with ";"-joined groups of new ids

    Returns the same `abedline` and `splits` objects as a tuple.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line -- confirm against version control before relying on it.
    current_chr = number(chr)

    for line in chrbed:
        accn = line.accn

        # Ungrouped accessions, and everything on non-"chr" sequences when
        # ATG-style naming is requested, pass through unchanged.
        if accn not in g or (opts.atg_name and "chr" not in chr):
            abedline[accn] = line
            continue

        novel = [member for member in g[accn]
                 if re.search(new_id_pat, member)]
        novel_key = ";".join(sorted(novel))
        is_split = len(novel) > 1

        if accn in scores:
            # Two stable sorts: best score first, ties ordered by hit name.
            ranked = sorted(scores[accn], key=lambda rec: rec[1])
            ranked = sorted(ranked, key=lambda rec: float(rec[3]),
                            reverse=True)
            scores[accn] = ranked

            kept = []
            print >> sys.stderr, accn
            for hit in ranked:
                print >> sys.stderr, \
                    "\t" + ", ".join(str(field) for field in hit[1:])

                if opts.atg_name:
                    # Only hits that parse and sit on this chromosome count.
                    hit_chr, _ = atg_name(hit[1])
                    if not hit_chr or hit_chr != current_chr:
                        continue

                kept.append(hit[1])
                if is_split:
                    # Re-file the hit under the combined group of new ids.
                    scores.setdefault(novel_key, []).append(hit)
                else:
                    kept.insert(0, accn)
                    line.accn = ";".join(str(item) for item in kept)

                # With several candidates only the first accepted one is used.
                if len(ranked) > 1:
                    break

        if is_split:
            splits.add(novel_key)
        else:
            abedline[line.accn] = line

    return abedline, splits
def atg_name(name, retval="chr,rank", trimpad0=True):
    """
    Parse an ATG-style gene identifier and return the requested fields.

    name     -- identifier such as "Medtr1g000010" or "AT1G01010.2"; may be
                None
    retval   -- comma-separated list of regex group names to return, chosen
                from: locus, prefix, chr, sep, rank, iso
    trimpad0 -- when true, the "chr" and "rank" fields are passed through the
                module-level `number` helper instead of being returned as the
                raw matched text

    Returns an iterator with one value per requested field, suitable for
    tuple unpacking. If `name` is None, does not match the pattern, or its
    separator is not one of g/te/trna/s (case-insensitive), every value is
    None -- the function never returns a bare None.
    """
    # NOTE(review): `[\d+CM]` is a character class matching ONE character
    # (a digit, "+", "C" or "M"), so multi-digit chromosomes do not parse;
    # confirm whether that is intended.
    atg_name_pat = re.compile(r"""
            ^(?P<locus>
                (?P<prefix>\D+)(?P<chr>[\d+CM])(?P<sep>\D+)(?P<rank>\d+)
            )
            \.?(?P<iso>\d+)?
            """, re.VERBOSE)

    seps = ["g", "te", "trna", "s"]
    pad0s = ["chr", "rank"]
    fields = retval.split(",")

    if name is not None:
        m = atg_name_pat.match(name)
        if m is not None and m.group('sep').lower() in seps:
            # Evaluated eagerly so a bad group name fails at call time.
            retvals = [
                number(m.group(grp)) if trimpad0 and grp in pad0s
                else m.group(grp)
                for grp in fields
            ]
            return (x for x in retvals)

    # Single fallback for every failure path, so callers can always unpack.
    return (None for x in fields)
def allocate(self, info, chr, start_id, end_id, id_table):
    """
    Choose an id for every entry between two anchor ids and record it.

    info     -- list of BED-like entries (`.start`, `.end`, `.accn`), used
                in the order given for the first/last coordinates
    chr      -- sequence name; converted with `number` and used verbatim in
                the interval sent to intersectBed
    start_id -- anchor id on the left (exclusive)
    end_id   -- anchor id on the right (exclusive); must exceed start_id
    id_table -- dict updated in place: entry accession -> assigned gene name

    Gaps overlapping the interval (from `self.gapfile`) are expanded into
    placeholder entries that also consume ids and also receive `id_table`
    entries. Every assigned (chromosome, id) pair is added to `self.black`.
    Progress is logged to stderr. Raises AssertionError when the anchors or
    coordinates are not increasing, or when the number of ids picked does
    not match the number of entries.
    """
    start_bp = info[0].start
    end_bp = info[-1].end
    current_chr = number(chr)

    assert end_id > start_id, \
        "end ({0}) > start ({1})".format(end_id, start_id)

    # Ids strictly between the anchors that are not blacklisted yet.
    spots = end_id - start_id - 1
    free_ids = [rank for rank in xrange(start_id + 1, end_id)
                if (current_chr, rank) not in self.black]

    report = ["chr{0} need {1} ids, has {2} spots ({3} available)".format(
        current_chr, len(info), spots, len(free_ids))]

    start_gene = gene_name(current_chr, start_id)
    end_gene = gene_name(current_chr, end_id)
    report.append(" between {0} - {1}\n".format(start_gene, end_gene))

    assert end_bp > start_bp

    # Ask intersectBed which gaps overlap the interval covered by `info`.
    interval = "\t".join(str(x) for x in (chr, start_bp - 1, end_bp))
    cmd = "echo '{0}' |".format(interval) + \
          " intersectBed -a {0} -b stdin".format(self.gapfile)
    gaps = [BedLine(row) for row in popen(cmd, debug=False)]
    ngaps = len(gaps)

    # Each gap reserves ids in proportion to its score.
    GeneDensity = 10000.  # assume 10Kb per gene
    placeholders = []
    for gap in gaps:
        reserved = int(round(int(gap.score) / GeneDensity))
        placeholders.extend([gap] * reserved)

    needed = sorted(info + placeholders, key=lambda entry: entry.start)
    report.append(
        "between bp: {0} - {1}, there are {2} gaps (total {3} ids)".format(
            start_bp, end_bp, ngaps, len(needed)))

    stride = Stride(needed, free_ids)
    conf = stride.conf
    report.append(" stride: {0}".format(conf))
    print >> sys.stderr, "".join(report)

    nneeded = len(needed)
    if conf is None:
        # prefix rule - prepend version number for spills
        magic = 400000  # version 4
        firstdigit = 100000
        step = 10  # stride for the prefixed ids
        rank = start_id + magic
        if rank > magic + firstdigit:
            rank -= firstdigit
        chosen = []
        while len(chosen) != nneeded:
            rank += step
            if (current_chr, rank) not in self.black:  # avoid blacklisted ids
                chosen.append(rank)
    elif start_id == 0:
        # best stride; follow right flank at start of chr
        chosen = stride.available[-nneeded:]
    else:
        # best stride; follow left flank otherwise
        chosen = stride.available[:nneeded]

    # Finally assign the ids
    assert len(needed) == len(chosen)
    for entry, rank in zip(needed, chosen):
        name = gene_name(current_chr, rank)
        print >> sys.stderr, "\t".join((str(entry), name))
        id_table[entry.accn] = name
        self.black.add((current_chr, rank))
    print >> sys.stderr
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # The docstring above doubles as the OptionParser usage text, so keep
    # implementation notes in comments rather than in the docstring.
    #
    # Reads <prefix>.a.bed and <prefix>.b.bed (regenerated via prepare() when
    # need_update() says so) and writes every line of the .a.bed on a "chr"
    # sequence to stdout with an extra "accn|TAG" column:
    #   FRAME   - accession equals a gene name built from a rank on the
    #             longest increasing subsequence of ranks
    #   RETAIN  - accession parses onto this chromosome but is off that
    #             subsequence
    #   NEW     - anything else (accession reported as ".")
    #   OVERLAP - another member of the same group ranks better in PRIORITY
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--prefix", default="Medtr",
                 help="Genome prefix [default: %default]")
    # NOTE(review): --pad0 and --prefix are parsed but `opts` is never read
    # below; gene_name() is called with its own defaults. Confirm whether
    # they should be forwarded.
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Group accessions that share a ";"-joined name in the .b.bed file.
    g = Grouper()
    for s in Bed(bbedfile):
        g.join(*s.accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        if "chr" not in chr:
            continue

        current_chr = number(chr)

        # Ranks (in file order) of accessions named after this chromosome.
        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
            "==>", len(lranks)

        # Names on the increasing backbone, under both separators.
        granks = set(gene_name(current_chr, x) for x in lranks) | \
            set(gene_name(current_chr, x, sep="te") for x in lranks)

        # The accession was already parsed in the loop above; membership in
        # granks / gg is all that is needed here.
        tagstore = {}
        for s in sbed:
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)
            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            # Members without a tag on this chromosome count as NEW.
            group = [
                (PRIORITY.index(tagstore[x][-1] if x in tagstore else NEW), x)
                for x in g[accn]
            ]
            # Lowest PRIORITY index wins; ties fall back to accession order.
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            # Same as a bare `print`; the explicit stream keeps the statement
            # parseable by Python 3 tooling.
            print >> sys.stdout, "\t".join((str(s), "|".join(tag)))