def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Derives query/subject bed paths from `lastfile`, switches the working
    directory to `odir` (side effect for subsequent plotting steps), writes an
    .anchors file from the LAST/BLAST hits, and symlinks the subject bed into
    `odir`.

    Returns (anchorsfile, qbedfile, contig_to_beds) where contig_to_beds maps
    each query seqid to its list of bed features.

    NOTE(review): `tourfile` is accepted but not used in this block -- confirm
    whether it is required by the caller's signature only.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)  # NB: changes process-wide cwd for downstream steps
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep the first two dot-separated name components
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
                    + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        # One anchor per hit: query gene, subject gene, integer score
        print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject),
                                str(int(b.score))))
    fw.close()

    # Symlink sbed so relative paths keep working inside odir
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Derives query/subject bed paths from `lastfile`, switches the working
    directory to `odir` (side effect for subsequent plotting steps), writes an
    .anchors file from the LAST/BLAST hits, and symlinks the subject bed into
    `odir`.

    Returns (anchorsfile, qbedfile, contig_to_beds) where contig_to_beds maps
    each query seqid to its list of bed features.

    NOTE(review): `tourfile` is accepted but not used in this block -- confirm
    whether it is required by the caller's signature only.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)  # NB: changes process-wide cwd for downstream steps
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep the first two dot-separated name components
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        # One anchor per hit: query gene, subject gene, integer score
        print >> fw, "\t".join(
            (gene_name(b.query), gene_name(b.subject), str(int(b.score))))
    fw.close()

    # Symlink sbed so relative paths keep working inside odir
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.

    A "lesion" is a position where at least --mindepth reads share the same
    left end (LE) or right end (RE); each is emitted as a 1-bp bed interval
    labeled LE-<pos>-r<count> or RE-<pos>-r<count>.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        # Pile up read endpoints per seqid; a stack of >= mindepth identical
        # ends marks a candidate lesion
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()

        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            # Convert 1-based position to a 0-based half-open bed interval
            print >> fw, "\t".join((seqid, str(pos - 1), str(pos), label))
    # FIX: the handle was never closed before, so output could stay unflushed
    fw.close()
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.

    Writes two files next to `bedfile`: `<prefix>.contig` (contig records with
    their placed reads) and `<prefix>.ids` (one contig ID per line).
    """
    # FIX: dropped unused `SeqIO` from the fasta import
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

        print >> fw_ids, ctg
        print >> fw, ctgline
        print >> fw, fill(ctgseq.seq)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            # Read coordinates are reported 5'->3'; swap for reverse strand
            rstart, rend = 1, readlen
            if strand == "-":
                rstart, rend = rend, rstart

            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
            print >> fw, readline
            print >> fw, fill(readseq.seq)

    # FIX: close both outputs so contents are flushed before completion is logged
    fw_ids.close()
    fw.close()

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.

    Writes two files next to `bedfile`: `<prefix>.contig` (contig records with
    their placed reads) and `<prefix>.ids` (one contig ID per line).
    """
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

        print >> fw_ids, ctg
        print >> fw, ctgline
        print >> fw, fill(ctgseq.seq)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            # Read coordinates are reported 5'->3'; swap for reverse strand
            rstart, rend = 1, readlen
            if strand == "-":
                rstart, rend = rend, rstart

            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
            print >> fw, readline
            print >> fw, fill(readseq.seq)

    # FIX: close both outputs so contents are flushed before completion is logged
    fw_ids.close()
    fw.close()

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def write_lst(bedfile):
    """
    Write one `.lst` file per seqid into a directory named after `bedfile`.

    Each output line is the feature's accn (spaces stripped) immediately
    followed by its strand, e.g. ``gene1+``.

    Returns (pf, stanza): the directory name and a list of (seqid, fname)
    pairs in the order the seqids were seen.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        # FIX: context manager guarantees each per-seqid file is closed as
        # soon as it is written (previously close happened after the append,
        # risking leaked/unflushed handles)
        with open(fname, "w") as fw:
            for b in bs:
                print("{0}{1}".format(b.accn.replace(" ", ""), b.strand),
                      file=fw)
        stanza.append((seqid, fname))
    return pf, stanza
def write_lst(bedfile):
    """
    Write one `.lst` file per seqid into a directory named after `bedfile`.

    Each output line is the feature's accn (spaces stripped) immediately
    followed by its strand. Returns (pf, stanza): the directory name and a
    list of (seqid, fname) pairs.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        fw = open(fname, "w")
        for b in bs:
            # accn with internal spaces removed, then strand, no separator
            print >> fw, "{0}{1}".format(b.accn.replace(" ", ""), b.strand)
        stanza.append((seqid, fname))
        fw.close()
    return pf, stanza
def breakpoint(args): """ %prog breakpoint blastfile bedfile Identify breakpoints where collinearity ends. `blastfile` contains mapping from markers (query) to scaffolds (subject). `bedfile` contains marker locations in the related species. """ from jcvi.formats.blast import bed from jcvi.utils.range import range_interleave p = OptionParser(breakpoint.__doc__) p.add_option("--xdist", type="int", default=20, help="xdist (in related genome) cutoff [default: %default]") p.add_option("--ydist", type="int", default=200000, help="ydist (in current genome) cutoff [default: %default]") p.add_option("-n", type="int", default=5, help="number of markers in a block [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, bedfile = args order = Bed(bedfile).order blastbedfile = bed([blastfile]) bbed = Bed(blastbedfile) key = lambda x: x[1] for scaffold, bs in bbed.sub_beds(): blocks = get_blocks(scaffold, bs, order, xdist=opts.xdist, ydist=opts.ydist, N=opts.n) sblocks = [] for block in blocks: xx, yy = zip(*block) sblocks.append((scaffold, min(yy), max(yy))) iblocks = range_interleave(sblocks) for ib in iblocks: ch, start, end = ib print "{0}\t{1}\t{2}".format(ch, start - 1, end)
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
                     + "Use if resolving ambiguities based on `overlap` length\n" \
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (cached: skipped if already present in cwd)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle (needle is run manually by the user)
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Group accns joined by ";" in the consolidated bed into connected sets
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.

    Writes `<prefix>.phases` (scaffold name and phase per line),
    `<prefix>.new.agp`, then builds `final.fasta` from the new AGP.
    """
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        # With --prefix, bucket by the ID prefix before the last "_";
        # otherwise bucket by the AGP object name
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand)
                                       for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)
        # Phase codes: 3 = lone singleton, 2 = single resolved scaffold,
        # 1 = mixture
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print >> sys.stderr, msg
        print >> fwphase, "\t".join((bname, str(phase)))

        order_to_agp(bname, ctgorder, sizes, fwagp)

    # FIX: also close the phases file (was left open before)
    fwphase.close()
    fwagp.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.

    Interpolates new gene IDs for stretches tagged NEW between flanking FRAME
    genes, avoiding blacklisted IDs and big gaps. Writes `<prefix>.new.bed`.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    # The tag is the token after the last "|" in the name field
    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        # Collect indices of NEW/FRAME features in chromosome order
        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)
            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        # Compress consecutive same-tag runs: NEW runs keep the bed features,
        # FRAME runs are reduced to their first/last rank numbers
        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            # Bounds come from flanking FRAME blocks; pad generously when the
            # NEW run is at either end of the chromosome
            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.

    Each feature is tagged FRAME (fits the longest increasing rank order),
    RETAIN (known accn, out of frame), NEW (unknown accn), or OVERLAP
    (a better-tagged member of its overlap group wins). Output goes to stdout
    as "<bedline>\\t<name>|<tag>".
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    # Regenerate the .a/.b beds only when the input is newer
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Group accns joined by ";" into overlap sets
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            # Skip accns whose encoded chromosome disagrees with the seqid
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        # Genes on the longest increasing rank subsequence keep their frame
        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        # Accepted names in both gene ("") and TE ("te") separators
        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            # Lowest PRIORITY index wins within an overlap group
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
                     + "Use if resolving ambiguities based on `overlap` length\n" \
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (cached: skipped if already present in cwd)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle (needle is run manually by the user)
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Group accns joined by ";" in the consolidated bed into connected sets
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile.

    For each component mentioned in `bedfile`, its AGP line is replaced by a
    series of sub-component lines split at the cut points. Writes and reindexes
    `<agpfile>.cut.agp`, and returns its path.
    """
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")
    fw = open(newagpfile, "w")

    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end becomes the failure message);
        # possibly intended as a range check -- confirm
        assert a.component_beg, a.component_end
        # NOTE(review): `arange` is computed but never used in this block
        arange = a.component_beg, a.component_end

        # Cut points: all interval boundaries plus the two component ends
        cuts = set()
        for i in intervals:
            start, end = i.start, i.end
            end -= 1

            assert start <= end
            cuts.add(start)
            cuts.add(end)

        cuts.add(0)
        cuts.add(component_span)
        cuts = list(sorted(cuts))

        sum_of_spans = 0
        # Reusing `i`, `a`, `b` here shadows the earlier loop variables
        for i, (a, b) in enumerate(pairwise(cuts)):
            oid = object + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = b - a
            aline += ['D', component, a + 1, b, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        # Sanity: the pieces must add back up to the original span
        assert component_span == sum_of_spans

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.

    Interpolates new gene IDs for stretches tagged NEW between flanking FRAME
    genes, avoiding blacklisted IDs and big gaps. Writes `<prefix>.new.bed`.
    """
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option("--extended_stride", default=False, action="store_true",
                 help="Toggle extended strides for gene numbering")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    fw = open(outputbed, "w")

    # The tag is the token after the last "|" in the name field
    tagkey = lambda x: x.rsplit("|", 1)[-1]
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        sbed = list(sbed)

        # Collect indices of NEW/FRAME features in chromosome order
        ranks = []
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            tag = tagkey(nametag)
            if tag in (NEW, FRAME):
                ranks.append((i, nametag))

        # Compress consecutive same-tag runs: NEW runs keep the bed features,
        # FRAME runs are reduced to their first/last rank numbers
        blocks = []
        for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
            names = list(names)
            if tag == NEW:
                blocks.append((tag, [sbed[x[0]] for x in names]))
            else:
                start, end = names[0][-1], names[-1][-1]
                start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                blocks.append((tag, [start, end]))

        id_table = {}  # old to new name conversion
        for i, (tag, info) in enumerate(blocks):
            if tag != NEW:
                continue

            # Bounds come from flanking FRAME blocks; pad generously when the
            # NEW run is at either end of the chromosome
            start_id = 0 if i == 0 else blocks[i - 1][1][-1]
            end_id = start_id + 10000 if i == len(blocks) -1 \
                        else blocks[i + 1][1][0]

            r.allocate(info, chr, start_id, end_id, id_table,
                       extended_stride=opts.extended_stride)

        # Output new names
        for i, s in enumerate(sbed):
            nametag = s.extra[0]
            name, tag = nametag.split("|")

            if tag == NEW:
                assert name == '.'
                name = id_table[s.accn]
            elif tag == OVERLAP:
                if name in id_table:
                    name = id_table[name]

            s.extra[0] = "|".join((name, tag))
            print >> fw, s

    fw.close()
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.

    Each component's AGP line is replaced by alternating sequence ('D') and
    gap ('N') lines derived from the bed ranges. Writes and reindexes
    `<agpfile>.masked.agp`, and returns its path.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end is the failure message)
        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        # (loop reuses names `a`/`b`/`i`, shadowing the outer bindings)
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            # Even-index pairs are kept sequence; odd-index pairs become gaps
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # Sanity: pieces plus gaps must add back up to the original span
        assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile.
    Use --shrink to control the exact boundaries where you cut.

    NOTE(review): no `--shrink` option is registered in this block despite the
    docstring -- confirm whether it was removed or lives elsewhere.

    For each component mentioned in `bedfile`, its AGP line is replaced by a
    series of sub-component lines split at the cut points. Writes and reindexes
    `<agpfile>.cut.agp`, and returns its path.
    """
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")
    fw = open(newagpfile, "w")

    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end becomes the failure message)
        assert a.component_beg, a.component_end
        # NOTE(review): `arange` is computed but never used in this block
        arange = a.component_beg, a.component_end

        # Cut points: all interval boundaries plus the two component ends
        cuts = set()
        for i in intervals:
            start, end = i.start, i.end
            end -= 1

            assert start <= end
            cuts.add(start)
            cuts.add(end)

        cuts.add(0)
        cuts.add(component_span)
        cuts = list(sorted(cuts))

        sum_of_spans = 0
        # Reusing `i`, `a`, `b` here shadows the earlier loop variables
        for i, (a, b) in enumerate(pairwise(cuts)):
            oid = object + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = b - a
            aline += ['D', component, a + 1, b, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        # Sanity: the pieces must add back up to the original span
        assert component_span == sum_of_spans

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.

    Each component's AGP line is replaced by alternating sequence ('D') and
    gap ('N') lines derived from the bed ranges. Writes and reindexes
    `<agpfile>.masked.agp`, and returns its path.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    if opts.log:
        fwlog = open(logfile, "w")

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        i, a = simple_agp[component]
        object = a.object
        component_span = a.component_span
        orientation = a.orientation
        if opts.log:
            print >> fwlog, a

        # NOTE(review): the comma makes this assert check only that
        # component_beg is truthy (component_end is the failure message)
        assert a.component_beg, a.component_end
        arange = a.component_beg, a.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for i in intervals:
            iv = range_intersect(arange, (i.start, i.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = a.component_beg - 1, a.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()

        sum_of_spans = 0
        # assign complements as sequence components
        # (loop reuses names `a`/`b`/`i`, shadowing the outer bindings)
        for i, (a, b) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                a, b = b, a
            if orientation not in ('+', '-'):
                orientation = '+'

            oid = object + "_{0}".format(i / 2) if opts.split else object
            aline = [oid, 0, 0, 0]
            # Even-index pairs are kept sequence; odd-index pairs become gaps
            if i % 2 == 0:
                cspan = b - a - 1
                aline += ['D', component, a + 1, b - 1, orientation]
                is_gap = False
            else:
                cspan = b - a + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan
            aline = "\t".join(str(x) for x in aline)
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # NOTE(review): this span sanity check is disabled in this version
        # (cf. the sibling `mask` where it is active) -- confirm intent
        #assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a
    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option("--rclip", default=1, type="int",
            help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option("--maxsize", default=1000000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
            help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    # Map patchers against the alternative assembly
    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    # NOTE(review): precedence bug suspected -- the conditional is INSIDE the
    # lambda body, so with rclip == 0 this lambda returns the function `key1`
    # as the group key instead of using key1 itself; likely intended
    # `(lambda x: x.query[:-rclip]) if rclip else key1`. Confirm.
    key2 = lambda x: x.query[:-rclip] if rclip else key1
    data = BlastSlow(blastfile)

    # Pair up the L/R flank hits of each patcher
    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        # Both flanks must agree in orientation on the subject
        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        # Skip oversized replacements
        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            # Strict mode: keep only gap-free replacements
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    # Zero-pad width so generated accns sort lexicographically
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        # Complement of the patched ranges stays as backbone sequence
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1

        # Start the interleave with whichever list covers position 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons: backbone seqids untouched by any patch
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)
        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)