def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # The docstring above is also the usage text handed to OptionParser.
    parser = OptionParser(nucmer.__doc__)
    opts, positional = parser.parse_args(args)
    if len(positional) != 5:
        sys.exit(not parser.print_help())

    mapbed, mtrfasta, asmfasta, seqid, window_no = positional
    window_no = int(window_no)
    window_size = 1000000

    # Write a one-line BED file covering the window_no-th 1-Mb window of seqid.
    bedfile = "sample.bed"
    window = Bed()
    window_fields = (seqid, (window_no - 1) * window_size, window_no * window_size)
    window.add("\t".join(map(str, window_fields)))
    window.print_to_file(bedfile)

    # Keep column 4 of every mapping that intersects the window.
    idsfile = "query.ids"
    sh(
        "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
            mapbed, bedfile
        ),
        outfile=idsfile,
    )

    # Subject: the window cut out of the MTR FASTA.
    # Query: the assembly records listed in idsfile.
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    # Align, plot, and rename the plot after the selected window.
    # NOTE(review): relies on nucmer writing `out.delta` and the plot step
    # writing `out.pdf` in the working directory - confirm.
    sh("nucmer {0} {1}".format(sfasta, qfasta))
    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, window_no))
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above is also the usage text handed to OptionParser.
    #
    # For every anchor pair one BED line is emitted: the coordinates of the
    # reference element, named "<chromosome number>:<position>" after the
    # aligned element. --switch exchanges the two roles; --scale divides the
    # aligned position by the given factor.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale

    # Query accession => every subject accession it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, _ in ac.iter_pairs():
        pairs[a].append(b)

    qbed, _, _, sorder, _ = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for partner in pairs[q.accn]:
            _, s = sorder[partner]
            # Choose the roles afresh for every pair. Swapping loop-carried
            # variables in place would leak the previous partner into the next
            # iteration when a query has several partners and --switch is on.
            ref, aligned = (s, q) if switch else (q, s)

            position = aligned.start
            if scale:
                position /= scale

            try:
                chr_number = get_number(aligned.seqid)
            except ValueError:
                # Call-style raise: valid on both Python 2 and Python 3.
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        aligned.accn, aligned.seqid
                    )
                )

            bedline = "\t".join(
                str(x)
                for x in (
                    ref.seqid,
                    ref.start - 1,
                    ref.end,
                    "{0}:{1}".format(chr_number, position),
                )
            )
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # NOTE: the docstring is reused verbatim as the command's usage message.
    optparser = OptionParser(nucmer.__doc__)
    opts, argv = optparser.parse_args(args)

    if len(argv) != 5:
        sys.exit(not optparser.print_help())

    mapbed, mtrfasta, asmfasta, chrom, idx = argv
    idx = int(idx)

    # Intermediate files, all created in the current directory.
    bedfile = "sample.bed"
    idsfile = "query.ids"
    qfasta = "query.fasta"

    # Window number idx, in units of one megabase, written as a BED interval.
    megabase = 1000000
    lo = (idx - 1) * megabase
    hi = idx * megabase
    region = Bed()
    region.add("{0}\t{1}\t{2}".format(chrom, lo, hi))
    region.print_to_file(bedfile)

    # Fourth column of the mappings that fall in the window.
    intersect_cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
        mapbed, bedfile
    )
    sh(intersect_cmd, outfile=idsfile)

    # Reference side comes from the MTR FASTA, query side from the assembly.
    sfasta = fastaFromBed(bedfile, mtrfasta)
    subset_cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(subset_cmd)

    align_cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(align_cmd)

    # Plot the alignment and name the PDF after the chromosome and window.
    mummerplot_main(["out.delta", "--refcov=0"])
    rename_cmd = "mv out.pdf {0}.{1}.pdf".format(chrom, idx)
    sh(rename_cmd)
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # The docstring above is also the usage text handed to OptionParser.
    #
    # Outline:
    #   1. refine() pairs candidate scaffolds with gaps.
    #   2. The gaps that received scaffolds are written to a BED file and used
    #      to mask the AGP built from the chromosome FASTA.
    #   3. In the BED form of the masked AGP, every interval that coincides
    #      with such a gap is replaced by its scaffolds; all other intervals
    #      are copied unchanged. The result is written to `final.bed`.
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    corder = Bed(candidates).order
    gorder = Bed(gapsbed).order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid
    gap_to_scf = defaultdict(list)  # gapid => [(scaffold, strand), ...]

    # Read under `with` so the handle is closed even if a row fails to parse.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) <= 6:
                continue
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            # A gap is registered the first time it shows up. Membership
            # testing does not create a defaultdict entry.
            if gapid not in gap_to_scf:
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    def candidate_position(entry):
        # start + end of the scaffold's candidate placement (twice its midpoint).
        placement = corder[entry[0]][1]
        return placement.start + placement.end

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            finalbed.add("{0}\n".format(b))
            continue

        scfs = gap_to_scf[gappositions[key]]
        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=candidate_position)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add(
                "\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand))
            )

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # The docstring above is also the usage text handed to OptionParser.
    #
    # Each anchor pair becomes one BED line carrying the reference element's
    # interval and a name of the form "<chromosome number>:<position>" taken
    # from the aligned element. --switch exchanges reference and aligned
    # elements; --scale divides the aligned position by the given factor.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorsfile,) = args
    switch = opts.switch
    scale = opts.scale

    # Group the subject accessions of each query accession.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for query_accn, subject_accn, _ in ac.iter_pairs():
        pairs[query_accn].append(subject_accn)

    qbed, _, _, sorder, _ = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for subject_accn in pairs[q.accn]:
            _, s = sorder[subject_accn]

            # Roles are decided per pair from the untouched q and s records.
            # An in-place swap of variables that outlive the iteration would,
            # under --switch, pair the second and later subjects with the
            # previous subject instead of with the query.
            if switch:
                ref, aligned = s, q
            else:
                ref, aligned = q, s

            aligned_start = aligned.start
            if scale:
                aligned_start /= scale

            try:
                newsseqid = get_number(aligned.seqid)
            except ValueError:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        aligned.accn, aligned.seqid
                    )
                )

            name = "{0}:{1}".format(newsseqid, aligned_start)
            bd.add(
                "\t".join(str(x) for x in (ref.seqid, ref.start - 1, ref.end, name))
            )

    bd.print_to_file(filename=opts.outfile, sorted=True)
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring above is also the usage text handed to OptionParser.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()
    opts, positional = parser.parse_args(args)

    if len(positional) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = positional[0]
    _, _, qorder, sorder, _ = check_beds(anchorsfile, parser, opts)

    # Chromosome-number pairs of interest, smaller number first.
    wanted_pairs = {
        (1, 1),
        (1, 6),
        (1, 8),
        (1, 13),
        (2, 4),
        (3, 12),
        (3, 14),
        (5, 6),
        (5, 8),
        (7, 9),
        (7, 11),
        (9, 10),
        (10, 11),
    }
    # A region is labeled with the first member of its pair found in here.
    label_chromosomes = (1, 2, 3, 5, 7, 10)

    def match_pair(achr, bchr):
        # Only give up when neither sequence name contains "chr".
        if "chr" not in achr and "chr" not in bchr:
            return None
        pair = tuple(sorted((get_number(achr), get_number(bchr))))
        return pair if pair in wanted_pairs else None

    def region_line(first, last, pair):
        labels = [str(n) for n in pair if n in label_chromosomes]
        fields = (first.seqid, first.start, last.end, labels[0])
        return "\t".join(str(f) for f in fields)

    # Walk the anchor blocks and keep both sides of every block that links one
    # of the wanted chromosome pairs.
    outbed = Bed()
    for block in AnchorFile(anchorsfile).blocks:
        a_names, b_names, _ = zip(*block)
        a_hits = [qorder[name] for name in a_names]
        b_hits = [sorder[name] for name in b_names]
        # Order entries are indexed at [1] for the BED record; min/max give
        # the first and last entry of the block on each side.
        astart, aend = min(a_hits)[1], max(a_hits)[1]
        bstart, bend = min(b_hits)[1], max(b_hits)[1]

        target = match_pair(astart.seqid, bstart.seqid)
        if target is None:
            continue
        outbed.add(region_line(astart, aend, target))
        outbed.add(region_line(bstart, bend, target))

    outbed.print_to_file(sorted=True)
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # NOTE: the docstring is reused verbatim as the command's usage message.
    #
    # Steps: refine the candidate/gap pairing, mask the chromosome AGP at the
    # gaps that received scaffolds, then rewrite the masked layout with those
    # gap intervals replaced by their scaffolds (output: `final.bed`).
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid
    gap_to_scf = defaultdict(list)
    seen = set()
    # The context manager closes the refined BED as soon as it has been read;
    # a bare open() would leave the handle to the garbage collector.
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) <= 6:
                continue
            unplaced, strand, gapid = atoms[3], atoms[5], atoms[9]
            if gapid not in seen:
                # First scaffold for this gap: remember the gap itself.
                seen.add(gapid)
                gb = gorder[gapid][1]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            # Not one of the filled gaps: copy the interval through unchanged.
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]
        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add(
                "\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand))
            )

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def extract(args):
    """Print the sequences of the requested locations from a FASTA database.

    Attributes read from ``args``:

    * ``db`` - path to a FASTA file, or a name resolved to
      ``$genome/data/<db>/10_genome.fna``.
    * ``loc`` - either a file or a comma-separated location string. A file is
      read as a list of sequence IDs when ``list`` is set, otherwise as BED.
      A location string item is ``id:beg-end`` (``-``, ``.`` or ``..`` as the
      range separator, commas allowed in the numbers) or a bare ``id``.
    * ``list`` - treat the ``loc`` file as one sequence ID per line.
    * ``padding`` - pad out-of-range intervals with ``N`` to the requested size.
    * ``tsv`` - print ``id, beg, end, seq`` rows instead of FASTA records.

    Output goes to standard output. Sequence IDs missing from the database
    are skipped.
    """
    import re
    from jcvi.formats.bed import Bed

    if op.isfile(args.db):
        db = Fasta(args.db)
    else:
        f_db = "%s/data/%s/10_genome.fna" % (os.environ["genome"], args.db)
        assert op.isfile(f_db), "cannot find %s" % args.db
        db = Fasta(f_db)

    # Raw strings: the patterns are unchanged, but `\w`, `\d`, `\:` ... are no
    # longer invalid escape sequences in an ordinary string literal.
    reg1 = re.compile(r"^([\w\-]+)\:([\d,]+)(\-|\.{1,2})([\d,]+)$")
    reg2 = re.compile(r"^([\w\-]+)$")

    def add_whole_sequence(target, sid):
        # Queue the full length of `sid`; IDs absent from the db are ignored.
        if sid in db:
            target.add("%s\t%d\t%d\n" % (sid, 0, len(db[sid])))

    bed = Bed()
    if op.isfile(args.loc):
        if args.list:
            fho = must_open(args.loc, 'r')
            try:
                for line in fho:
                    add_whole_sequence(bed, line.strip())
            finally:
                # Release the handle even if a line fails to process.
                fho.close()
        else:
            bed = Bed(args.loc, sorted=False)
    else:
        for loc in args.loc.split(","):
            res = reg1.match(loc)
            if res:
                sid, beg, end = res.group(1), res.group(2), res.group(4)
                beg = int(beg.replace(",", ""))
                end = int(end.replace(",", ""))
                bed.add("%s\t%d\t%d\n" % (sid, beg - 1, end))
                continue

            res = reg2.match(loc)
            if res:
                add_whole_sequence(bed, res.group(1))
            else:
                logging.error("%s: unknown locstr => skipped" % loc)

    for b in bed:
        # NOTE(review): b.start is presumably 1-based here (the slice below
        # uses beg - 1) - confirm against the Bed line class.
        sid, beg, end = b.seqid, b.start, b.end
        oid = sid if args.list else f"{sid}-{beg}-{end}"
        if b.accn:
            oid = b.accn
        if sid not in db:
            print("%s not in db => skipped" % sid)
            continue

        size = end - beg + 1
        seqlen = len(db[sid])

        # Clip the interval to the sequence and count the clipped bases.
        bp_pad = 0
        if beg < 1:
            bp_pad += 1 - beg
            beg = 1
        if beg > seqlen:
            bp_pad = 1
            beg = seqlen
        if end > seqlen:
            bp_pad += end - seqlen
            end = seqlen

        seq = db[sid][beg - 1:end].seq
        if args.padding:
            if bp_pad > 0:
                if end - beg + 1 < 30:
                    # Fewer than 30 bases remain after clipping: emit Ns only.
                    seq = "N" * size
                else:
                    seq += "N" * bp_pad
            assert len(seq) == size, "error in seq size: %s:%d-%d %d" % (
                sid, beg, end, bp_pad)

        if args.tsv:
            print("\t".join([sid, str(beg), str(end), seq]))
        else:
            rcd = SeqRecord(Seq(seq), id=oid, description='')
            SeqIO.write([rcd], sys.stdout, 'fasta')