def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser), so developer notes live in comments instead.
    #
    # args: list of command-line tokens; exactly three positionals are
    #       required (bedfile, contigfasta, readfasta).
    # Writes `<prefix>.contig` and `<prefix>.ids`, where prefix is the BED file
    # name with its last extension removed. Returns None.
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None, so this exits with a non-zero status
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."

    # Context managers make sure both outputs are flushed and closed, even if
    # a contig or read lookup raises halfway through.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigfasta[ctg]
            ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum
            )

            print(ctg, file=fw_ids)
            print(ctgline, file=fw)
            print(fill(ctgseq.seq), file=fw)

            for b in reads:
                read = b.accn
                strand = b.strand
                readseq = readfasta[read]
                rc = " [RC]" if strand == "-" else ""
                readlen = len(readseq)
                rstart, rend = 1, readlen
                if strand == "-":
                    # minus-strand reads report their range reversed
                    rstart, rend = rend, rstart

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange
                )
                print(readline, file=fw)
                print(fill(readseq.seq), file=fw)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # The docstring is reused verbatim as the OptionParser usage message, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens; three positionals are expected
    #       (bedfile, contigfasta, readfasta).
    # Side effects: writes `<prefix>.contig` (contig and read records) and
    # `<prefix>.ids` (one contig ID per line). Returns None.
    # SeqIO is imported here but not referenced in this function.
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # `not None` is True, hence a non-zero exit status after showing help
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."

    # `with` closes both files deterministically; previously they were left
    # open until interpreter exit.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigfasta[ctg]
            ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum
            )

            print(ctg, file=fw_ids)
            print(ctgline, file=fw)
            print(fill(ctgseq.seq), file=fw)

            for b in reads:
                read = b.accn
                strand = b.strand
                is_reverse = strand == "-"
                readseq = readfasta[read]
                rc = " [RC]" if is_reverse else ""
                readlen = len(readseq)
                # Read range is 1..readlen, flipped for minus-strand reads
                rstart, rend = (readlen, 1) if is_reverse else (1, readlen)

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange
                )
                print(readline, file=fw)
                print(fill(readseq.seq), file=fw)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def emitFragment(fw, fragID, libID, shredded_seq, fasta=False):
    """
    Print out the shredded sequence.

    When `fasta` is true, one record with id `fragID` and an empty description
    is written to `fw` through SeqIO.write in "fasta" format. Otherwise the
    fragment is rendered with `frgTemplate`, using DEFAULTQV repeated once per
    base as the quality string, and written to `fw` followed by a newline.

    `libID` is only used by the template branch. Returns None.
    """
    if fasta:
        s = SeqRecord(shredded_seq, id=fragID, description="")
        SeqIO.write([s], fw, "fasta")
        return

    seq = str(shredded_seq)
    slen = len(seq)
    qvs = DEFAULTQV * slen  # shredded reads have default low qv

    record = frgTemplate.format(
        fragID=fragID, libID=libID, seq=fill(seq), qvs=fill(qvs), slen=slen
    )
    # print function (rather than the Python 2 `print >> fw` statement) to
    # match the print(..., file=...) form used elsewhere in this file
    print(record, file=fw)
def add_objective(self, edges, objective=MAXIMIZE):
    """
    Store the objective sense and build the objective expression.

    Each edge is an (a, b, w) triple. Every edge with a truthy weight `w`
    contributes a term " + {w}x{k}", where k is the 1-based position of the
    edge in `edges`; edges with a falsy weight are skipped but still consume
    their index. The terms are handed to `fill` with width=10 and the result
    is kept in `self.sum`.

    Raises AssertionError when `edges` is empty.
    """
    assert edges, "Edges must be non-empty"
    self.objective = objective

    terms = []
    for position, (_src, _dst, weight) in enumerate(edges):
        if not weight:
            continue
        terms.append(" + {0}x{1}".format(weight, position + 1))

    self.sum = fill(terms, width=10)
def print_objective(lp_handle, edges, objective=MAXIMIZE):
    """
    CPLEX LP format commonly contains three blocks:
    objective, constraints, vars spec
    <http://lpsolve.sourceforge.net/5.0/CPLEX-format.htm>

    Writes the objective block to `lp_handle`: first the `objective` value on
    its own line, then one " + {w}x{k}" term per (a, b, w) edge, where k is
    the 1-based position of the edge. The terms are laid out by `fill` with
    width=10. Unlike add_objective, zero-weight edges are not filtered out.
    """
    # print function instead of the Python 2 `print >> handle` statement,
    # matching the print(..., file=...) form used elsewhere in this file
    print(objective, file=lp_handle)

    items = [" + {0}x{1}".format(w, i + 1) for i, (a, b, w) in enumerate(edges)]
    sums = fill(items, width=10)
    print(sums, file=lp_handle)
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # The docstring above is the command-line usage text (passed to
    # OptionParser), so it is left as-is and notes go in comments.
    # NOTE(review): the usage text mentions a .mates file, but nothing in this
    # function writes one.
    #
    # args: command-line tokens; two positionals are required
    #       (bamfile, fastafile).
    # Writes `<prefix>.ace`, plus `<prefix>.astat` with --astat and
    # `<prefix>.reads` with --readids, where prefix is the BAM file name up to
    # its first ".". Returns None.
    p = OptionParser(ace.__doc__)
    # splitdir, unpaired, minreadno and minctgsize are accepted on the command
    # line but are not read anywhere in this function.
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None -> exit status is non-zero
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = astatfw = readsfw = None
    try:
        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Keep the statistic in its own name: reusing `astat` would
                # overwrite the on/off flag, and a value of 0.0 would then
                # silently disable .astat output for all remaining contigs.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            # One quality value per contig base, all set to the default
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    # No sequence from cigar_to_seq: skip the RD/QA records
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    rname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        # Close whichever outputs were opened, even if writing failed midway
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # The docstring is shown to users as the usage message (it is passed to
    # OptionParser), so developer notes are kept in comments.
    # NOTE(review): the usage text mentions a .mates file, but this function
    # does not write one.
    #
    # args: command-line tokens; two positionals are required
    #       (bamfile, fastafile).
    # Outputs: `<prefix>.ace` always; `<prefix>.astat` when --astat is given;
    # `<prefix>.reads` when --readids is given. prefix is the BAM file name up
    # to its first ".". Returns None.
    p = OptionParser(ace.__doc__)
    # splitdir, unpaired, minreadno and minctgsize are parsed but not read
    # anywhere in this function.
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so the process exits with status 1
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = astatfw = readsfw = None
    try:
        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        # print(..., file=...) replaces the Python 2 `print >> fh` statements,
        # matching the print-function form used elsewhere in this file.
        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print("CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                nsegments), file=fw)
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Separate name for the statistic: assigning it back to
                # `astat` clobbered the boolean flag, so a 0.0 value would
                # switch off .astat output for every later contig.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            # Default quality repeated once per contig base
            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    # cigar_to_seq produced no sequence; emit no RD/QA pair
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos,
                        ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        # Release every output that was actually opened
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423
    /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1
    /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB
    /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    # The docstring is the usage message handed to OptionParser; developer
    # notes therefore live in comments.
    #
    # args: command-line tokens; two positionals (fastafile, plateMapping).
    # Writes MetaData.txt, GSS.txt and GSS.log in the current directory.
    #
    # IMPORTANT: local variable names in this function are part of its
    # behavior. `vars.update(locals())` exposes them to GSSTemplate, so do not
    # rename or remove locals without checking the templates.
    # `vars` here is used as a mapping (`**vars`, `.update`), so it is
    # presumably a module-level dict that shadows the builtin - confirm.
    p = OptionParser(gss.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not None` -> True -> exit status 1 on a usage error, consistent
        # with the other commands in this file (was exit status 0).
        sys.exit(not p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    # Close MetaData.txt before `fw` is rebound to GSS.txt below; previously
    # this handle was dropped without being closed.
    with open("MetaData.txt", "w") as fw:
        print(PublicationTemplate.format(**vars), file=fw)
        print(LibraryTemplate.format(**vars), file=fw)
        print(ContactTemplate.format(**vars), file=fw)
    logging.debug("Meta data written to `{0}`".format(fw.name))

    with open("GSS.txt", "w") as fw, open("GSS.log", "w") as fw_log:
        for rec in SeqIO.parse(fastafile, "fasta"):
            # First pass just check well number matchings and populate sequences in
            # the same clone
            description = rec.description
            a = parse_description(description)
            direction = a["direction"][0]
            sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
            sequencer_plate_well_coordinates = \
                a["sequencer_plate_well_coordinates"][0]
            sequencer_plate_96well_quadrant = \
                a["sequencer_plate_96well_quadrant"][0]
            sequencer_plate_96well_coordinates = \
                a["sequencer_plate_96well_coordinates"][0]

            # Check the 96-well ID is correctly converted to 384-well ID
            w96 = sequencer_plate_96well_coordinates
            w96quad = int(sequencer_plate_96well_quadrant)
            w384 = sequencer_plate_well_coordinates
            assert convert_96_to_384(w96, w96quad) == w384

            plate = sequencer_plate_barcode
            assert plate in plateMapping, \
                "{0} not found in `{1}` !".format(plate, mappingfile)

            plate = plateMapping[plate]
            d = Directions[direction]

            cloneID = "{0}{1}".format(plate, w384)
            gssID = "{0}{1}".format(cloneID, d)

            # De-duplicate exactly as the second pass does, so the IDs
            # collected per clone match the IDs that are later written. An
            # extra `seen[gssID] += 1` here used to make the two passes
            # disagree on the suffix given to repeated IDs.
            seen[gssID] += 1
            if seen[gssID] > 1:
                gssID = "{0}{1}".format(gssID, seen[gssID])

            clone[cloneID].add(gssID)

        # Restart the duplicate counter for the writing pass
        seen = defaultdict(int)

        for rec in SeqIO.parse(fastafile, "fasta"):
            # need to populate gssID, mateID, cloneID, seq, plate, row, column
            description = rec.description
            a = parse_description(description)
            direction = a["direction"][0]
            sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
            sequencer_plate_well_coordinates = \
                a["sequencer_plate_well_coordinates"][0]
            w384 = sequencer_plate_well_coordinates

            plate = sequencer_plate_barcode
            plate = plateMapping[plate]
            d = Directions[direction]

            # 384-well coordinate: first character is the row, rest the column
            row = w384[0]
            column = int(w384[1:])
            seq = fill(str(rec.seq), width=70)

            cloneID = "{0}{1}".format(plate, w384)
            gssID = "{0}{1}".format(cloneID, d)
            primer = Primers[d]

            seen[gssID] += 1
            if seen[gssID] > 1:
                logging.error("duplicate key {0} found".format(gssID))
                gssID = "{0}{1}".format(gssID, seen[gssID])

            # Every other GSS ID recorded for the same clone in the first pass
            othergss = clone[cloneID] - set([gssID])
            othergss = ", ".join(sorted(othergss))
            # Expose the locals above to the template (see note at the top)
            vars.update(locals())

            print(GSSTemplate.format(**vars), file=fw)

            # Write conversion logs to log file
            print("{0}\t{1}".format(gssID, description), file=fw_log)
            print("=" * 60, file=fw_log)

        logging.debug("A total of {0} seqs written to `{1}`".\
                format(len(seen), fw.name))