def batch_entrez(list_of_terms, db="nuccore", retmax=1, rettype="fasta",
                 batchsize=1, email=myEmail):
    """
    Retrieve multiple rather than a single record.

    For every non-empty term in `list_of_terms`, run an Entrez search and
    fetch the hits in groups of `batchsize`. Each batch is yielded as a tuple
    (comma-joined ids, number of ids, term, fetch handle). Network errors are
    logged and the request is retried every 5 seconds until it succeeds.
    """
    fetch_errors = (urllib2.HTTPError, urllib2.URLError, RuntimeError)
    search_errors = fetch_errors + (KeyError,)

    for term in list_of_terms:
        logging.debug("Search term %s" % term)
        if not term:
            continue

        hits = None
        searched = False
        while not searched:
            try:
                search_handle = Entrez.esearch(db=db, retmax=retmax, term=term)
                rec = Entrez.read(search_handle)
                # Flag is raised before the lookup, so a missing "IdList" is
                # logged once and then treated as "not found" (no retry).
                searched = True
                hits = rec["IdList"]
            except search_errors as e:
                logging.error(e)
                logging.debug("wait 5 seconds to reconnect...")
                time.sleep(5)

        if not hits:
            logging.error("term {0} not found".format(term))
            continue

        nhits = len(hits)
        if nhits > 1:
            logging.debug("A total of {0} results found.".format(nhits))
        if batchsize != 1:
            logging.debug("Use a batch size of {0}.".format(batchsize))

        for batch in list(grouper(hits, batchsize)):
            # Drop the fill values grouper adds to the last, shorter batch
            batch = [x for x in batch if x]
            size = len(batch)
            id_string = ",".join(batch)

            fetched = False
            while not fetched:
                try:
                    fetch_handle = Entrez.efetch(db=db, id=id_string,
                                                 rettype=rettype, email=email)
                    fetched = True
                except fetch_errors as e:
                    logging.error(e)
                    logging.debug("wait 5 seconds to reconnect...")
                    time.sleep(5)

            yield id_string, size, term, fetch_handle
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold file followed by any number of (blast, sizes, bed) trios
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    # NOTE(review): grouper is called as (n, iterable) here; confirm this
    # matches the signature exported by jcvi.utils.iter.
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Build a one-record Sizes object for this scaffold via a scratch file
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        # presumably removes the scratch file once loaded -- confirm in Sizes
        tmpsizes.close(clean=True)

        # Always bind subhighlights: previously it was only assigned when
        # --highlights was given, so the call below raised UnboundLocalError
        # without that option.
        # NOTE(review): assumes plot_one_scaffold accepts highlights=None.
        if hlbed is not None:
            subhighlights = list(hlbed.sub_bed(scaffoldID))
        else:
            subhighlights = None

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def iter_project(folder, pattern, n=2):
    """
    Yield (files, prefix) for each complete group of `n` read files.

    Files under `folder` matching `pattern` are collected with iglob and
    taken `n` at a time; `prefix` is whatever pairspf derives from the
    basenames of the group. Incomplete groups are skipped.
    """
    # Check for paired reads and extract project id
    filelist = list(iglob(folder, pattern))
    for p in grouper(filelist, n):
        # grouper is assumed to pad the last, shorter group with None (the
        # itertools-recipe behaviour), so a length test alone never rejects
        # it and None would reach op.basename below.
        if len(p) != n or None in p:
            continue
        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
def iter_project(folder, n=2):
    """
    Yield (files, prefix) for each complete group of `n` read files.

    Files directly inside `folder` whose last extension is fq, fastq, txt or
    gz are taken `n` at a time; `prefix` is whatever pairspf derives from the
    basenames of the group. Incomplete groups are skipped.
    """
    # Check for paired reads and extract project id
    extensions = ("fq", "fastq", "txt", "gz")
    # glob returns names in arbitrary order; sort so that mates, which differ
    # only in their read-number suffix, end up next to each other.
    filelist = sorted(x for x in glob(folder + "/*.*")
                      if x.rsplit(".", 1)[-1] in extensions)
    for p in grouper(filelist, n):
        # grouper is assumed to pad the last, shorter group with None, so the
        # length test alone would let None reach op.basename below.
        if len(p) != n or None in p:
            continue
        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2):
    """
    Group the read files found under `folder` and yield (files, prefix).

    Files matching `pattern` are consumed `n` at a time. Groups that are not
    exactly `n` long or that contain a None entry are ignored. `prefix` is
    the value pairspf returns for the basenames of the group.
    """
    # Check for paired reads and extract project id
    matched = list(iglob(folder, pattern))
    for group in grouper(matched, n):
        is_complete = len(group) == n and None not in group
        if not is_complete:
            continue
        basenames = [op.basename(path) for path in group]
        prefix = pairspf(basenames)
        yield list(group), prefix
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    # Pipeline registered with MakeManager:
    #   1. split the file-of-filenames into chunks of 3 and pbalign each one
    #   2. merge, sort and repack the per-chunk cmp.h5 files
    #   3. call quiver on the merged alignments
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    with open(fofn) as fp:
        # Blank lines would otherwise turn into empty file names
        flist = sorted(x.strip() for x in fp if x.strip())

    h5list = []
    mm = MakeManager()
    for i, fl in enumerate(grouper(flist, 3)):
        # The last chunk is padded by grouper when the number of files is not
        # a multiple of 3; joining the padding (None) would raise TypeError.
        fl = [x for x in fl if x]
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        with open(fn, "w") as fw:
            fw.write("\n".join(fl) + "\n")

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2,
                 commonprefix=True):
    """
    Yield (sorted files, prefix) for every full group of `n` read files.

    Files under `folder` matching `pattern` are taken `n` at a time; a group
    is dropped when it holds a None entry or is not `n` long. `commonprefix`
    is forwarded unchanged to pairspf, which computes the prefix from the
    basenames of the group.
    """
    # Check for paired reads and extract project id
    candidates = list(iglob(folder, pattern))
    for members in grouper(candidates, n):
        if None in members or len(members) != n:
            continue
        names = [op.basename(member) for member in members]
        project = pairspf(names, commonprefix=commonprefix)
        yield sorted(members), project
def iter_project(folder, n=2):
    """
    Yield (files, prefix) for each complete group of `n` read files.

    Files directly inside `folder` whose last extension is fq, fastq, txt or
    gz are taken `n` at a time; `prefix` is whatever pairspf derives from the
    basenames of the group. Incomplete groups are skipped.
    """
    # Check for paired reads and extract project id
    extensions = ("fq", "fastq", "txt", "gz")
    # glob gives no ordering guarantee; sort so that mate files sit side by
    # side before they are grouped.
    filelist = sorted(x for x in glob(folder + "/*.*")
                      if x.rsplit(".", 1)[-1] in extensions)
    for p in grouper(filelist, n):
        # grouper is assumed to fill the last, shorter group with None; the
        # length check alone would not catch that and op.basename(None) fails.
        if len(p) != n or None in p:
            continue
        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
def tile(lt, width=70, gap=1):
    """
    Pretty print list of items.

    Every item is right-justified to the width of the longest item plus
    `gap`, and as many cells as fit into `width` characters are placed on
    each line (always at least one). Returns the lines joined by newlines,
    or an empty string when `lt` is empty.
    """
    if not lt:
        # max() below would raise ValueError on an empty sequence
        return ""

    max_len = max(len(x) for x in lt) + gap
    # Floor division: "/" yields a float on Python 3, which cannot be used
    # as a chunk size.
    items_per_line = max(width // max_len, 1)
    cells = [x.rjust(max_len) for x in lt]
    rows = (cells[i:i + items_per_line]
            for i in range(0, len(cells), items_per_line))
    return "\n".join("".join(row) for row in rows)
def __iter__(self):
    """
    Iterate over the stacks stored in self.filename.

    Lines starting with '/' act as separators. Each run of other lines is
    read two lines at a time as (name, sequence) and collected into a Clust
    of (name, seq, nrep) tuples, where nrep is getsize(name). Progress is
    logged every 10000 stacks.
    """
    handle = must_open(self.filename)
    grouped = groupby(handle, lambda row: row[0] == '/')
    # Keep only the runs between separator lines
    stacks = (rows for is_separator, rows in grouped if not is_separator)

    for count, rows in enumerate(stacks, 1):
        cluster = Clust()
        for header, sequence in grouper(rows, 2):
            header = header.strip()
            cluster.append((header, sequence.strip(), getsize(header)))
        yield cluster

        if count % 10000 == 0:
            logging.debug("{0} stacks parsed".format(count))
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    from jcvi.apps.base import iglob
    from jcvi.utils.iter import grouper

    p = OptionParser(gallery.__doc__)
    p.add_option("--columns", default=3, type="int",
                 help="How many cells per row")
    p.add_option("--width", default=200, type="int",
                 help="Image width")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, link_prefix = args
    width = opts.width
    images = iglob(folder, "*.jpg,*.JPG,*.png")

    cell = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)
    base_url = link_prefix.rstrip("/")

    print("<table>")
    for row in grouper(images, opts.columns):
        print(row_open)
        for image in row:
            # Falsy entries (e.g. the padding of the last row) give no cell
            if image:
                fname = op.basename(image)
                # Caption: name up to the first dot, underscores as dashes
                caption = fname.split('.')[0].replace('_', '-')
                print(cell.format(caption, base_url + "/" + fname, width))
        print("</tr>")
    print("</table>")
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
                 help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # One contig file followed by (reads fasta, mapping bed) pairs
    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    pairs = list(grouper(args[1:], 2))

    trios = []
    for fastafile, bedfile in pairs:
        stem = bedfile.rsplit(".", 1)[0]
        matefile = stem + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            # NOTE(review): opts.cutoff is read here but no --cutoff option is
            # registered in this function; confirm set_rclip() provides it.
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt.append("--prefix")
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta = "bambus.reads.fasta"
    bbbed = "bambus.bed"
    bbmate = "bambus.mates"
    merged_names = (bbfasta, bbbed, bbmate)
    for files, outfile in zip(zip(*trios), merged_names):
        merger = FileMerger(files, outfile=outfile)
        merger.merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split the contigs into those listed in idsfile and the remaining ones
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    extract = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(extract + inputfasta)
    sh(extract + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    bambus_cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        bambus_cmd += " -C {0}".format(opts.conf)
    sh(bambus_cmd)

    untangle_cmd = \
        "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml"
    sh(untangle_cmd.format(prefix))

    final = "final"
    print_cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \
                "-merge -detail -oo -sum -o {1}"
    sh(print_cmd.format(prefix, final))

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
                 help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # Expect the contig fasta plus one or more (reads, mapping) pairs
    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    ctgfasta = args[0]
    # NOTE(review): grouper is called as (n, iterable) here; confirm this
    # matches the signature exported by jcvi.utils.iter.
    read_sets = list(grouper(2, args[1:]))

    trios = []
    for fastafile, bedfile in read_sets:
        base = bedfile.rsplit(".", 1)[0]
        matefile = base + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [bedfile, "--lib", "--nointra"]
            if opts.prefix:
                matesopt.append("--prefix")
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"
    columns = zip(*trios)
    for files, outfile in zip(columns, (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Contigs named in idsfile go to one fasta, all others to the singletons
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    select = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(select + inputfasta)
    sh(select + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    steps = ["goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)]
    if opts.conf:
        steps[0] += " -C {0}".format(opts.conf)
    steps.append(
        "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".
        format(prefix)
    )

    final = "final"
    steps.append(
        "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
        "-merge -detail -oo -sum -o {1}".format(prefix, final)
    )
    for step in steps:
        sh(step)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])