def download(self, name, base_url, cookies, downloader=None):
    """Fetch this node's file when it carries a URL; otherwise recurse
    into children whose key matches ``name``.

    See also: <https://genome.jgi.doe.gov/portal/help/download.jsf>

    Args:
        name (str, optional): Name of the file. Defaults to None.
        base_url (str): Link to the file on the internet.
        cookies (str, optional): cookies file. Defaults to None.
        downloader (str, optional): Use a given downloader. One of
            wget|curl|powershell|insecure. Defaults to None.
    """
    is_leaf_hit = self.name == name and base_url and self.url
    if is_leaf_hit:
        # This node is the requested file: resolve the relative URL and fetch
        full_url = urljoin(base_url, self.url)
        download(full_url, filename=name, debug=True, cookies=cookies,
                 downloader=downloader)
        return name

    # Otherwise descend into any child registered under the requested name
    for child_name, child in self.items():
        if child_name == name:
            child.download(name, base_url, cookies, downloader=downloader)
    return name
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    (term,) = args
    m = sra_run_id_re.search(term)
    if m is None:
        logging.error("Incorrect SRA identifier format "
                      "[should be like SRR126150, SRR1001901. "
                      "len(identifier) should be between 9-10 characters]")
        sys.exit()

    # ByRun layout nests the file under prefix (e.g. SRR) and a 6-char subprefix
    prefix = m.group(1)
    subprefix = "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
def download_species_ensembl(species, valid_species, url):
    """Fetch the Ensembl GTF annotation and CDS FASTA for one species."""
    assert species in valid_species, "{0} is not in the species list".format(species)

    # Annotation (GTF) and CDS sequences live in parallel FTP directories
    for subdir in ("gtf/{0}".format(species), "fasta/{0}/cds".format(species)):
        folder_url = urljoin(url, subdir)
        for fname in ls_ftp(folder_url):
            if fname.endswith(".gz"):
                download(urljoin(folder_url, fname))
def download_species_ensembl(species, valid_species, url):
    """Download assembly annotation (GTF) and CDS files for a given species."""
    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    # Assemble the two remote folders we need for this species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))
    for folder in (ann_url, cds_url):
        gz_files = [urljoin(folder, x) for x in ls_ftp(folder)
                    if x.endswith(".gz")]
        for remote in gz_files:
            download(remote)
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (url,) = args
    use_img = opts.img

    # Pull the page down once, then parse it locally
    htmlfile = download(url)
    soup = BeautifulSoup(open(htmlfile).read())

    # <img src=...> when --img, otherwise <a href=...>
    tag, attr = ("img", "src") if use_img else ("a", "href")
    for node in soup.findAll(tag):
        print(urljoin(url, node.get(attr)))
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option(
        "--img",
        default=False,
        action="store_true",
        help="Extract <img> tags [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (url,) = args

    # Download the page and load its markup
    page_source = open(download(url)).read()
    soup = BeautifulSoup(page_source)

    if opts.img:
        tag_name, attr_name = "img", "src"
    else:
        tag_name, attr_name = "a", "href"

    for element in soup.findAll(tag_name):
        target = element.get(attr_name)
        # Resolve relative links against the page URL before printing
        print(urljoin(url, target))
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core
    and Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`.
    Can perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--db",
                 help="Contaminant db other than Ecoli K12 [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    assert op.exists(fastafile)

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"

    # Two contaminant scans: UniVec (default blast db) and E. coli K12
    vecbedfile = blast([fastafile])
    ecoliurl = "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna"
    ecolifile = opts.db or download(ecoliurl, filename="Ecoli.fasta")
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # Merge nearby hits and hard-mask them in the FASTA
    pipeline = "cat {0} {1}".format(vecbedfile, ecolibedfile)
    pipeline += " | mergeBed -nms -d 100 -i stdin"
    pipeline += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
        fastafile, outfastafile)
    sh(pipeline)

    return tidy([outfastafile])
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core
    and Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`.
    Can perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--db",
                 help="Contaminant db other than Ecoli K12 [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile = args[0]
    assert op.exists(fastafile)

    stem = fastafile.rsplit(".", 1)[0]
    outfastafile = stem + ".masked.fasta"

    # Screen against UniVec first, then against E. coli K12
    vecbedfile = blast([fastafile])
    ecoliurl = "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna"
    ecolifile = opts.db if opts.db else download(ecoliurl, filename="Ecoli.fasta")
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # Combine both hit sets, merge intervals within 100bp, then mask
    stages = [
        "cat {0} {1}".format(vecbedfile, ecolibedfile),
        "mergeBed -nms -d 100 -i stdin",
        "maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(fastafile, outfastafile),
    ]
    sh(" | ".join(stages))

    return tidy([outfastafile])
def download(self, name, base_url, cookies):
    """Download this node's file when its name matches and it has a URL;
    otherwise recursively search the children.

    See also: <https://genome.jgi.doe.gov/portal/help/download.jsf>

    Args:
        name (str, optional): Name of the file. Defaults to None.
    """
    if self.name == name and base_url and self.url:
        # Leaf hit: resolve the relative URL against the portal base and fetch
        target = urljoin(base_url, self.url)
        download(target, filename=name, debug=True, cookies=cookies)
    else:
        # Recurse only into children registered under the requested name
        for child_name, child in self.items():
            if child_name == name:
                child.download(name, base_url, cookies)
    return name
def download_species_phytozome9(species, valid_species, base_url, assembly=False):
    """Download Phytozome v9 annotation (and optionally assembly) for a species.

    Returns a dict with keys "gff" and "cds" (plus "asm" when assembly=True)
    mapping to the downloaded local filenames.
    """
    assert species in valid_species, "{} is not in the species list".format(species)

    surl = urljoin(base_url, species)
    # The release "magic" number is embedded in the readme filename,
    # e.g. Athaliana_167_readme.txt -> 167
    readmes = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = readmes[0].split("_")[1]
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    res = {}
    if assembly:
        res["asm"] = download(urljoin(surl, "assembly/{0}.fa.gz".format(pf)))
    res["gff"] = download(urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf)))
    res["cds"] = download(urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf)))
    return res
def test_download():
    from jcvi.apps.base import download
    from jcvi.apps.vecscreen import ECOLI_URL, UNIVEC_URL

    # (url, explicit filename or None, expected local filename)
    cases = (
        ("http://www.google.com", None, "index.html"),
        (ECOLI_URL, "ecoli.fa.gz", "ecoli.fa.gz"),
        (UNIVEC_URL, "univec.fa.gz", "univec.fa.gz"),
        (UNIVEC_URL, None, "UniVec_Core"),
    )
    for url, filename, expected in cases:
        if filename:
            ret = download(url, filename=filename)
        else:
            ret = download(url)
        assert ret == expected
        remove_if_exists(ret)
def load_GODag():
    """Return a GODag built from the Sequence Ontology OBO file.

    OBO file retrieved from
    http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo
    """
    from jcvi.apps.base import download

    so_file = download(
        "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo",
        debug=False,
    )
    return GODag(so_file)
def download_species_phytozome(species, valid_species, url, assembly=False):
    """Download Phytozome annotation (and optionally assembly) for one species."""
    from os.path import join as urljoin

    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    surl = urljoin(url, species)
    # The release "magic" number is embedded in the readme filename
    readmes = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = readmes[0].split("_")[1]
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))

    if assembly:
        download(asm_url)
    for remote in (ann_url, cds_url):
        download(remote)
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs, fastqfiles = args[0], args[1:]
    libtxt = write_libraries(fastqfiles, aligner=opts.aligner)

    # SSPACE requires getopts.pl which may be missing from the perl install
    download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl")

    sspace_cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    sspace_cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    # Persist the invocation as a launcher script rather than running directly
    write_file("run.sh", sspace_cmd)
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    libtxt = write_libraries(args[1:], aligner=opts.aligner)

    # getopts.pl is a SSPACE prerequisite that some perl installs lack
    download("http://mflib.org/xampp/perl/lib/getopts.pl")

    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    invocation = "perl " + script + " -l {0} -s {1} -T {2}".format(
        libtxt, contigs, opts.cpus)
    runsh = "run.sh"
    write_file(runsh, invocation)
def blast(args):
    """
    %prog blast fastafile

    Run BLASTN against database (default is UniVec_Core).  Output .bed format
    on the vector/contaminant ranges.
    """
    p = OptionParser(blast.__doc__)
    p.add_option("--dist", dest="dist", default=100, type="int",
                 help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db", dest="db", default=None,
                 help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download(
        "ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]

    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    # vecscreen is tuned for UniVec; fall back to megablast for custom dbs
    prog = run_megablast if opts.db else run_vecscreen
    prog(infile=fastafile, outfile=fastablast, db=univec,
         pctid=95, hitlen=50)

    fp = open(fastablast)
    ranges = []
    for row in fp:
        b = BlastLine(row)
        ranges.append((b.query, b.qstart, b.qstop))

    merged_ranges = range_merge(ranges, dist=opts.dist)
    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    for seqid, start, end in merged_ranges:
        # Fix: Python 3 print function (was py2-only `print >> fw`),
        # consistent with the other blast() implementation in this file
        print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)),
              file=fw)

    return bedfile
def download_srr_term(term):
    """Resolve an SRA run ID to its sra-instant FTP URL and download it.

    Returns the local filename of the downloaded .sra file.
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = sra_run_id_re.search(term)
    if m is None:
        logging.error("Incorrect SRA identifier format "
                      "[should be like SRR126150, SRR1001901. "
                      "len(identifier) should be between 9-10 characters]")
        sys.exit()

    # ByRun layout: <prefix>/<prefix+first 3 digits>/<run id>/<run id>.sra
    prefix = m.group(1)
    subprefix = "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
def load_GODag(obo_url: str, prt: Optional[IO] = None) -> "tuple[GODag, str]":
    """
    Load given obo url and returns GODag object.

    Args:
        obo_url (str): URL to the remote OBO file.
        prt (Optional[IO]): IO stream to print verbose information.

    Returns:
        (GODag, str): GODag object that contains the dict, and path to the
        downloaded OBO file.
    """
    from jcvi.apps.base import download

    so_file = download(obo_url, debug=False)

    # Fix: `(GODag, str)` is a tuple expression, not a type annotation;
    # string form "tuple[GODag, str]" is valid for type checkers and is
    # never evaluated at runtime.
    return GODag(so_file, prt=prt), so_file
def validate_term(term):
    """
    Validate an SO term against so.obo

    OBO file retrieved from
    'http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo'
    """
    from jcvi.formats.obo import GODag
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    so_file = download(so_file_url)
    so = GODag(so_file)
    valid_names = so.valid_names
    # Fix: PEP 8 `term not in` instead of `not term in` (E713)
    if term not in valid_names:
        logging.error("Term `{0}` does not exist. Please refer to `{1}`".format(term, so_file_url))
        sys.exit()

    return True
def validate_term(term):
    """
    Validate an SO term against so.obo

    OBO file retrieved from
    'http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo'
    """
    from jcvi.formats.obo import GODag
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"

    # Build the ontology DAG from a freshly downloaded so.obo
    dag = GODag(download(so_file_url))
    if term not in dag.valid_names:
        logging.error(
            "Term `{0}` does not exist. Please refer to `{1}`".format(
                term, so_file_url))
        sys.exit()

    return True
def blast(args):
    """
    %prog blast fastafile

    Run BLASTN against database (default is UniVec_Core).  Output .bed format
    on the vector/contaminant ranges.
    """
    p = OptionParser(blast.__doc__)
    p.add_option("--dist", default=100, type="int",
                 help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db",
                 help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download("ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]
    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    # vecscreen is specific to UniVec; use megablast for any custom db
    aligner = run_megablast if opts.db else run_vecscreen
    aligner(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50)

    # Collect HSP spans, merge those within --dist bp, and emit 0-based BED
    hsps = []
    for row in open(fastablast):
        hit = BlastLine(row)
        hsps.append((hit.query, hit.qstart, hit.qstop))

    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    for seqid, start, end in range_merge(hsps, dist=opts.dist):
        print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw)
    return bedfile
def test_oboreader():
    import os

    from jcvi.apps.base import download
    from jcvi.formats.obo import GODag

    obo_file = download("http://geneontology.org/ontology/go-basic.obo")
    dag = GODag(obo_file, prt=None)

    # First three unique records, ordered by GO item id
    r1, r2, r3 = sorted(set(dag.values()), key=lambda rec: rec.item_id)[:3]

    assert r1.item_id == "GO:0000001"
    assert r1.name == "mitochondrion inheritance"
    assert r2.item_id == "GO:0000002"
    assert r2.namespace == "biological_process"
    assert r3.item_id == "GO:0000003"
    assert tuple(sorted(r3.alt_ids)) == ("GO:0019952", "GO:0050876")

    if os.path.exists(obo_file):
        os.remove(obo_file)
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core
    and Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`.
    Can perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option(
        "--db",
        default=ECOLI_URL,
        help=
        "Contaminant db other than Ecoli K12, will download if file starts with http://, https://, or ftp://",
    )
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    db = opts.db
    assert op.exists(fastafile)

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"
    vecbedfile = blast([fastafile])

    # Remote contaminant dbs are fetched (and gunzipped) before use
    if is_internet_file(db):
        ecolifile = download(db, filename="Ecoli.fasta", handle_gzip=True)
    else:
        ecolifile = db
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    # Concatenate both hit sets, sort, merge within 100bp, then hard-mask
    stages = (
        "cat {0} {1}".format(vecbedfile, ecolibedfile),
        "sort -k1,1 -k2,2n",
        "mergeBed -c 4 -o distinct -d 100 -i stdin",
        "maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
            fastafile, outfastafile),
    )
    sh(" | ".join(stages))
    return tidy([outfastafile])
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    # Single-end mode takes one fastq; paired-end mode takes two
    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Download and unzip Trimmomatic if the jar is not already at --path
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Materialize the bundled Illumina adapter sequences next to the run
    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Phred offset: take the user's value, otherwise guess from the first file
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    # Output prefix: basename without .gz and without the final extension
    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end: one input, one trimmed output
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        # Paired-end: two inputs; each produces a paired and an orphan file
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            # Orphan reads are not wanted: route them to /dev/null
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                                       pairs1, frags1, pairs2, frags2)))

    # Trimming steps, applied in order (see Trimmomatic manual)
    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)
    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)
    cmd += " MINLEN:{0}".format(opts.minlen)
    if offset != 33:
        # Normalize output quality encoding to phred33
        cmd += " TOPHRED33"
    sh(cmd)
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted", "13"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    p.set_downloader()
    opts, args = p.parse_args(args)

    downloader = opts.downloader
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version
    )

    # Make sure we have a valid cookies
    cookies = get_cookies()
    if cookies is None:
        logging.error("Error fetching cookies ... cleaning up")
        FileShredder([directory_listing])
        sys.exit(1)

    # Proceed to use the cookies and download the species list
    try:
        download(
            dlist,
            filename=directory_listing,
            cookies=cookies,
            downloader=downloader,
        )
        g = GlobusXMLParser(directory_listing)
    # NOTE(review): bare `except:` also swallows KeyboardInterrupt/SystemExit;
    # consider narrowing to `except Exception` — left as-is here.
    except:
        logging.error("Error downloading directory listing ... cleaning up")
        FileShredder([directory_listing, cookies])
        sys.exit(1)

    genomes = g.get_genomes()
    valid_species = genomes.keys()
    # Re-render the usage string with the discovered species catalog
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    (species,) = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(
            genomes,
            s,
            valid_species,
            base_url,
            cookies,
            assembly=opts.assembly,
            downloader=downloader,
        )
        if not res:
            logging.error("No files downloaded")
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    TrimVersion = tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=10, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=30, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    # Single-end mode takes one fastq; paired-end mode takes two
    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Download and unzip Trimmomatic if the jar is not already at --path
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path)

    # Write out the bundled Illumina adapter sequences if not present
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Phred offset: take the user's value, otherwise guess from the first file
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    # Output prefix: basename without .gz and without the final extension
    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end: one input, one trimmed output
        cmd += ".TrimmomaticSE"
        cmd += phredflag
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        # Paired-end: two inputs; each produces a paired and an orphan file
        cmd += ".TrimmomaticPE"
        cmd += phredflag
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            # Orphan reads are not wanted: route them to /dev/null
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                                       pairs1, frags1, pairs2, frags2)))

    # Trimming steps, applied in order (see Trimmomatic manual)
    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        # Normalize output quality encoding to phred33
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    TrimVersion = tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic [default: %default]")
    p.add_option(
        "--phred", default=None, choices=phdchoices,
        help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=10, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=30, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    # Single-end mode takes one fastq; paired-end mode takes two
    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Download and unzip Trimmomatic if the jar is not already at --path
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path)

    # Write out the bundled Illumina adapter sequences if not present
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Phred offset: take the user's value, otherwise guess from the first file
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    # Output prefix: basename without .gz and without the final extension
    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end: one input, one trimmed output
        cmd += ".TrimmomaticSE"
        cmd += phredflag
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        # Paired-end: two inputs; each produces a paired and an orphan file
        cmd += ".TrimmomaticPE"
        cmd += phredflag
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            # Orphan reads are not wanted: route them to /dev/null
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                                       pairs1, frags1, pairs2, frags2)))

    # Trimming steps, applied in order (see Trimmomatic manual)
    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        # Normalize output quality encoding to phred33
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)