def ensembl(args):
    """
    %prog ensembl species

    Retrieve genomes and annotations from ensembl FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog ensembl danio_rerio,gasterosteus_aculeatus
    """
    p = OptionParser(ensembl.__doc__)
    p.add_option("--version", default="75", help="Ensembl version")
    opts, args = p.parse_args(args)

    # Root of the requested release; the species listing is read from fasta/.
    url = "ftp://ftp.ensembl.org/pub/release-{0}/".format(opts.version)
    valid_species = []
    for entry in ls_ftp(url + "fasta/"):
        # Entries containing a dot are not treated as species names.
        if "." not in entry:
            valid_species.append(entry)

    # Extend the usage text with the species list (formatted by tile()) so
    # that the help printed below includes it.
    p.set_usage("\n".join((ensembl.__doc__, tile(valid_species))))

    if len(args) != 1:
        sys.exit(not p.print_help())

    # The single positional argument is a comma-separated species list.
    for name in args[0].split(","):
        download_species_ensembl(name, valid_species, url)
def ensembl(args):
    """
    %prog ensembl species

    Retrieve genomes and annotations from ensembl FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog ensembl danio_rerio,gasterosteus_aculeatus
    """
    p = OptionParser(ensembl.__doc__)
    p.add_option(
        "--version",
        default="75",
        help="Ensembl version [default: %default]",
    )
    opts, args = p.parse_args(args)

    release = opts.version
    url = "ftp://ftp.ensembl.org/pub/release-{0}/".format(release)

    # Species are the entries under fasta/ whose names contain no dot.
    listing = ls_ftp(url + "fasta/")
    valid_species = list(filter(lambda entry: "." not in entry, listing))

    # The usage text is the docstring followed by the tiled species list.
    usage = "\n".join((ensembl.__doc__, tile(valid_species)))
    p.set_usage(usage)

    if len(args) != 1:
        sys.exit(not p.print_help())

    requested = args[0].split(",")
    for name in requested:
        download_species_ensembl(name, valid_species, url)
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    opts, args = p.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(opts.version)

    # Top-level entries without a dot in the name are taken as species.
    valid_species = []
    for entry in ls_ftp(url):
        if "." not in entry:
            valid_species.append(entry)

    # Usage text = docstring + tiled species list, shown by print_help() below.
    p.set_usage("\n".join((phytozome.__doc__, tile(valid_species))))

    if len(args) != 1:
        sys.exit(not p.print_help())

    # One positional argument holding a comma-separated list of species.
    for name in args[0].split(","):
        download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
def test_ls_ftp():
    """Check ls_ftp() against the Ensembl release-75 FASTA directory listing."""
    from jcvi.apps.base import ls_ftp

    url = "ftp://ftp.ensembl.org/pub/release-75/fasta/"

    # Keep only the entries whose names contain no dot.
    species_dirs = []
    for entry in ls_ftp(url):
        if "." not in entry:
            species_dirs.append(entry)

    for expected in ("saccharomyces_cerevisiae", "gorilla_gorilla"):
        assert expected in species_dirs
    assert len(species_dirs) == 67
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = p.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(opts.version)
    valid_species = list(filter(lambda entry: "." not in entry, ls_ftp(url)))

    # Usage text = docstring + tiled species list, shown by print_help() below.
    p.set_usage("\n".join((phytozome.__doc__, tile(valid_species))))

    if len(args) != 1:
        sys.exit(not p.print_help())

    # "all" expands to every listed species; the join/split round trip is
    # kept so both paths go through the same comma-splitting.
    requested = args[0]
    if requested == "all":
        requested = ",".join(valid_species)
    requested = requested.split(",")

    # Species keyed by "ID" instead of "Name"; currently none.
    use_IDs = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    use_mRNAs = {
        "Cclementina",
        "Creinhardtii",
        "Csinensis",
        "Fvesca",
        "Lusitatissimum",
        "Mesculenta",
        "Mguttatus",
        "Ppersica",
        "Pvirgatum",
        "Rcommunis",
        "Sitalica",
        "Tcacao",
        "Thalophila",
        "Vcarteri",
        "Vvinifera",
        "Zmays",
    }

    for name in requested:
        gff, fa = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
        if opts.format:
            # Pick the GFF attribute and feature type used to build the BED.
            key = "ID" if name in use_IDs else "Name"
            ttype = "mRNA" if name in use_mRNAs else "gene"
            gff_bed(
                [
                    gff,
                    "--type={}".format(ttype),
                    "--key={}".format(key),
                    "-o",
                    name + ".bed",
                ]
            )
            fasta_format([fa, name + ".cds", r"--sep=|"])
def download_species_ensembl(species, valid_species, url):
    """Download every ``.gz`` file from a species' GTF and CDS directories.

    Args:
        species: species directory name; must be one of ``valid_species``.
        valid_species: collection of accepted species names.
        url: release root under which ``gtf/`` and ``fasta/`` are looked up.

    Raises:
        AssertionError: if ``species`` is not in ``valid_species``.
    """
    assert species in valid_species, "{0} is not in the species list".format(species)

    # Annotation (GTF) first, then the CDS FASTA directory.
    remote_dirs = (
        urljoin(url, "gtf/{0}".format(species)),
        urljoin(url, "fasta/{0}/cds".format(species)),
    )
    for remote_dir in remote_dirs:
        # Collect the full listing before starting any download.
        archives = [name for name in ls_ftp(remote_dir) if name.endswith(".gz")]
        for name in archives:
            download(urljoin(remote_dir, name))
def download_species_ensembl(species, valid_species, url):
    """Fetch the gzipped GTF and CDS files that Ensembl lists for a species.

    ``species`` must be present in ``valid_species`` (checked with an
    assertion). ``url`` is the release root; the ``gtf/<species>`` and
    ``fasta/<species>/cds`` directories under it are listed, and every entry
    ending in ``.gz`` is passed to ``download()``.
    """
    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for folder in (ann_url, cds_url):
        gz_names = list(filter(lambda n: n.endswith(".gz"), ls_ftp(folder)))
        for gz_name in gz_names:
            target = urljoin(folder, gz_name)
            download(target)
def phytozome9(args):
    """
    %prog phytozome9 species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome9 Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    p = OptionParser(phytozome9.__doc__)
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = p.parse_args(args)

    # This entry point is pinned to the v9.0 directory layout.
    version = "9.0"
    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(version)

    # Top-level entries without a dot in the name are taken as species.
    valid_species = [x for x in ls_ftp(url) if "." not in x]

    # Usage text = docstring + tiled species list, shown by print_help() below.
    doc = "\n".join((phytozome9.__doc__, tile(valid_species)))
    p.set_usage(doc)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (species,) = args
    if species == "all":
        species = ",".join(valid_species)
    species = species.split(",")

    for s in species:
        res = download_species_phytozome9(
            s, valid_species, url, assembly=opts.assembly
        )
        if not res:
            # Nothing was downloaded for this species, so there is nothing to
            # format. Skip it: falling through would call .get() on a falsy
            # result and hand missing paths to format_bed_and_cds().
            logging.error("No files downloaded")
            continue

        gff, cdsfa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, cdsfa)
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = p.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(opts.version)

    # Top-level entries without a dot in the name are taken as species.
    valid_species = []
    for entry in ls_ftp(url):
        if "." not in entry:
            valid_species.append(entry)

    # Usage text = docstring + tiled species list, shown by print_help() below.
    p.set_usage("\n".join((phytozome.__doc__, tile(valid_species))))

    if len(args) != 1:
        sys.exit(not p.print_help())

    for name in args[0].split(","):
        gff, fa = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
        if opts.format:
            # Derive <species>.bed from the GFF and <species>.cds from the FASTA.
            gff_bed([gff, "--phytozome", "-o", name + ".bed"])
            fasta_format([fa, name + ".cds", r"--sep=|"])
def download_species_phytozome9(species, valid_species, base_url, assembly=False):
    """Download the Phytozome gene GFF3 and CDS FASTA for one species.

    The file prefix is ``<species>_<magic>``, where ``magic`` is the second
    underscore-separated field of the first ``*_readme.txt`` entry found in
    the species directory.

    Args:
        species: species directory name; must be one of ``valid_species``.
        valid_species: collection of accepted species names.
        base_url: root URL under which the species directory lives.
        assembly: when true, also download the assembly FASTA.

    Returns:
        dict mapping ``"gff"`` and ``"cds"`` (plus ``"asm"`` when ``assembly``
        is true) to whatever ``download()`` returned for that file.

    Raises:
        AssertionError: if ``species`` is not in ``valid_species``.
    """
    assert species in valid_species, "{} is not in the species list".format(species)

    surl = urljoin(base_url, species)
    readmes = [name for name in ls_ftp(surl) if name.endswith("_readme.txt")]
    magic = readmes[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    remote = {
        "asm": urljoin(surl, "assembly/{0}.fa.gz".format(pf)),
        "gff": urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf)),
        "cds": urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf)),
    }

    # Download order: optional assembly, then annotation, then CDS.
    wanted = ("asm", "gff", "cds") if assembly else ("gff", "cds")
    res = {}
    for label in wanted:
        res[label] = download(remote[label])
    return res
def download_species_phytozome(species, valid_species, url, assembly=False):
    """Download the Phytozome gene GFF3 and CDS FASTA for one species.

    The file prefix is ``<species>_<magic>``, where ``magic`` is the second
    underscore-separated field of the first ``*_readme.txt`` entry found in
    the species directory.

    Args:
        species: species directory name; must be one of ``valid_species``.
        valid_species: collection of accepted species names.
        url: root URL under which the species directory lives.
        assembly: when true, also download the assembly FASTA (its result is
            not part of the return value).

    Returns:
        Two-item list ``[gff, cds]`` holding what ``download()`` returned for
        the annotation GFF3 and the CDS FASTA, in that order. Presumably these
        are local file paths -- confirm against ``download()``.

    Raises:
        AssertionError: if ``species`` is not in ``valid_species``.
    """
    from os.path import join as urljoin

    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))

    if assembly:
        download(asm_url)

    # Callers unpack the result as ``gff, fa``; returning nothing made that
    # unpacking fail, so hand back both download results in that order.
    return [download(u) for u in (ann_url, cds_url)]