def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser and to set_usage below), so its wording is part
    # of what the program prints.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    parser = OptionParser(phytozome.__doc__)
    parser.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    parser.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    parser.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = parser.parse_args(args)

    # Root of the FTP tree for the requested Phytozome version.
    url_template = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/"
    url = url_template.format(opts.version)

    # Only listing entries without a dot are accepted as species names.
    valid_species = [entry for entry in ls_ftp(url) if "." not in entry]

    # Extend the usage text with the species listing before help may be shown.
    usage = "\n".join((phytozome.__doc__, tile(valid_species)))
    parser.set_usage(usage)

    # Exactly one positional argument (the species spec) is required.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    (requested,) = args
    if requested == "all":
        requested = ",".join(valid_species)

    # Species whose BED records are keyed on "ID" rather than "Name"
    # (currently none).
    use_IDs = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    use_mRNAs = {
        "Cclementina",
        "Creinhardtii",
        "Csinensis",
        "Fvesca",
        "Lusitatissimum",
        "Mesculenta",
        "Mguttatus",
        "Ppersica",
        "Pvirgatum",
        "Rcommunis",
        "Sitalica",
        "Tcacao",
        "Thalophila",
        "Vcarteri",
        "Vvinifera",
        "Zmays",
    }

    for name in requested.split(","):
        # The download always happens; formatting is opt-in via --format.
        gff, fa = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
        if opts.format:
            key = "ID" if name in use_IDs else "Name"
            ttype = "mRNA" if name in use_mRNAs else "gene"
            bedfile = name + ".bed"
            cdsfile = name + ".cds"

            bed_args = [
                gff,
                "--type={}".format(ttype),
                "--key={}".format(key),
                "-o",
                bedfile,
            ]
            gff_bed(bed_args)
            fasta_format([fa, cdsfile, r"--sep=|"])
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser and to set_usage below), so its wording is part
    # of what the program prints.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    parser = OptionParser(phytozome.__doc__)
    parser.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    parser.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    parser.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = parser.parse_args(args)

    # Root of the FTP tree for the requested Phytozome version.
    url_template = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/"
    url = url_template.format(opts.version)

    # Only listing entries without a dot are accepted as species names.
    valid_species = [entry for entry in ls_ftp(url) if "." not in entry]

    # Extend the usage text with the species listing before help may be shown.
    usage = "\n".join((phytozome.__doc__, tile(valid_species)))
    parser.set_usage(usage)

    # Exactly one positional argument (the species spec) is required.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    (requested,) = args
    if requested == "all":
        requested = ",".join(valid_species)

    # Species whose BED records are keyed on "ID" rather than "Name"
    # (currently none).
    use_IDs = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    use_mRNAs = {
        "Cclementina",
        "Creinhardtii",
        "Csinensis",
        "Fvesca",
        "Lusitatissimum",
        "Mesculenta",
        "Mguttatus",
        "Ppersica",
        "Pvirgatum",
        "Rcommunis",
        "Sitalica",
        "Tcacao",
        "Thalophila",
        "Vcarteri",
        "Vvinifera",
        "Zmays",
    }

    for name in requested.split(","):
        # The download always happens; formatting is opt-in via --format.
        gff, fa = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
        if opts.format:
            key = "ID" if name in use_IDs else "Name"
            ttype = "mRNA" if name in use_mRNAs else "gene"
            bedfile = name + ".bed"
            cdsfile = name + ".cds"

            bed_args = [
                gff,
                "--type={}".format(ttype),
                "--key={}".format(key),
                "-o",
                bedfile,
            ]
            gff_bed(bed_args)
            fasta_format([fa, cdsfile, r"--sep=|"])
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser and to set_usage below), so its wording is part
    # of what the program prints.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    parser = OptionParser(phytozome.__doc__)
    parser.add_option(
        "--version",
        default="9.0",
        help="Phytozome version [default: %default]",
    )
    parser.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    parser.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = parser.parse_args(args)

    # Root of the FTP tree for the requested Phytozome version.
    url_template = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/"
    url = url_template.format(opts.version)

    # Only listing entries without a dot are accepted as species names.
    valid_species = [entry for entry in ls_ftp(url) if "." not in entry]

    # Extend the usage text with the species listing before help may be shown.
    usage = "\n".join((phytozome.__doc__, tile(valid_species)))
    parser.set_usage(usage)

    # Exactly one positional argument (the species spec) is required.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    (requested,) = args

    for name in requested.split(","):
        # The download always happens; formatting is opt-in via --format.
        gff, fa = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly
        )
        if opts.format:
            bedfile = name + ".bed"
            cdsfile = name + ".cds"
            # Feature type/key selection is delegated to gff.bed through its
            # --phytozome switch.
            gff_bed([gff, "--phytozome", "-o", bedfile])
            fasta_format([fa, cdsfile, r"--sep=|"])
def format_bed_and_cds(species, gff, cdsfa):
    """Run gff.bed() and fasta.format() to generate BED and CDS files.

    The outputs are named ``<species>.bed`` and ``<species>.cds``. This
    prepares the input files for the MCscan synteny workflow.
    https://github.com/tanghaibao/jcvi/wiki/MCscan-(Python-version)

    Args:
        species (str): Name of the species
        gff (str): Path to the GFF file
        cdsfa (str): Path to the FASTA file
    """
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    # Species whose BED records are keyed on "ID" rather than "Name"
    # (currently none).
    use_IDs = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    use_mRNAs = {
        "Cclementina",
        "Creinhardtii",
        "Csinensis",
        "Fvesca",
        "Lusitatissimum",
        "Mesculenta",
        "Mguttatus",
        "Ppersica",
        "Pvirgatum",
        "Rcommunis",
        "Sitalica",
        "Tcacao",
        "Thalophila",
        "Vcarteri",
        "Vvinifera",
        "Zmays",
    }

    if species in use_IDs:
        key = "ID"
    else:
        key = "Name"

    if species in use_mRNAs:
        ttype = "mRNA"
    else:
        ttype = "gene"

    bedfile = species + ".bed"
    cdsfile = species + ".cds"

    bed_args = [
        gff,
        "--type={}".format(ttype),
        "--key={}".format(key),
        "-o",
        bedfile,
    ]
    gff_bed(bed_args)
    fasta_format([cdsfa, cdsfile, r"--sep=|"])