def _download_minikraken(self, verbose=True):
    """Download the minikraken database (about 4Gb) into the sequana
    config directory, unless a copy with the expected md5 is already there."""
    dv = DevTools()
    # the archive is stored directly under the sequana config directory
    base = sequana_config_path + os.sep + ""
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)
    logger.info("Downloading minikraken (4Gb)")
    filename = base + os.sep + "minikraken.tgz"
    # download only if the file is absent or its checksum does not match
    if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
        logger.warning("%s already present" % filename)
    else:
        wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
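# A minimal usage sketch for the method above. It assumes this method belongs
# to the KrakenDownload class exposed by sequana, and that the public entry
# point is a download() method dispatching on the database name (an assumption
# based on the sequana API, not shown in this fragment):
#
#     from sequana import KrakenDownload
#     kd = KrakenDownload()
#     kd.download("minikraken")   # fetches ~4Gb into the sequana config dir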
def __init__(self):
    dv = DevTools()
    self.base = sequana_config_path + os.sep + "busco"
    dv.mkdir(self.base)
    self.filenames = sorted([
        "bacteria_odb9", "proteobacteria_odb9", "rhizobiales_odb9",
        "betaproteobacteria_odb9", "gammaproteobacteria_odb9",
        "enterobacteriales_odb9", "deltaepsilonsub_odb9",
        "actinobacteria_odb9", "cyanobacteria_odb9", "firmicutes_odb9",
        "clostridia_odb9", "lactobacillales_odb9", "bacillales_odb9",
        "bacteroidetes_odb9", "spirochaetes_odb9", "tenericutes_odb9",
        "eukaryota_odb9", "fungi_odb9", "microsporidia_odb9",
        "dikarya_odb9", "ascomycota_odb9", "pezizomycotina_odb9",
        "eurotiomycetes_odb9", "sordariomyceta_odb9", "saccharomyceta_odb9",
        "saccharomycetales_odb9", "basidiomycota_odb9", "metazoa_odb9",
        "nematoda_odb9", "arthropoda_odb9", "insecta_odb9",
        "endopterygota_odb9", "hymenoptera_odb9", "diptera_odb9",
        "vertebrata_odb9", "actinopterygii_odb9", "tetrapoda_odb9",
        "aves_odb9", "mammalia_odb9", "euarchontoglires_odb9",
        "laurasiatheria_odb9", "embryophyta_odb9", "protists_ensembl",
        "alveolata_stramenophiles_ensembl"])
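# Hedged sketch: a download helper one could add to this class, iterating
# over the data set names above. The URL pattern is an assumption (BUSCO v2
# data sets were historically hosted on busco.ezlab.org; adapt it to the
# mirror you actually use). The wget helper is the same one used elsewhere
# in this module:
#
#     def download(self):
#         for name in self.filenames:
#             url = "http://busco.ezlab.org/v2/datasets/%s.tar.gz" % name
#             wget(url, self.base + os.sep + name + ".tar.gz")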
def _download_kraken_toydb(self, verbose=True):
    """Download the kraken toy DB example from sequana_data into the
    .config/sequana directory.

    Checks the md5 checksums. About 32Mb of data.
    """
    dv = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)
    md5sums = [
        "28661f8baf0514105b0c6957bec0fc6e",
        "97a39d44ed86cadea470352d6f69748d",
        "d91a0fcbbc0f4bbac918755b6400dea6",
        "c8bae69565af2170ece194925b5fdeb9"]
    filenames = [
        "database.idx",
        "database.kdb",
        "taxonomy/names.dmp",
        "taxonomy/nodes.dmp"]
    for filename, md5sum in zip(filenames, md5sums):
        url = baseurl + "kraken_toydb/%s" % filename
        filename = base + os.sep + filename
        if os.path.exists(filename) and md5(filename) == md5sum:
            logger.warning("%s already present" % filename)
        else:
            logger.info("Downloading %s" % url)
            wget(url, filename)
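# The download-or-skip logic above follows a simple checksum pattern that
# could be factored out. A minimal sketch (md5 is the easydev helper already
# used in this module; the function name is illustrative only):
#
#     def _needs_download(filename, expected_md5):
#         """True if the file is absent or its md5 does not match."""
#         return not (os.path.exists(filename) and md5(filename) == expected_md5)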
class KrakenPipeline(object):
    """Used by the standalone application sequana_taxonomy

    This runs Kraken on a set of FastQ files, transforms the results into a
    format compatible with Krona, and creates a Krona HTML report.

    ::

        from sequana import KrakenPipeline
        kt = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb")
        kt.run()
        kt.show()

    .. warning:: We do not provide a Kraken database within sequana. You may
        either download a database from https://ccb.jhu.edu/software/kraken/
        or use this class to download a toy example that will be stored in
        e.g. .config/sequana under Unix platforms. See :class:`KrakenDownload`.

    .. seealso:: We provide a standalone application of this class, which is
        called sequana_taxonomy and can be used within a command shell.

    """
    def __init__(self, fastq, database, threads=4, output_directory="kraken",
                 dbname=None):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: directory in which the results (including
            the Krona HTML page) are stored
        :param dbname: name of the database (defaults to the basename of
            *database*)

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the corresponding
        lineage and scientific names to be stored within a Krona formatted
        file. ktImportText is then used to create the Krona page.

        """
        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        self.ka = KrakenAnalysis(fastq, database, threads)

        if dbname is None:
            self.dbname = os.path.basename(database)
        else:
            self.dbname = dbname

    def run(self, output_filename_classified=None,
            output_filename_unclassified=None, only_classified_output=False):
        """Run the analysis using Kraken and create the Krona output

        .. todo:: reuse the KrakenResults code to simplify this method.

        """
        # Run Kraken (KrakenAnalysis)
        kraken_results = self.output_directory + os.sep + "kraken.out"
        self.ka.run(
            output_filename=kraken_results,
            output_filename_unclassified=output_filename_unclassified,
            output_filename_classified=output_filename_classified,
            only_classified_output=only_classified_output)

        # Translate kraken output to a format understood by Krona and save
        # the pie chart as a png image
        self.kr = KrakenResults(kraken_results)
        df = self.kr.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")

        prefix = self.output_directory + os.sep
        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = self.output_directory + os.sep + "kraken.html"
        status = self.kr.kraken_to_krona(output_filename=prefix + "kraken.out.summary")
        if status is True:
            shell("ktImportText %s -o %s" % (prefix + "kraken.out.summary", kraken_html))
        else:
            shell("touch {}".format(kraken_html))

    def show(self):
        """Opens the Krona HTML page created by :meth:`run`"""
        from easydev import onweb
        # the original code referred to an undefined self.output attribute;
        # open the Krona page generated by run() instead
        onweb(self.output_directory + os.sep + "kraken.html")
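# Usage sketch for the pipeline above (file names are placeholders). After
# run() completes, the output directory holds kraken.out, kraken.html,
# kraken.png, kraken.json and kraken.csv:
#
#     kp = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb",
#                         threads=4, output_directory="kraken")
#     kp.run(output_filename_unclassified="unclassified.fastq")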
class KrakenBuilder():
    """This class will help you build a custom Kraken database

    You will need a few steps and, depending on the FASTA files you want to
    include, lots of resources (memory and space wise). In the following
    example, we will be reasonable and use only viruses FASTA files.

    First, we need to create the data structure directory. Let us call it
    **virusdb**::

        from sequana import KrakenBuilder
        kb = KrakenBuilder("virusdb")

    We then need to download a large taxonomic database from NCBI. You may
    already have a local copy, in which case you would need to copy it into
    the virusdb/taxonomy directory. If not, type::

        kb.download_taxonomy()

    The virusdb/taxonomy directory will contain about 8.5G of data.

    Note that this currently requires the unix tools **wget** and **tar**.

    Then, we need to add some FASTA files. You may download specific FASTA
    files if you know the accession numbers using :meth:`download_accession`.
    However, we also provide a method to download all viruses from ENA::

        kb.download_viruses()

    This will take a while to download the more than 4500 FASTA files (10
    minutes on a good connection). You will end up with a data set of about
    100 Mb of FASTA files.

    If you wish to download other FASTA files (e.g. all bacteria), you will
    need to use another class from :mod:`sequana.databases`::

        from sequana.databases import ENADownload
        ena = ENADownload()
        ena.download_fasta("bacteria.txt", output_dir="virusdb/library/added")

    Please see the documentation for more options and the list of species to
    download.

    It is now time to build the DB itself. This is based on the kraken tool.
    You may do it yourself in a shell::

        kraken-build --rebuild -db virusdb --minimizer-len 10 --max-db-size 4 \
            --threads 4 --kmer-len 26 --jellyfish-hash-size 500000000

    Or use the KrakenBuilder. First you need to look at the :attr:`params`
    attribute. The most important keys/values that affect the size of the DB
    are::

        kb.params['kmer_length']    (max value is 31)
        kb.params['max_db_size']    the max size of the DB files in Gb
        kb.params['minimizer_len']

    To create a small DB quickly, we set those values::

        kb.params['kmer_length'] = 26
        kb.params['minimizer_len'] = 10

    However, for production, we would recommend 31 and 13 (the defaults).

    This takes about 2 minutes to build and the final DB is about 800Mb.

    Lots of useless files are left in the directory and could be removed
    using kraken itself. However, we do a little bit more, and therefore have
    our own cleaning function::

        kb.clean_db()

    kraken-build uses jellyfish. The **hash_size** parameter is the jellyfish
    hash_size parameter. If you set it to 6400M, the memory required is about
    6.9 bytes times 6400M, that is 40Gb of memory. The default value used
    here means 3.5Gb are required.

    The size to store the DB itself should be :math:`sD + 8(4^M)` where **s**
    is about 12 bytes (used to store a kmer/taxon pair), D is the number of
    kmers in the final database (which cannot be estimated beforehand), and M
    is the minimizer length parameter. A small sketch computing these
    estimates is given after this class.

    The quick way
    =============

    ::

        kb = KrakenBuilder("virusdb")
        kb.run(['virus'])  # use only viruses from the ENA list

    Here, you may want to re-run the analysis with different parameters for
    the database built. If you already requested the virus DB, it has been
    downloaded so this step will be skipped. The taxonomy DB does not need to
    be downloaded again either, so set *download_taxon* to False.
    Before, let us change the parameters to build a full database::

        kb.params['kmer_length'] = 31
        kb.params['minimizer_len'] = 13

    Instead of an 800Mb DB, we now have a 1.5Gb DB, but it should take more
    or less the same time to build it.

    Finally, if you do not need to test it anymore, you may clean the DB once
    and for all. This will remove useless files. The directory's name is the
    name of the DB that should be used in e.g. the quality_control pipeline.
    To clean the data directory, type::

        kb.clean_db()

    """
    def __init__(self, dbname):
        """.. rubric:: Constructor

        :param str dbname: Create the Kraken DB in this directory

        """
        # See the databases.py module
        self.dbname = dbname
        self.enadb = ENADownload()
        self.valid_dbs = self.enadb._metadata.keys()

        # mini_kraken uses minimizer-len=13, max_db_size=4, others=default,
        # that is kmer-len=31 and hash-size=default
        self.params = {
            "dbname": self.dbname,
            "minimizer_len": 10,
            "max_db_size": 4,
            "threads": 4,
            "kmer_length": 26,
            "hash_size": 500000000
        }
        self.init()

    def init(self):
        # create the library/, library/added/ and taxonomy/ directories
        self.library_path = self.dbname + os.sep + "library"
        self.taxon_path = self.dbname + os.sep + "taxonomy"
        self.fasta_path = self.library_path + os.sep + "added"

        self._devtools = DevTools()
        self._devtools.mkdir(self.dbname)
        self._devtools.mkdir(self.library_path)
        self._devtools.mkdir(self.fasta_path)
        self._devtools.mkdir(self.taxon_path)

    def download_accession(self, acc):
        """Download a specific FASTA file from ENA given its accession number

        Note that if you want to add a specific FASTA from ENA, you must use
        this function to make sure the header will be understood by Kraken;
        the header must use a GI number (not an ENA one).
        """
        output = self.dbname + os.sep + "library" + os.sep + "added"
        self.enadb.download_accession(acc, output=output)

    def download_viruses(self):
        """Download all virus FASTA files from ENA into the library"""
        self.enadb.download_fasta("virus.txt", output_dir=self.fasta_path)

    def run(self, dbs=[], download_taxon=True):
        """Create the custom Kraken DB

        #. download the taxonomy files (unless *download_taxon* is False)
        #. load the DBs (e.g. viruses)
        #. build the DB with kraken-build
        #. clean it up

        """
        # Start with the FASTA files
        self._download_dbs(dbs)

        # the original code ignored the download_taxon parameter
        if download_taxon:
            self.download_taxonomy()

        # search for the taxon file; if not found, raise an error
        required = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"
        if not os.path.exists(required):
            raise IOError("Taxon file not found")

        print("\nDepending on the input, this step may take a few hours to finish")
        self._build_kraken()

    def download_taxonomy(self, force=False):
        """Download the kraken taxonomy data

        The downloaded file is large (1.3Gb) and the unzipped file is about
        9Gb. If already present, do not download the file except if the
        *force* parameter is set to True.
        """
        FTP = "ftp.ncbi.nih.gov"
        expected_filename = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"
        expected_md5 = "8c182ac2df452d836206ad13275cd8af"

        print('\nDownloading taxonomy files. Takes a while depending on your connection')
        # If the requested file exists with the expected md5, nothing to do
        if force or os.path.exists(expected_filename) is False or \
                md5(expected_filename) != expected_md5:
            # Download the taxonomy. We could use
            # kraken-build --download-taxonomy + a subprocess, but it is even
            # simpler to get the file via ftp
            execute("wget %s/pub/taxonomy/gi_taxid_nucl.dmp.gz --directory-prefix %s"
                    % (FTP, self.taxon_path))
            # Unzip the file
            execute('unpigz %s/gi_taxid_nucl.dmp.gz' % self.taxon_path)
        else:
            print("Found local expected file %s " % expected_filename)

        expected_filename = self.taxon_path + os.sep + "names.dmp"
        expected_md5 = "90d88912ad4c94f6ac07dfab0443da9b"
        if force or os.path.exists(expected_filename) is False or \
                md5(expected_filename) != expected_md5:
            execute("wget %s/pub/taxonomy/taxdump.tar.gz --directory-prefix %s"
                    % (FTP, self.taxon_path))
            execute('tar xvfz %s/taxdump.tar.gz -C %s'
                    % (self.taxon_path, self.taxon_path))
        else:
            print("Found local expected file %s " % expected_filename)

    def _download_dbs(self, dbs=[]):
        print("Downloading all FASTA files for %s" % dbs)
        # Download the requested DBs (self.enadb was created in the
        # constructor; the original code referred to a non-existent self.ena
        # attribute)
        for db in dbs:
            if db not in self.valid_dbs and os.path.exists(db) is False:
                msg = "db must be a local file with a list of ENA accessions or one of"
                for this in self.enadb._metadata.keys():
                    msg += " - %s" % this
                raise ValueError(msg)
            self.enadb.download_fasta(db, output_dir=self.fasta_path)

    def _build_kraken(self):
        print('Building the kraken db ')
        self.params['hash_size'] = int(self.params["hash_size"])

        cmd = """kraken-build --rebuild -db %(dbname)s \
            --minimizer-len %(minimizer_len)s \
            --max-db-size %(max_db_size)s \
            --threads %(threads)s \
            --kmer-len %(kmer_length)s \
            --jellyfish-hash-size %(hash_size)s""" % self.params

        # again, kraken-build prints on stderr so we cannot use
        # easydev.shellcmd
        execute(cmd)

    def clean_db(self):
        """Once called, you will not be able to append more FASTA files"""
        # Now we can clean the kraken db
        print('Cleaning the kraken db ')
        # Clean the nodes.dmp and names.dmp
        print('Identifying the GI numbers')
        gis = self.get_gis()
        taxons = self.get_taxons_from_gis(gis)
        print("")
        self.gis = gis
        self.taxons = taxons

        # This cleans the nodes.dmp and names.dmp. This must be done
        # before kraken-build --clean since it requires the gi_taxid_nucl.dmp
        # file
        names_file = self.taxon_path + os.sep + "names.dmp"
        nodes_file = self.taxon_path + os.sep + "nodes.dmp"
        names_file_temp = self.taxon_path + os.sep + "names_temp.dmp"
        nodes_file_temp = self.taxon_path + os.sep + "nodes_temp.dmp"
        taxon_file_reader = NCBITaxonReader(names=names_file,
                                            nodes=nodes_file, verbose=True)
        print("Filtering")
        taxon_file_reader.filter_nodes_dmp_file(nodes_file, nodes_file_temp,
                                                taxons=taxons)
        taxon_file_reader.filter_names_dmp_file(names_file, names_file_temp,
                                                taxons=taxons)

        # mv the new files into the old ones
        os.rename(names_file_temp, names_file)
        os.rename(nodes_file_temp, nodes_file)

        # Finally, the kraken cleaning itself
        cmd = "kraken-build --clean --db %s" % self.params['dbname']
        execute(cmd)

    def get_gis(self, extensions=['fa']):
        """Extract the GI numbers from all FASTA headers in the library"""
        self.filenames = []
        root = self.dbname
        # scan one and two directory levels below library/
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/*%s" % (root, extension))))
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/**/*%s" % (root, extension))))

        N = len(self.filenames)
        pb = Progress(N)
        gis = []
        for i, filename in enumerate(self.filenames):
            # only the first line (the header) is needed
            with open(filename, "r") as fin:
                line = fin.readline()
            if line.startswith('>'):
                assert "gi" in line, "expected >gi to be found at the beginning"
                gi = line[1:].split("|")[1]
            else:
                raise ValueError(
                    "This file %s does not seem to be a FASTA file" % filename)
            gis.append(gi)
            pb.animate(i + 1)
        print()
        gis = [int(x) for x in gis]
        self.gis = gis
        assert len(gis) == len(self.filenames)
        return gis

    def get_taxons_from_gis(self, gis, filename="gi_taxid_nucl.dmp"):
        """Return the taxon of each GI number by scanning the NCBI mapping"""
        filename = self.taxon_path + os.sep + filename
        data = pd.read_csv(filename, chunksize=1000000, sep='\t', header=None)
        N = 560  # approximate number of chunks; good enough for the progress bar

        local_gis = gis[:]

        # GIs are found in an order that differs from the input gis list, so
        # we need to keep track of the order in which they were found
        found_gis = []
        taxons = [32644] * len(gis)  # 32644 means unidentified

        # We search for the GIs. Once found, we remove them from the local
        # vector and keep going until the vector is empty or there are no
        # more chunks. A good sanity check is that the final local_gis vector
        # should be empty, meaning all GIs have been found. We do not care
        # about the order of the final taxons vector as compared to the GI
        # vector.
        print("Scanning %s to look for %s GI numbers" % (filename, len(gis)))
        pb = Progress(N)
        for i, chunk in enumerate(data):
            chunk.set_index(0, inplace=True)
            # keep only the GIs present in this chunk (reindex replaces the
            # deprecated .ix accessor used originally, same semantics)
            chunk = chunk.reindex(local_gis).dropna()

            # keep the GI and taxon found
            found_gis.extend([int(x) for x in list(chunk.index)])

            # update the remaining GIs and the taxons
            for gi, tax in zip(chunk.index, chunk.values):
                local_gis.remove(gi)
                index = gis.index(gi)
                taxons[index] = tax

            # no need to carry on if all GIs were found
            if len(local_gis) == 0:
                break
            pb.animate(i + 1)
        print("")

        taxons = [int(x) for x in taxons]
        return taxons
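# A small sketch of the size/memory estimates discussed in the KrakenBuilder
# docstring. The constants are those given there (~6.9 bytes per jellyfish
# hash entry, s ~= 12 bytes per kmer/taxon pair); D, the number of kmers in
# the final DB, is not known beforehand, so any value passed in is
# illustrative. These helpers are not part of the original API.

def estimate_kraken_db_size(n_kmers, minimizer_len, bytes_per_pair=12):
    """Approximate DB size in bytes following s*D + 8*(4**M)."""
    return bytes_per_pair * n_kmers + 8 * 4 ** minimizer_len

def estimate_jellyfish_memory(hash_size, bytes_per_entry=6.9):
    """Approximate memory (bytes) required by jellyfish for a given hash size."""
    return bytes_per_entry * hash_size

# Example: hash_size=6400e6 gives ~44e9 bytes, roughly the 40Gb quoted in the
# docstring; the default hash_size of 500000000 gives ~3.5Gb.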