Example #1
    def _download_minikraken(self, verbose=True):
        """Download the minikraken archive next to the sequana configuration.

        The archive is fetched only when no local copy with the expected md5
        checksum is found.

        :param bool verbose: not used by this method; kept so that existing
            callers keep working.
        """
        # NOTE(review): dv and taxondir are not used below; presumably the
        # target directories were meant to be created through them -- confirm.
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            # Download only when the archive is missing or its checksum differs
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #2
    def _download_minikraken(self, verbose=True):
        """Download the minikraken archive next to the sequana configuration.

        The archive is fetched only when no local copy with the expected md5
        checksum is found.

        :param bool verbose: not used by this method; kept so that existing
            callers keep working.
        """
        # NOTE(review): dv and taxondir are not used below; presumably the
        # target directories were meant to be created through them -- confirm.
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            # Download only when the archive is missing or its checksum differs
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #3
    def __init__(self):

        dv = DevTools()
        self.base = sequana_config_path + os.sep + "busco"
        self.filenames = sorted([
Example #4
    def __init__(self):

        dv = DevTools()
        self.base = sequana_config_path + os.sep + "busco"
        self.filenames = sorted([
            "bacteria_odb9", "proteobacteria_odb9", "rhizobiales_odb9",
            "betaproteobacteria_odb9", "gammaproteobacteria_odb9",
            "enterobacteriales_odb9", "deltaepsilonsub_odb9",
            "actinobacteria_odb9", "cyanobacteria_odb9", "firmicutes_odb9",
            "clostridia_odb9", "lactobacillales_odb9", "bacillales_odb9",
            "bacteroidetes_odb9", "spirochaetes_odb9", "tenericutes_odb9",
            "eukaryota_odb9", "fungi_odb9", "microsporidia_odb9",
            "dikarya_odb9", "ascomycota_odb9", "pezizomycotina_odb9",
            "eurotiomycetes_odb9", "sordariomyceta_odb9",
            "saccharomyceta_odb9", "saccharomycetales_odb9",
            "basidiomycota_odb9", "metazoa_odb9", "nematoda_odb9",
            "arthropoda_odb9", "insecta_odb9", "endopterygota_odb9",
            "hymenoptera_odb9", "diptera_odb9", "vertebrata_odb9",
            "actinopterygii_odb9", "tetrapoda_odb9", "aves_odb9",
            "mammalia_odb9", "euarchontoglires_odb9", "laurasiatheria_odb9",
            "embryophyta_odb9", "protists_ensembl",
Example #5
    # NOTE(review): this snippet does not parse as shown: the docstring below
    # is never terminated, the ``md5sums`` and ``filenames`` list literals are
    # not closed (no checksum value is visible), and the ``if`` inside the
    # loop has no ``else`` so the download would only be attempted for files
    # that are already present with the expected checksum.
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
                logger.info("Downloading %s" % url)
                wget(url, filename)
Example #6
    # NOTE(review): this snippet does not parse as shown: the docstring below
    # is never terminated, the ``md5sums`` and ``filenames`` list literals are
    # not closed (no value is visible), and the ``if`` inside the loop has no
    # ``else`` so the download would only be attempted for files that are
    # already present with the expected checksum.
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
        filenames = [

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
                logger.info("Downloading %s" % url)
                wget(url, filename)
Example #7
class KrakenPipeline(object):
    """Used by the standalone application sequana_taxonomy

    This runs Kraken on a set of FastQ files, transform the results
    in a format compatible for Krona, and creates a Krona HTML report.


        from sequana import KrakenPipeline
        kt = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb")

    .. warning:: We do not provide Kraken database within sequana. You may
        either download a database from https://ccb.jhu.edu/software/kraken/
        or use this class to download a toy example that will
        be stored in e.g .config/sequana under Unix platforms.
        See :class:`KrakenDownload`.

    .. seealso:: We provide a standalone application of this class, which is
        called sequana_taxonomy and can be used within a command shell.

    def __init__(self,
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: output filename of the Krona HTML page
        :param dbname:

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the correponding
        lineage and scientific names to be stored within a Krona formatted file.
        KtImportTex is then used to create the Krona page.

        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self.ka = KrakenAnalysis(fastq, database, threads)

        if dbname is None:
            self.dbname = os.path.basename(database)
            self.dbname = dbname

    def run(self,
        """Run the analysis using Kraken and create the Krona output

        .. todo:: reuse the KrakenResults code to simplify this method.

        # Run Kraken (KrakenAnalysis)
        kraken_results = self.output_directory + os.sep + "kraken.out"


        # Translate kraken output to a format understood by Krona and save png
        # image
        self.kr = KrakenResults(kraken_results)

        df = self.kr.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")

        prefix = self.output_directory + os.sep

        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = self.output_directory + os.sep + "kraken.html"
        status = self.kr.kraken_to_krona(output_filename=prefix +
        if status is True:
            shell("ktImportText %s -o %s" %
                  (prefix + "kraken.out.summary", kraken_html))
            shell("touch {}".format(kraken_html))

    def show(self):
        """Opens the filename defined in the constructor"""
        from easydev import onweb
Example #8
class KrakenBuilder():
    """This class will help you building a custom Kraken database

    You will need a few steps, and depending on the FASTA files you want to
    include lots of resources (memory and space wise). In the following example,
    we will be reasonable and use only viruses FASTA files.

    First, we need to create the data structure directory. Let us call it

        from sequana import KrakenBuilder
        kb = KrakenBuilder("virusdb")

    We then need to download a large taxonomic database from NCBI. You may
    already have a local copy, in which case you would need to copy it in
    virusdb/taxonomy directory. If not, type::


    The virusdb/taxonomy directory will contain about 8.5G of data.

    Note that this currently requires the unix tools **wget** and **tar**.

    Then, we need to add some fasta files. You may download specific FASTA files
    if you know the accession numbers using :meth:`download_accession`. However,
    we also provide a method to download all viruses from ENA::


    This will take a while to download the more than 4500 FASTA files (10
    minutes on a good connection). You will end up with a data set of about 100
    Mb of FASTA files.

    If you wish to download other FASTA (e.g. all bacteria), you will need to
    use another class from the :mod:`sequana.databases`::

        from sequana.databases import ENADownload
        ena = ENADownload()
        ena.download_fasta("bacteria.txt", output_dir="virusdb/library/added")

    Please see the documentation for more options and list of species to

    It is now time to build the DB itself. This is based on the kraken tool.
    You may do it yourself in a shell::

        kraken-build  --rebuild -db virusdb --minimizer-len 10 --max-db-size 4 --threads 4
        --kmer-len 26 --jellyfish-hash-size 500000000

    Or use the KrakenBuilder. First you need to look at the :attr:`params`
    attribute. The most important key/value that affect the size of the DB are::

        kb.params['kmer_length']  (max value is 31)
        kb.params['max_db_size'] is the max size of the DB files in Gb

    To create a small DB quickly, we set those values::

        kb.params['kmer_length']  = 26
        kb.params['minimizer_len'] = 10

    However, for production, we would recommend 31 and 13 (default)

    This takes about 2 minutes to build and the final DB is about 800Mb.

    Lots of useless files are in the directory and can be removed using kraken
    itself. However we do a little bit more and therefore have our own
    cleaning function::


    Kraken-build uses jellyfish. The **hash_size** parameter is the jellyfish
    hash_size parameter. If you set it to 6400M, the memory required is about
    6.9bytes times 6400M that is 40Gb of memory. The default value used here
    means 3.5Gb are required.

    The size to store the DB itself should be


        sD + 8 (4^M)

    where **s** is about 12 bytes (used to store a kmer/taxon pair, D is the
    number of kmer in the final database, which cannot be estimated before
    hand, and M the length minimiser parameter.

    The quick way:

        kb = KrakenBuilder("virusdb")
        kb.run(['virus']) # use only viruses from ENA list

    Here, you may want to re-run the analysis with different parameters
    for the database built. If you require the virus DB, it has been
    downloaded already so this step will be skipped. The Taxon DB does not
    need to be downloaded again, so set download_taxonomy to False.

    Before, let us change the parameter to build a full database::

        kb.params['kmer_length']  = 31
        kb.params['minimizer_len'] = 13

    We have here instead of 800Mb DB a new DB of 1.5Gb but it should
    take more or less the same time to build it

    Finally if you do not need to test it anymore, you may clean the DB once for
    all. This will remove useless files. The directory's name is the name of the
    DB that should be used in e.g. the quality_control pipeline. To clean the
    data directory, type::


    def __init__(self, dbname):
        """.. rubric:: Constructor

        :param str dbname: Create the Kraken DB in this directory

        """
        # See databases.py module
        self.dbname = dbname
        self.enadb = ENADownload()
        self.valid_dbs = self.enadb._metadata.keys()

        # mini_kraken uses minimiser-length = 13, max_db =4, others=default so
        # kmer-len=31 hashsize=default
        self.params = {
            "dbname": self.dbname,
            "minimizer_len": 10,
            "max_db_size": 4,
            "threads": 4,
            "kmer_length": 26,
            "hash_size": 500000000,
        }

        # The other methods read library_path / taxon_path / fasta_path, which
        # only init() sets, so populate them at construction time.
        self.init()


    def init(self):
        """Set the library, taxonomy and FASTA paths below the DB directory.

        Also stores a :class:`DevTools` instance in ``_devtools``.
        """
        root = self.dbname
        library = os.sep.join([root, "library"])

        self.library_path = library
        self.taxon_path = os.sep.join([root, "taxonomy"])
        self.fasta_path = os.sep.join([library, "added"])

        self._devtools = DevTools()

    def download_accession(self, acc):
        """Download a specific FASTA file from ENA given its accession number

        Note that if you want to add specific FASTA from ENA, you must use
        that function to make sure the header will be understood by Kraken;
        The header must use a GI number (not ENA)

        :param acc: ENA accession number, forwarded to the ENA downloader.
        """
        # Files are stored in <dbname>/library/added
        output = self.dbname + os.sep + "library" + os.sep + "added"
        self.enadb.download_accession(acc, output=output)

    def download_viruses(self):
        """Download the FASTA files listed in ``virus.txt`` into ``fasta_path``."""
        fetch = self.enadb.download_fasta
        fetch("virus.txt", output_dir=self.fasta_path)

    def run(self, dbs=None, download_taxon=True):
        """Create the Custom Kraken DB

        #. Download the FASTA files of the requested DBs (e.g. viruses)
        #. Download the taxonomy files (unless *download_taxon* is False)
        #. Build the DB with kraken-build

        Cleaning is not part of this method; see :meth:`clean_db`.

        :param list dbs: items forwarded to :meth:`_download_dbs`. Defaults
            to an empty list.
        :param bool download_taxon: if True, call :meth:`download_taxonomy`
            before building.
        :raises IOError: if ``gi_taxid_nucl.dmp`` is not found in the
            taxonomy directory.
        """
        # A None default avoids sharing one mutable list between calls
        dbs = [] if dbs is None else dbs

        # Start with the FASTA
        self._download_dbs(dbs)

        if download_taxon:
            self.download_taxonomy()

        # search for taxon file. If not found, error
        required = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"

        if required not in glob.glob(self.taxon_path + os.sep + "*"):
            raise IOError("Taxon file not found")

        print(
            "\nDepending on the input, this step may take a few hours to finish")
        self._build_kraken()

    def download_taxonomy(self, force=False):
        """Download kraken data

        The downloaded file is large (1.3Gb) and the unzipped file is about 9Gb.

        If already present, do not download the file except if the *force*
        parameter is set to True.

        :param bool force: download again even when a local file with the
            expected md5 checksum is present.
        """
        # Defined up-front: both downloads need it, whichever branch is taken
        # for the first file.
        FTP = "ftp.ncbi.nih.gov"

        # If the requested file exists, nothing to do
        expected_filename = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"
        expected_md5 = "8c182ac2df452d836206ad13275cd8af"
        print(
            '\nDownloading taxonomy files. Takes a while depending on your connection')

        if force or not os.path.exists(expected_filename) or \
                md5(expected_filename) != expected_md5:
            # download taxonomy
            # We could use kraken-build --download-taxonomy + a subprocess but
            # even simpler to get the file via ftp
            execute(
                "wget %s/pub/taxonomy/gi_taxid_nucl.dmp.gz --directory-prefix %s"
                % (FTP, self.taxon_path))
            # Unzip the files
            execute('unpigz %s/gi_taxid_nucl.dmp.gz' % self.taxon_path)
        else:
            print("Found local expected file %s " % expected_filename)

        expected_filename = self.taxon_path + os.sep + "names.dmp"
        expected_md5 = "90d88912ad4c94f6ac07dfab0443da9b"
        if force or not os.path.exists(expected_filename) or \
                md5(expected_filename) != expected_md5:

            execute(
                "wget %s/pub/taxonomy/taxdump.tar.gz --directory-prefix %s" %
                (FTP, self.taxon_path))

            execute('tar xvfz %s/taxdump.tar.gz -C %s' %
                    (self.taxon_path, self.taxon_path))
        else:
            print("Found local expected file %s " % expected_filename)

    def _download_dbs(self, dbs=None):
        """Download the FASTA files of each requested database.

        :param list dbs: each item must be one of :attr:`valid_dbs` or the
            path of an existing local file; it is forwarded to the ENA
            downloader's ``download_fasta``. Defaults to an empty list.
        :raises ValueError: if an item is neither a valid name nor an
            existing file.
        """
        # A None default avoids sharing one mutable list between calls
        dbs = [] if dbs is None else dbs
        print("Downloading all Fasta files for %s" % dbs)
        # Download the DBs in it
        for db in dbs:
            if db not in self.valid_dbs and os.path.exists(db) is False:
                msg = "db must be a local file with a list of ENA or one of"
                # the downloader is stored as ``enadb`` by the constructor
                msg += "".join(" - %s" % this for this in self.enadb._metadata.keys())
                raise ValueError(msg)
            self.enadb.download_fasta(db, output_dir=self.fasta_path)

    def _build_kraken(self):
        """Build the Kraken database by running ``kraken-build --rebuild``.

        The command line options are filled from :attr:`params`.
        """
        print('Building the kraken db ')
        # Coerce to int so that %s renders an integer even if the user stored
        # a float (e.g. 5e8) in params
        self.params['hash_size'] = int(self.params["hash_size"])

        cmd = """kraken-build  --rebuild -db %(dbname)s \
            --minimizer-len %(minimizer_len)s\
            --max-db-size %(max_db_size)s \
            --threads %(threads)s\
            --kmer-len %(kmer_length)s \
            --jellyfish-hash-size %(hash_size)s""" % self.params

        # again, kraken-build prints on stderr so we cannot use easydev.shellcmd
        # NOTE(review): the command was built but never executed; snakemake's
        # shell is used here as elsewhere in this file -- confirm.
        from snakemake import shell
        shell(cmd)

    # NOTE(review): this method does not parse as shown: the docstring below is
    # never terminated and the ``NCBITaxonReader(`` call is not closed. Nothing
    # visible here writes names_temp.dmp / nodes_temp.dmp before the
    # os.rename calls, and ``cmd`` is built but not executed.
    def clean_db(self):
        """Once called, you will not be able to append more FASTA files

        # Now we can clean the kraken db:
        print('Cleaning the kraken db ')
        # Clean the nodes.dmp and names.dmp
        print('Identifying the GI numbers')
        gis = self.get_gis()
        taxons = self.get_taxons_from_gis(gis)

        self.gis = gis
        self.taxons = taxons

        # This cleans the nodes.dmp and names.dmp. This must be done
        # before kraken-build --clean since it requires the gi_taxid_nucl.dmp
        # file
        names_file = self.taxon_path + os.sep + "names.dmp"
        nodes_file = self.taxon_path + os.sep + "nodes.dmp"
        names_file_temp = self.taxon_path + os.sep + "names_temp.dmp"
        nodes_file_temp = self.taxon_path + os.sep + "nodes_temp.dmp"

        taxon_file_reader = NCBITaxonReader(names=names_file,

        # mv the new files into the old ones
        os.rename(names_file_temp, names_file)
        os.rename(nodes_file_temp, nodes_file)

        # Finally, the kraken cleaning itself
        cmd = "kraken-build --clean --db %s" % self.params['dbname']

    def get_gis(self, extensions=('fa',)):
        """Return the GI numbers found in the FASTA headers of the library.

        Files are searched one and two directory levels below
        ``<dbname>/library`` (``**`` is used without ``recursive=True`` and
        therefore matches a single level). The files found are stored in
        :attr:`filenames` and the GI numbers in :attr:`gis`.

        :param extensions: file name endings to look for.
        :return: list of GI numbers (int), one per file and in the same order
            as :attr:`filenames`.
        :raises ValueError: if the first line of a file does not start
            with ``>``.
        """
        self.filenames = []
        root = self.dbname
        for extension in extensions:
            self.filenames.extend(
                glob.glob("%s/library/**/*%s" % (root, extension)))
        for extension in extensions:
            self.filenames.extend(
                glob.glob("%s/library/**/**/*%s" % (root, extension)))

        N = len(self.filenames)
        pb = Progress(N)
        gis = []
        for i, filename in enumerate(self.filenames):
            # Only the header line is needed; the context manager makes sure
            # the file handle is closed.
            with open(filename, "r") as data:
                line = data.readline()
            if line.startswith('>'):
                assert "gi" in line, "expected >gi to be found at the beginning"
                # header looks like >gi|<number>|...
                gi = line[1:].split("|")[1]
                gis.append(gi)
            else:
                raise ValueError(
                    "This file %s does not seem to be a FASTA file" % filename)
            pb.animate(i + 1)
        gis = [int(x) for x in gis]
        self.gis = gis

        assert len(gis) == len(self.filenames)
        return gis

    def get_taxons_from_gis(self, gis, filename="gi_taxid_nucl.dmp"):
        """Return the taxon identifier of each GI number.

        The table *filename*, located in :attr:`taxon_path`, is read by
        chunks of one million rows (tab-separated, no header). The first
        column is used as the GI number and the second one as the taxon.

        :param list gis: GI numbers (int) to look for.
        :param str filename: name of the GI/taxon table inside the taxonomy
            directory.
        :return: list of taxons (int) in the same order as *gis*; a GI that
            is not found gets 32644 (unidentified).
        """
        filename = self.taxon_path + os.sep + filename
        data = pd.read_csv(filename, chunksize=1000000, sep='\t', header=None)
        N = 560  # with time this number will be deprecated but good for now

        # GIs still to be found; shrinks while the chunks are scanned
        remaining = set(gis)

        # The GIs are found in an order that differs from the input list, so
        # remember every position of each GI (duplicates included).
        positions = {}
        for position, gi in enumerate(gis):
            positions.setdefault(gi, []).append(position)

        taxons = [32644] * len(gis)  # 32644 means unidentified

        print("Scanning %s to look for %s GI numbers" % (filename, len(gis)))
        pb = Progress(N)
        for i, chunk in enumerate(data):
            chunk = chunk.set_index(0)
            # .ix was removed from pandas; keep only the requested GIs that
            # have a taxon in this chunk
            chunk = chunk[chunk.index.isin(remaining)].dropna()

            # update the remaining GIs and the taxons
            for gi, tax in zip(chunk.index, chunk[1]):
                gi = int(gi)
                for position in positions[gi]:
                    taxons[position] = int(tax)
                remaining.discard(gi)

            # no need to carry on if all GIs were found
            if not remaining:
                break
            pb.animate(i + 1)

        return taxons
Example #9
class KrakenPipeline(object):
    """Used by the standalone application sequana_taxonomy

    This runs Kraken on a set of FastQ files, transform the results
    in a format compatible for Krona, and creates a Krona HTML report.


        from sequana import KrakenPipeline
        kt = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb")

    .. warning:: We do not provide Kraken database within sequana. You may
        either download a database from https://ccb.jhu.edu/software/kraken/
        or use this class to download a toy example that will
        be stored in e.g .config/sequana under Unix platforms.
        See :class:`KrakenDownload`.

    .. seealso:: We provide a standalone application of this class, which is
        called sequana_taxonomy and can be used within a command shell.

    def __init__(self, fastq, database, threads=4, output_directory="kraken",
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: output filename of the Krona HTML page
        :param dbname:

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the correponding
        lineage and scientific names to be stored within a Krona formatted file.
        KtImportTex is then used to create the Krona page.

        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self.ka = KrakenAnalysis(fastq, database, threads)

        if dbname is None:
            self.dbname = os.path.basename(database)
            self.dbname = dbname

    def run(self, output_filename_classified=None,
        """Run the analysis using Kraken and create the Krona output

        .. todo:: reuse the KrakenResults code to simplify this method.

        # Run Kraken (KrakenAnalysis)
        kraken_results = self.output_directory + os.sep + "kraken.out"


        # Translate kraken output to a format understood by Krona and save png
        # image
        self.kr = KrakenResults(kraken_results)

        df = self.kr.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")

        prefix = self.output_directory + os.sep

        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = self.output_directory + os.sep + "kraken.html"
        status = self.kr.kraken_to_krona(output_filename=prefix+"kraken.out.summary")
        if status is True:
            shell("ktImportText %s -o %s" % (prefix+"kraken.out.summary", kraken_html))
            shell("touch {}".format(kraken_html))

    def show(self):
        """Opens the filename defined in the constructor"""
        from easydev import onweb