Example #1
0
    def __init__(self, fastq, database, threads=4, output_directory="kraken",
                 dbname=None):
        """.. rubric:: Constructor

        :param fastq: a single FastQ filename or a list of 2 FastQ filenames
        :param database: path to a valid Kraken database
        :param threads: number of threads used by Kraken
        :param output_directory: directory where the Krona HTML page is written
        :param dbname: label used for the results; defaults to the basename
            of *database*

        Internally, once Kraken has performed an analysis, reads are
        associated to a taxon (or not). The corresponding lineage and
        scientific names are then stored within a Krona-formatted file,
        and ktImportText is used to create the Krona page.
        """
        # Create the output directory up front, then delegate the actual
        # analysis to a KrakenAnalysis instance.
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        self.ka = KrakenAnalysis(fastq, database, threads)

        # Fall back to the database basename when no name was given.
        self.dbname = os.path.basename(database) if dbname is None else dbname
Example #2
0
    def __init__(self, organism='H**o sapiens', verbose=True, cache=False):
        """.. rubric:: Constructor

        :param str organism: the organism to look at (default is
            H**o sapiens). Other possible organisms can be found in
            :attr:`organisms`.
        :param str verbose: a verbosity level in ERROR/DEBUG/INFO/WARNING
            compatible with those used in BioServices.
        :param bool cache: use the BioServices cache if True.
        """
        super(Complexes, self).__init__(level=verbose)

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        self.df = self.webserv.search('*', frmt='pandas')

        #: list of valid organisms found in the database
        self.valid_organisms = [
            name.split(';')[0] for name in set(self.df['organismName'])
        ]

        #: list of valid organisms found in the database
        self.organisms = list(set(self.df['organismName']))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # Populated on request and used as a cache/buffer.
        self._details = None
        self._complexes = None
Example #3
0
    def __init__(self, fastq, database, threads=4):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :raises ValueError: if *fastq* is neither a string nor a list

        """
        self._devtools = DevTools()
        self._devtools.check_exists(database)

        self.database = database
        self.threads = threads

        # Fastq input: a single filename means unpaired reads; a list of
        # exactly two filenames means paired-end reads.
        if isinstance(fastq, str):
            self.paired = False
            self.fastq = [fastq]
        elif isinstance(fastq, list):
            self.paired = len(fastq) == 2
            self.fastq = fastq
        else:
            raise ValueError(
                "Expected a fastq filename or list of 2 fastq filenames")

        # Bug fix: validate each FastQ input file; the original loop
        # re-checked the database path instead of ``this``.
        for this in self.fastq:
            self._devtools.check_exists(this)
Example #4
0
    def __init__(self, organism="H**o sapiens", cache=False):
        """.. rubric:: Constructor

        :param str orgamism: the organism to look at. H**o sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.

        """
        self.logging = Logging()

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        df = self.webserv.search("*", frmt="pandas")
        self.df = df

        #: list of valid organisms found in the database
        self.valid_organisms = list(set(df["organismName"]))
        self.valid_organisms = [x.split(";")[0] for x in self.valid_organisms]

        #: list of valid organisms found in the database
        self.organisms = list(set(df["organismName"]))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # This will populated on request as a cache/buffer
        self._details = None
        self._complexes = None
Example #5
0
    def __init__(self, name, url=None, verbose=True, requests_per_sec=10):
        """.. rubric:: Constructor

        :param str name: a name for this service
        :param str url: its URL
        :param bool verbose: prints informative messages if True (default is
            True)
        :param requests_per_sec: maximum number of requests per second.
            If you reach the limit, an error is raised. The reason for this
            limitation is that some services (e.g., NCBI) may black-list
            your IP. If you need or can do more (e.g., ChEMBL does not seem
            to have restrictions), change the value. You can also have
            several instances but again, if you send too many requests at
            the same time, your future requests may be restricted.
            Currently implemented for REST only.

        All instances have an attribute called :attr:`~Service.logging` that
        is an instance of the :mod:`logging` module. It can be used to print
        information, warning, error messages::

            self.logging.info("informative message")
            self.logging.warning("warning message")
            self.logging.error("error message")

        The attribute :attr:`~Service.debugLevel` can be used to set the
        behaviour of the logging messages. If the argument verbose is True,
        the debugLevel is set to INFO. If verbose is False, the debugLevel
        is set to WARNING. However, you can use the :attr:`debugLevel`
        attribute to change it to one of DEBUG, INFO, WARNING, ERROR,
        CRITICAL. debugLevel=WARNING means that only WARNING, ERROR and
        CRITICAL messages are shown.

        """
        super(Service, self).__init__()
        self.requests_per_sec = requests_per_sec
        self.name = name
        self.logging = Logging("bioservices:%s" % self.name, verbose)

        self._url = url
        # An unreachable URL is not fatal: warn and continue so the
        # service object can still be configured offline.
        try:
            if self.url is not None:
                urlopen(self.url)
        except Exception:  # removed unused ``err`` binding
            self.logging.warning("The URL (%s) provided cannot be reached." %
                                 self.url)
        self._easyXMLConversion = True

        # used by HGNC where some XML contains non-utf-8 characters !!
        # should be able to fix it with requests once HGNC works again
        #self._fixing_unicode = False
        #self._fixing_encoding = "utf-8"

        self.devtools = DevTools()
        self.settings = BioServicesConfig()

        # Timestamp of the last request, used for rate limiting.
        self._last_call = 0
Example #6
0
    def init(self):
        """Create the database directory layout: ``<dbname>/library``,
        ``<dbname>/taxonomy`` and ``<dbname>/library/added``."""
        self.library_path = self.dbname + os.sep + "library"
        self.taxon_path = self.dbname + os.sep + "taxonomy"
        self.fasta_path = self.library_path + os.sep + "added"

        self._devtools = DevTools()
        # Create every directory in one pass (parents before children).
        for path in (self.dbname, self.library_path, self.fasta_path,
                     self.taxon_path):
            self._devtools.mkdir(path)
Example #7
0
    def _download_minikraken(self, verbose=True):
        """Download the minikraken database (about 4Gb) into the sequana
        config directory, skipping the download when a copy with the
        expected MD5 checksum is already present.

        :param verbose: unused; kept for backward compatibility
        """
        dv = DevTools()
        # Cleanup: the original computed ``sequana_config_path + os.sep + ""``
        # which only appended a trailing separator; the paths below resolve
        # to the same locations.
        base = sequana_config_path
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        # Download only when the archive is missing or corrupted.
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #8
0
    def __init__(self):
        """Prepare the busco directory under the sequana config path and
        record the sorted list of available BUSCO dataset names."""
        devtools = DevTools()
        self.base = sequana_config_path + os.sep + "busco"
        devtools.mkdir(self.base)

        # All supported BUSCO datasets; stored sorted alphabetically.
        datasets = (
            "bacteria_odb9", "proteobacteria_odb9", "rhizobiales_odb9",
            "betaproteobacteria_odb9", "gammaproteobacteria_odb9",
            "enterobacteriales_odb9", "deltaepsilonsub_odb9",
            "actinobacteria_odb9", "cyanobacteria_odb9", "firmicutes_odb9",
            "clostridia_odb9", "lactobacillales_odb9", "bacillales_odb9",
            "bacteroidetes_odb9", "spirochaetes_odb9", "tenericutes_odb9",
            "eukaryota_odb9", "fungi_odb9", "microsporidia_odb9",
            "dikarya_odb9", "ascomycota_odb9", "pezizomycotina_odb9",
            "eurotiomycetes_odb9", "sordariomyceta_odb9",
            "saccharomyceta_odb9", "saccharomycetales_odb9",
            "basidiomycota_odb9", "metazoa_odb9", "nematoda_odb9",
            "arthropoda_odb9", "insecta_odb9", "endopterygota_odb9",
            "hymenoptera_odb9", "diptera_odb9", "vertebrata_odb9",
            "actinopterygii_odb9", "tetrapoda_odb9", "aves_odb9",
            "mammalia_odb9", "euarchontoglires_odb9", "laurasiatheria_odb9",
            "embryophyta_odb9", "protists_ensembl",
            "alveolata_stramenophiles_ensembl",
        )
        self.filenames = sorted(datasets)
Example #9
0
    def __init__(self, fastq, database, threads=4):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :raises ValueError: if *fastq* is neither a string nor a list

        """
        self._devtools = DevTools()
        self._devtools.check_exists(database)

        self.database = database
        self.threads = threads

        # Fastq input: a single filename means unpaired reads; a list of
        # exactly two filenames means paired-end reads.
        if isinstance(fastq, str):
            self.paired = False
            self.fastq = [fastq]
        elif isinstance(fastq, list):
            self.paired = len(fastq) == 2
            self.fastq = fastq
        else:
            raise ValueError("Expected a fastq filename or list of 2 fastq filenames")

        # Bug fix: validate each FastQ input file; the original loop
        # re-checked the database path instead of ``this``.
        for this in self.fastq:
            self._devtools.check_exists(this)
Example #10
0
    def __init__(self, fastq, database, threads=4, output_directory="kraken",
            dbname=None):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: output directory where the Krona HTML page
            is written
        :param dbname: label used for the results; defaults to the basename
            of *database*

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the corresponding
        lineage and scientific names to be stored within a Krona formatted file.
        ktImportText is then used to create the Krona page.

        """
        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        # Delegate the actual Kraken run to a KrakenAnalysis instance.
        self.ka = KrakenAnalysis(fastq, database, threads)

        # Fall back to the database basename when no name is given.
        if dbname is None:
            self.dbname = os.path.basename(database)
        else:
            self.dbname = dbname
Example #11
0
class Tools(object):
    """Helper wrapping colored terminal output and directory creation."""

    # Shared DevTools instance backing mkdir().
    dv = DevTools()

    def __init__(self, verbose=True):
        self.verbose = verbose

    def _echo(self, colorize, txt, force):
        # Print only when verbose, or when explicitly forced.
        if self.verbose or force is True:
            print(colorize(txt))

    def purple(self, txt, force=False):
        self._echo(purple, txt, force)

    def red(self, txt, force=False):
        self._echo(red, txt, force)

    def green(self, txt, force=False):
        self._echo(green, txt, force)

    def blue(self, txt, force=False):
        self._echo(blue, txt, force)

    def mkdir(self, name):
        self.dv.mkdir(name)
Example #12
0
    def __init__(self,
                 pattern="**/summary.json",
                 output_filename=None,
                 verbose=True,
                 **kargs):
        """.. rubric:: Constructor

        :param str pattern: recursive glob pattern used to find the
            ``summary.json`` files to aggregate
        :param output_filename: filename of the HTML report to create
        :param bool verbose: if False, only warnings are logged
        """
        super().__init__()

        from sequana import logger
        logger.level = "INFO"
        if verbose is False:
            logger.level = "WARNING"

        logger.info(
            "Sequana Summary is still a tool in progress and have been " +
            "  tested with the quality_control pipeline only for now.")
        self.title = "Sequana multiple summary"
        self.devtools = DevTools()

        self.filenames = list(glob.iglob(pattern, recursive=True))
        self.summaries = [ReadSummary(filename) for filename in self.filenames]
        # Perf fix: reuse the ReadSummary objects built above instead of
        # parsing every file a second time.
        self.projects = [summary.data['project'] for summary in self.summaries]
        self.create_report_content()
        self.create_html(output_filename)
Example #13
0
    def __init__(self, name, url=None, verbose=True, requests_per_sec=3):
        """.. rubric:: Constructor

        :param str name: a name for this service
        :param str url: its URL
        :param bool verbose: prints informative messages if True (default is
            True)
        :param requests_per_sec: maximum number of requests per second
            (restricted to 3 by default). If you reach the limit, an error
            is raised. The reason for this limitation is that some services
            (e.g., NCBI) may black-list your IP. If you need or can do more
            (e.g., ChEMBL does not seem to have restrictions), change the
            value. You can also have several instances but again, if you
            send too many requests at the same time, your future requests
            may be restricted. Currently implemented for REST only.

        All instances have an attribute called :attr:`~Service.logging` that
        is an instance of the :mod:`logging` module. It can be used to print
        information, warning, error messages::

            self.logging.info("informative message")
            self.logging.warning("warning message")
            self.logging.error("error message")

        The attribute :attr:`~Service.debugLevel` can be used to set the
        behaviour of the logging messages. If the argument verbose is True,
        the debugLevel is set to INFO. If verbose is False, the debugLevel
        is set to WARNING. However, you can use the :attr:`debugLevel`
        attribute to change it to one of DEBUG, INFO, WARNING, ERROR,
        CRITICAL. debugLevel=WARNING means that only WARNING, ERROR and
        CRITICAL messages are shown.

        """
        super(Service, self).__init__()
        self.requests_per_sec = requests_per_sec

        self.name = name
        self.logging = Logging("bioservices:%s" % self.name, verbose)

        self._url = url
        # An unreachable URL is not fatal: warn and continue so the
        # service object can still be configured offline.
        try:
            if self.url is not None:
                urlopen(self.url)
        except Exception:  # removed unused ``err`` binding
            self.logging.warning("The URL (%s) provided cannot be reached." % self.url)
        self._easyXMLConversion = True

        # used by HGNC where some XML contains non-utf-8 characters !!
        # should be able to fix it with requests once HGNC works again
        #self._fixing_unicode = False
        #self._fixing_encoding = "utf-8"

        self.devtools = DevTools()
        self.settings = BioServicesConfig()
Example #14
0
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        devtools = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        devtools.mkdir(base)
        devtools.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        # Relative filename -> expected MD5 checksum.
        checksums = {
            "database.idx": "28661f8baf0514105b0c6957bec0fc6e",
            "database.kdb": "97a39d44ed86cadea470352d6f69748d",
            "taxonomy/names.dmp": "d91a0fcbbc0f4bbac918755b6400dea6",
            "taxonomy/nodes.dmp": "c8bae69565af2170ece194925b5fdeb9",
        }

        for name, md5sum in checksums.items():
            url = baseurl + "kraken_toydb/%s" % name
            target = base + os.sep + name
            if os.path.exists(target) and md5(target) == md5sum:
                logger.warning("%s already present" % target)
            else:
                logger.info("Downloading %s" % url)
                wget(url, target)
Example #15
0
    def __init__(self):
        """Prepare the busco directory under the sequana config path and
        record the sorted list of available BUSCO dataset names."""
        devtools = DevTools()
        self.base = sequana_config_path + os.sep + "busco"
        devtools.mkdir(self.base)
        # All supported BUSCO datasets; one per line, stored sorted.
        self.filenames = sorted([
            "bacteria_odb9",
            "proteobacteria_odb9",
            "rhizobiales_odb9",
            "betaproteobacteria_odb9",
            "gammaproteobacteria_odb9",
            "enterobacteriales_odb9",
            "deltaepsilonsub_odb9",
            "actinobacteria_odb9",
            "cyanobacteria_odb9",
            "firmicutes_odb9",
            "clostridia_odb9",
            "lactobacillales_odb9",
            "bacillales_odb9",
            "bacteroidetes_odb9",
            "spirochaetes_odb9",
            "tenericutes_odb9",
            "eukaryota_odb9",
            "fungi_odb9",
            "microsporidia_odb9",
            "dikarya_odb9",
            "ascomycota_odb9",
            "pezizomycotina_odb9",
            "eurotiomycetes_odb9",
            "sordariomyceta_odb9",
            "saccharomyceta_odb9",
            "saccharomycetales_odb9",
            "basidiomycota_odb9",
            "metazoa_odb9",
            "nematoda_odb9",
            "arthropoda_odb9",
            "insecta_odb9",
            "endopterygota_odb9",
            "hymenoptera_odb9",
            "diptera_odb9",
            "vertebrata_odb9",
            "actinopterygii_odb9",
            "tetrapoda_odb9",
            "aves_odb9",
            "mammalia_odb9",
            "euarchontoglires_odb9",
            "laurasiatheria_odb9",
            "embryophyta_odb9",
            "protists_ensembl",
            "alveolata_stramenophiles_ensembl",
        ])
Example #16
0
    def _download_minikraken(self, verbose=True):
        """Download the minikraken database (about 4Gb) into the sequana
        config directory unless an up-to-date copy is already there.

        :param verbose: currently unused
        """
        dv = DevTools()
        # NOTE(review): the trailing ``+ ""`` adds nothing but leaves ``base``
        # with a trailing separator -- presumably unintentional; confirm
        # before cleaning up.
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        # Skip the download when the file exists and its MD5 matches.
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #17
0
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data

        :param verbose: currently unused
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        # Checksums aligned index-by-index with ``filenames`` below.
        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        # Fetch each file unless a copy with the expected checksum exists.
        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
Example #18
0
class KrakenPipeline(object):
    """Used by the standalone application sequana_taxonomy

    This runs Kraken on a set of FastQ files, transforms the results
    into a format compatible with Krona, and creates a Krona HTML report.

    ::

        from sequana import KrakenPipeline
        kt = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb")
        kt.run()
        kt.show()

    .. warning:: We do not provide Kraken database within sequana. You may
        either download a database from https://ccb.jhu.edu/software/kraken/
        or use this class to download a toy example that will
        be stored in e.g .config/sequana under Unix platforms.
        See :class:`KrakenDownload`.

    .. seealso:: We provide a standalone application of this class, which is
        called sequana_taxonomy and can be used within a command shell.

    """
    def __init__(self, fastq, database, threads=4, output_directory="kraken",
            dbname=None):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: directory where the Krona HTML page is written
        :param dbname: label used when exporting JSON/CSV results; defaults
            to the basename of *database*

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the corresponding
        lineage and scientific names to be stored within a Krona formatted
        file. ktImportText is then used to create the Krona page.

        """
        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        self.ka = KrakenAnalysis(fastq, database, threads)

        if dbname is None:
            self.dbname = os.path.basename(database)
        else:
            self.dbname = dbname

    def run(self, output_filename_classified=None,
                output_filename_unclassified=None,
                only_classified_output=False):
        """Run the analysis using Kraken and create the Krona output

        .. todo:: reuse the KrakenResults code to simplify this method.

        """
        # Run Kraken (KrakenAnalysis)
        kraken_results = self.output_directory + os.sep + "kraken.out"

        self.ka.run(
            output_filename=kraken_results,
            output_filename_unclassified=output_filename_unclassified,
            output_filename_classified=output_filename_classified,
            only_classified_output=only_classified_output
        )

        # Translate kraken output to a format understood by Krona and save png
        # image
        self.kr = KrakenResults(kraken_results)

        df = self.kr.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")

        prefix = self.output_directory + os.sep

        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = self.output_directory + os.sep + "kraken.html"
        status = self.kr.kraken_to_krona(output_filename=prefix+"kraken.out.summary")
        if status is True:
            shell("ktImportText %s -o %s" % (prefix+"kraken.out.summary", kraken_html))
        else:
            # No Krona-compatible summary could be produced; create an empty
            # page so downstream consumers still find the expected file.
            shell("touch {}".format(kraken_html))

    def show(self):
        """Opens the Krona HTML page created by :meth:`run`"""
        from easydev import onweb
        # Bug fix: ``self.output`` was never defined in the constructor;
        # open the Krona page written into the output directory instead.
        onweb(self.output_directory + os.sep + "kraken.html")
Example #19
0
class Complexes(Logging):
    """Manipulate complexes of Proteins

    This class uses Intact Complex database to extract information about
    complexes of proteins. 

    When creating an instance, the default organism is "H**o sapiens". 
    The organism can be set to another one during the instanciation or later::

        >>> from biokit.network.complexes import Complexes
        >>> c = Complexes(organism='H**o sapiens')
        >>> c.organism = 'Rattus norvegicus'

    Valid organisms can be found in :attr:`organisms`. When changing the
    organism, a request to the Intact database is sent, which may take some
    time to update. Once done, information related to  this organism is stored
    in the :attr:`df` attribute, which is a Pandas dataframe. It 
    contains 4 columns. Here is for example one row::

        complexAC                                             EBI-2660609
        complexName                            COP9 signalosome variant 1
        description     Essential regulator of the ubiquitin (Ubl) con...
        organismName                                   H**o sapiens; 9606

    This is basic information but once a complex accession (e.g., EBI-2660609)
    is known, you can retrieve detailed information. This is done
    automatically for all the accessions when needed. The first time, it will
    take a while (20 seconds for 250 accessions) but will be cached for this
    instance. 

    The :attr:`complexes` contains all details about the entries found in
    :attr:`df`. It is a dictionary where keys are the complex accession. For 
    instance::

        >>> c.complexes['EBI-2660609']

    In general, one is interested in the participants of the complex, that is
    the proteins that form the complex. Another attribute is set for you::

        >>> c.participants['EBI-2660609']

    Finally, you may even want to obtain just the identifier of the participants
    for each complex. This is stored in the :attr:`identifiers`::

        >>> c.identifiers['EBI-2660609']

    Note, however, that the identifiers are not necessarily uniprot identifiers.
    Could be ChEBI or sometimes even set to None. The :meth:`strict_filter`
    removes the complexes with less than 2 (strictly) uniprot identifiers.
    
    Some basic statistics can be printed with :meth:`stats`, which indicates
    the number of complexes, the number of identifiers in those complexes, and
    the number of unique identifiers. A histogram of the number of appearances of each 
    identifier is also shown.

    The :meth:`hist_participants` shows the number of participants per complex.

    Finally, the meth:`search_complexes` can be used in the context of 
    logic modelling to infer the AND gates from a list of uniprot identifiers
    provided by the user. See :meth:`search_complexes` for details.

    Access to the Intact Complex database is performed using the 
    package BioServices provided in Pypi.
    """
    def __init__(self, organism='H**o sapiens', verbose=True, cache=False):
        """.. rubric:: Constructor

        :param str organism: the organism to look at. H**o sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.
        :param str verbose: a verbose level in ERROR/DEBUG/INFO/WARNING
            compatible with those used in BioServices.
        :param bool cache: use the BioServices cache if True

        """
        super(Complexes, self).__init__(level=verbose)

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        df = self.webserv.search('*', frmt='pandas')
        self.df = df

        #: list of valid organisms found in the database
        self.valid_organisms = list(set(df['organismName']))
        self.valid_organisms = [x.split(';')[0] for x in self.valid_organisms]


        #: list of valid organisms found in the database
        self.organisms = list(set(df['organismName']))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # These will be populated on request and used as a cache/buffer.
        self._details = None
        self._complexes = None

    def _get_organism(self):
        return self._organism

    def _set_organism(self, organism):
        # Validate against the species names derived from valid_organisms.
        allowed = [str(x.split(";")[0]) for x in self.valid_organisms]
        self.devtools.check_param_in_list(organism, allowed)
        self._organism = organism

        # Refresh the dataframe restricted to the selected species and
        # invalidate the cached complexes.
        self.df = self.webserv.search('*', frmt='pandas',
                filters='species_f:("%s")' % self.organism)
        self._complexes = None

    organism = property(_get_organism, _set_organism,
        doc="Getter/Setter of the organism")

    def hist_participants(self):
        """Histogram of the number of participants per complexes

        :return: a dictionary with complex identifiers as keys and
            number of participants as values

        ::

            from biokit.network.complexes import Complexes
            c = Complexes()
            c.hist_participants()

        """
        # One property access; the getter returns a fresh copy each time.
        count = {
            accession: len(details['participants'])
            for accession, details in self.complexes.items()
        }
        sizes = list(count.values())

        _ = pylab.hist(sizes, bins=range(0, max(sizes)))
        pylab.title('Number of participants per complex')
        pylab.grid()
        return count

    def stats(self):
        """Prints some stats about the number of complexes and histogram of the
        number of appearances of each species"""
        species = []
        for k in self.participants.keys():
            species.extend([x['identifier'] for x in self.participants[k]])

        # Bug fix: N was (re-)initialised inside the loop above, leaving it
        # undefined when there are no participants; initialise it once here.
        N = []
        for spec in set(species):
            N.append(species.count(spec))
        _ = pylab.hist(N, bins=range(0, max(N)))
        pylab.title("Number of appaerances of each species")
        pylab.grid()
        print("""There are %s complexes involving %s participants with %s unique species. """ %
                 (len(self.complexes), len(species), len(set(species))))

    def _get_participants(self):
        # Map each complex accession to its full participant records.
        return {accession: details['participants']
                for accession, details in self.complexes.items()}

    participants = property(_get_participants, 
        doc="""Getter of the complex participants (full details)""")

    def _get_identifiers(self):
        # Map each complex accession to the identifiers of its participants.
        return {accession: [entry['identifier'] for entry in members]
                for accession, members in self.participants.items()}

    identifiers = property(_get_identifiers,
            doc="""Getter of the identifiers of the complex participants""")

    def _get_complexes(self):
        # Lazily fetch the full details on first access; return a shallow
        # copy so callers cannot mutate the internal cache.
        if self._complexes is None:
            self._load_complexes()
        return self._complexes.copy()
    complexes = property(_get_complexes,
        doc="""Getter of the complexes (full details""")

    def _load_complexes(self, show_progress=True):
        """Fetch and cache the details of every complex accession in ``df``."""
        from easydev import Progress
        import time
        pb = Progress(len(self.df.complexAC))
        self.logging.info("Loading all details from the IntactComplex database")
        fetched = {}
        # One web-service call per accession; this is the slow part.
        for count, accession in enumerate(self.df.complexAC, start=1):
            fetched[accession] = self.webserv.details(accession)
            if show_progress:
                pb.animate(count)
        self._complexes = fetched

    def remove_homodimers(self):
        """Remove identifiers that are None or starts with CHEBI
        and keep complexes that have at least 2 participants

        :return: list of complex identifiers that have been removed.
        """
        # None entries are actually h**o dimers; a complex whose identifier
        # list collapses to at most one real entry is dropped.
        toremove = [
            accession
            for accession, members in self.identifiers.items()
            if len([m for m in members if m is not None]) <= 1
        ]
        self.logging.info("removing %s homodimers complexes" % len(toremove))
        for accession in toremove:
            del self._complexes[accession]
        return toremove

    def search_complexes(self, user_species, verbose=False):
        """Given a list of uniprot identifiers, return complexes and
            possible complexes.

        :param list user_species: list of uniprot identifiers to be
            found in the complexes
        :return: two dictionaries. First one contains the complexes
            for which all participants have been found in the user_species
            list. The second one contains complexes for which some participants
            (not all) have been found in the user_species list.

        """
        level = self.debugLevel[:]
        if verbose:
            self.debugLevel = 'INFO'
        else:
            self.debugLevel = 'ERROR'

        and_gates = {}
        candidates = {}

        identifiers = self.identifiers.values()

        for k, identifiers in self.identifiers.items():

            # get rid of suffixes such as -1 or -PRO_xxx
            prefixes = [x.split("-")[0] if x is not None else x for x in identifiers]


            # You may have a complex with ['P12222', 'P33333-PRO1',
            # 'P33333-PRO2'], in which case P33333 is found only once and
            # thereofre the final number of found participants is not the length
            # of the complexes...so we need to get rid of the duplicates if any
            prefixes = list(set(prefixes))
            N = len(prefixes)
            found = [spec for spec in user_species if spec in prefixes]

            if len(found) == N:
                self.logging.info('Found entire complex %s ' % k)
                and_gates[k] = identifiers[:]
            elif len(found) >= 1:
                self.logging.info('Found partial complex %s with %s participants out of %s' % 
                        (k, len(found), len(identifiers)))
                candidates[k] = {'participants': identifiers, 'found': found}
        self.debugLevel = level[:]
        return and_gates, candidates

    def search(self, name):
        """Search for a unique identifier (e.g. uniprot) in all complexes

        :return: list of complex identifiers where the name was found
        """
        found = []
        for k, identifiers in self.identifiers.items():
            prefixes = [x.split("-")[0] if x is not None else x for x in identifiers ]
            if name in prefixes:
                self.logging.info("Found %s in complex %s (%s)" % (name, k,
                    identifiers))
                found.append(k)
        return found

    def chebi2name(self, name):
        """Return the ASCII name of a CHEBI identifier"""
        from bioservices import ChEBI
        # Query the ChEBI web service and extract the plain-text name
        entity = dict(ChEBI().getLiteEntity(name)[0])
        return entity['chebiAsciiName']

    def uniprot2genename(self, name):
        """Return the gene names of a UniProt identifier

        :return: list of gene names, or None if the lookup fails.
        """
        from bioservices import UniProt
        c = UniProt(cache=True)

        try:
            res = pd.read_csv(StringIO(c.search(name, limit=1)), sep='\t')
            return list(res['Gene names'].values)
        except Exception:
            # A bare except would also trap KeyboardInterrupt/SystemExit;
            # this is a best-effort lookup, so report and return None.
            print("Could not find %s" % name)
            return None

    def report(self, species):
        complete, partial = self.search_complexes(species, verbose=False)
        res = {'Found':[], 'Participants':[], 'Complete':[],
                'Identifier':[], 'Number found':[], 'Number of participants':[],
                'Name':[]}

        for k, v in complete.items():
            res['Name'].append(self.complexes[k]['name'])
            res['Found'].append(";".join(v))
            res['Number found'].append(len(v))
            res['Participants'].append(";".join(self.identifiers[k]))
            res['Number of participants'].append(len(self.identifiers[k]))
            res['Complete'].append(True)
            res['Identifier'].append(k)

        for k, v in partial.items():
            res['Name'].append(self.complexes[k]['name'])
            res['Found'].append(";".join(v['found']))
            res['Number found'].append(len(v['found']))
            res['Participants'].append(";".join(self.identifiers[k]))
            res['Number of participants'].append(len(self.identifiers[k]))
            res['Complete'].append(False)
            res['Identifier'].append(k)

        res = pd.DataFrame(res, columns=['Found', 'Participants', 'Identifier', 'Name', 'Number found', 
            'Number of participants', 'Complete'])
        return res
Example #20
0
def main(args=None):
    """Entry point of the sequana taxonomy standalone.

    :param list args: command-line arguments; defaults to :attr:`sys.argv`.

    Parses the options, optionally updates the local taxonomy database or
    downloads a Kraken database, then runs either :class:`KrakenPipeline`
    (single database) or :class:`KrakenSequential` (several databases) and
    builds the HTML summary report.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.level

    if options.update_taxonomy is True:
        from sequana.taxonomy import Taxonomy
        tax = Taxonomy()
        from sequana import sequana_config_path as cfg
        logger.info(
            "Will overwrite the local database taxonomy.dat in {}".format(cfg))
        tax.download_taxonomic_file(overwrite=True)
        sys.exit(0)

    # We put the import here to make the --help faster
    from sequana import KrakenPipeline
    from sequana.kraken import KrakenSequential
    devtools = DevTools()

    if options.download:
        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download(options.download)
        sys.exit()

    # Collect the input fastq file(s), checking that each one exists
    fastq = []
    if options.file1:
        devtools.check_exists(options.file1)
        fastq.append(options.file1)
    if options.file2:
        devtools.check_exists(options.file2)
        fastq.append(options.file2)

    from sequana import sequana_config_path as scfg
    if options.databases is None:
        logger.critical("You must provide a database")
        sys.exit(1)

    # Resolve aliases, then look the database up in the sequana config
    # path first, falling back to a local path
    databases = []
    for database in options.databases:
        if database == "toydb":
            database = "kraken_toydb"
        elif database == "minikraken":
            database = "minikraken_20141208"

        if os.path.exists(scfg + os.sep + database):  # in Sequana path
            databases.append(scfg + os.sep + database)
        elif os.path.exists(database):  # local database
            databases.append(database)
        else:
            msg = "Invalid database name (%s). Neither found locally "
            msg += "or in the sequana path %s; Use the --download option"
            raise ValueError(msg % (database, scfg))

    output_directory = options.directory + os.sep + "kraken"
    devtools.mkdirs(output_directory)

    # if there is only one database, use the pipeline else KrakenHierarchical
    _pathto = lambda x: "{}/kraken/{}".format(options.directory, x) if x else x
    if len(databases) == 1:
        logger.info("Using 1 database")
        k = KrakenPipeline(fastq,
                           databases[0],
                           threads=options.thread,
                           output_directory=output_directory,
                           confidence=options.confidence)

        k.run(output_filename_classified=_pathto(options.classified_out),
              output_filename_unclassified=_pathto(options.unclassified_out))
    else:
        logger.info("Using %s databases" % len(databases))
        k = KrakenSequential(fastq,
                             databases,
                             threads=options.thread,
                             output_directory=output_directory + os.sep,
                             force=True,
                             keep_temp_files=options.keep_temp_files,
                             output_filename_unclassified=_pathto(
                                 options.unclassified_out),
                             confidence=options.confidence)
        k.run(output_prefix="kraken")

    # This statements sets the directory where HTML will be saved
    from sequana.utils import config
    config.output_dir = options.directory

    # output_directory first argument: the directory where to find the data
    # output_filename is relative to the config.output_dir defined above
    KrakenModule(output_directory, output_filename="summary.html")

    logger.info("Open ./%s/summary.html" % options.directory)
    logger.info("or ./%s/kraken/kraken.html" % options.directory)

    if options.html is True:
        # BUGFIX: this branch used to call ss.onweb() where 'ss' was never
        # defined (NameError). Open the summary report generated above.
        from easydev import onweb
        onweb(options.directory + os.sep + "summary.html")
Example #21
0
# source
# http://nbviewer.ipython.org/github/tritemio/notebooks/blob/master/Mixture_Model_Fitting.ipynb

from easydev import DevTools
devtools = DevTools()
from scipy.optimize import minimize, show_options
import scipy.stats as ss
import numpy as np
import pylab
from easydev import AttrDict

from . import criteria

import numpy as np

# Precomputed constant (1/2)*log(2*pi), used in Gaussian log-density formulas.
half_log_two_pi = 0.5 * np.log(2 * np.pi)


class Model(object):
    """New base model

    Abstract interface for a mixture-model component: subclasses must
    implement :meth:`log_density`, :meth:`estimate` and :meth:`generate`.
    """
    def __init__(self):
        pass

    def log_density(self, data):
        """Return the log-density of *data* under this model (abstract)."""
        raise NotImplementedError

    def estimate(self, data, weights):
        """Fit the model parameters from *data* and sample *weights* (abstract)."""
        raise NotImplementedError

    def generate(self):
        """Draw a sample from the model (abstract)."""
        raise NotImplementedError
Example #22
0
class KrakenAnalysis(object):
    """Run kraken on a set of FastQ files

    In order to run a Kraken analysis, we first need a local database.
    We provide a Toy example. The ToyDB is downloadable as follows (you will
    need to run the following code only once)::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download_kraken_toydb()

    .. seealso:: :class:`KrakenDownload`  for more database and
        :class:`sequana.kraken_builder.KrakenBuilder` to build your own
        databases

    The path to the database is required to run the analysis. It has been
    stored in the directory ./config/sequana/kraken_toydb under Linux platforms
    The following code should be platform independent::

        import os
        from sequana import sequana_config_path
        database = sequana_config_path + os.sep + "kraken_toydb"

    Finally, we can run the analysis on the toy data set::

        from sequana import sequana_data
        data = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
        ka = KrakenAnalysis(data, database=database)
        ka.run()

    This creates a file named *kraken.out*. It can be interpreted with
    :class:`KrakenResults`
    """
    def __init__(self, fastq, database, threads=4):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken

        """
        self._devtools = DevTools()
        # The database directory must exist before anything else
        self._devtools.check_exists(database)

        self.database = database
        self.threads = threads

        # Fastq input: a single filename means unpaired reads; a list is
        # considered paired only when it contains exactly two files
        if isinstance(fastq, str):
            self.paired = False
            self.fastq = [fastq]
        elif isinstance(fastq, list):
            if len(fastq) == 2:
                self.paired = True
            else:
                self.paired = False
            self.fastq = fastq
        else:
            raise ValueError("Expected a fastq filename or list of 2 fastq filenames")

        for this in self.fastq:
            # BUGFIX: this loop used to re-check *database* for every fastq
            # file instead of checking the fastq file itself
            self._devtools.check_exists(this)

    def run(self, output_filename=None, output_filename_classified=None,
            output_filename_unclassified=None, only_classified_output=False):
        """Performs the kraken analysis

        :param str output_filename: if not provided, a temporary file is used
            and stored in :attr:`kraken_output`.
        :param str output_filename_classified: not compressed
        :param str output_filename_unclassified: not compressed
        :param bool only_classified_output: drop unclassified reads from the
            kraken output

        """
        if output_filename is None:
            self.kraken_output = TempFile().name
        else:
            self.kraken_output = output_filename

        params = {
            "database": self.database,
            "thread": self.threads,
            "file1": self.fastq[0],
            "kraken_output": self.kraken_output,
            "output_filename_unclassified": output_filename_unclassified,
            "output_filename_classified": output_filename_classified,
            }

        if self.paired:
            params["file2"] = self.fastq[1]

        # Build the kraken command line incrementally from the options
        command = "kraken -db %(database)s %(file1)s "

        if self.paired:
            command += " %(file2)s --paired"
        command += " --threads %(thread)s --output %(kraken_output)s "
        command += " --out-fmt legacy"

        if output_filename_unclassified:
            command += " --unclassified-out %(output_filename_unclassified)s "

        if only_classified_output is True:
            command += " --only-classified-output"

        if output_filename_classified:
            command += " --classified-out %(output_filename_classified)s "

        command = command % params
        # Somehow there is an error using easydev.execute with pigz
        from snakemake import shell
        shell(command)
Example #23
0
class Service(object):
    """Base class for WSDL and REST classes

    .. seealso:: :class:`REST`, :class:`WSDLService`
    """

    #: some useful response codes
    response_codes = {
        200: 'OK',
        201: 'Created',
        400: 'Bad Request. There is a problem with your input',
        404: 'Not found. The resource you requests does not exist',
        405: 'Method not allowed',
        406: "Not Acceptable. Usually headers issue",
        410: 'Gone. The resource you requested was removed.',
        415: "Unsupported Media Type",
        500: 'Internal server error. Most likely a temporary problem',
        503:
        'Service not available. The server is being updated, try again later'
    }

    def __init__(self, name, url=None, verbose=True, requests_per_sec=10):
        """.. rubric:: Constructor

        :param str name: a name for this service
        :param str url: its URL
        :param bool verbose: prints informative messages if True (default is
            True)
        :param requests_per_sec: maximum number of requests per seconds
            are restricted to 3. You can change that value. If you reach the
            limit, an error is raise. The reason for this limitation is
            that some services (e.g.., NCBI) may black list you IP.
            If you need or can do more (e.g., ChEMBL does not seem to have
            restrictions), change the value. You can also have several instance
            but again, if you send too many requests at the same, your future
            requests may be restricted. Currently implemented for REST only


        All instances have an attribute called :attr:`~Service.logging` that
        is an instance of the :mod:`logging` module. It can be used to print
        information, warning, error messages::

            self.logging.info("informative message")
            self.logging.warning("warning message")
            self.logging.error("error message")

        The attribute :attr:`~Service.debugLevel`  can be used to set the behaviour
        of the logging messages. If the argument verbose is True, the debugLevel
        is set to INFO. If verbose if False, the debugLevel is set to WARNING.
        However, you can use the :attr:`debugLevel` attribute to change it to
        one of DEBUG, INFO, WARNING, ERROR, CRITICAL. debugLevel=WARNING means
        that only WARNING, ERROR and CRITICAL messages are shown.

        """
        super(Service, self).__init__()
        self.requests_per_sec = requests_per_sec
        self.name = name
        self.logging = Logging("bioservices:%s" % self.name, verbose)

        self._url = url
        try:
            # Probe the URL once so unreachable services are reported early
            if self.url is not None:
                urlopen(self.url)
        except Exception:
            self.logging.warning("The URL (%s) provided cannot be reached." %
                                 self.url)
        self._easyXMLConversion = True

        # used by HGNC where some XML contains non-utf-8 characters !!
        # should be able to fix it with requests once HGNC works again
        #self._fixing_unicode = False
        #self._fixing_encoding = "utf-8"

        self.devtools = DevTools()
        self.settings = BioServicesConfig()

        # timestamp of the previous request; 0 means "no request yet"
        self._last_call = 0

    def _calls(self):
        """Rate limiter: sleep so calls stay below :attr:`requests_per_sec`."""
        time_lapse = 1. / self.requests_per_sec
        current_time = time.time()
        dt = current_time - self._last_call

        if self._last_call == 0:
            # First call ever: record the time and do not throttle
            self._last_call = current_time
            return
        else:
            self._last_call = current_time
            if dt > time_lapse:
                return
            else:
                # Too fast: wait for the remainder of the time slot
                time.sleep(time_lapse - dt)

    def _get_caching(self):
        return self.settings.params['cache.on'][0]

    def _set_caching(self, caching):
        self.devtools.check_param_in_list(caching, [True, False])
        self.settings.params['cache.on'][0] = caching
        # reset the session, which will be automatically created if we
        # access to the session attribute
        self._session = None

    CACHING = property(_get_caching, _set_caching)

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        # something more clever here to check the URL e.g. starts with http
        if url is not None:
            url = url.rstrip("/")
            self._url = url

    url = property(_get_url, _set_url, doc="URL of this service")

    def _get_easyXMLConversion(self):
        return self._easyXMLConversion

    def _set_easyXMLConversion(self, value):
        if isinstance(value, bool) is False:
            raise TypeError("value must be a boolean value (True/False)")
        self._easyXMLConversion = value

    easyXMLConversion = property(
        _get_easyXMLConversion,
        _set_easyXMLConversion,
        doc=
        """If True, xml output from a request are converted to easyXML object (Default behaviour)."""
    )

    def easyXML(self, res):
        """Use this method to convert a XML document into an
            :class:`~bioservices.xmltools.easyXML` object

        The easyXML object provides utilities to ease access to the XML
        tag/attributes.

        Here is a simple example starting from the following XML

        .. doctest::

            >>> from bioservices import *
            >>> doc = "<xml> <id>1</id> <id>2</id> </xml>"
            >>> s = Service("name")
            >>> res = s.easyXML(doc)
            >>> res.findAll("id")
            [<id>1</id>, <id>2</id>]

        """
        from bioservices import xmltools
        return xmltools.easyXML(res)

    def __str__(self):
        txt = "This is an instance of %s service" % self.name
        return txt

    def pubmed(self, Id):
        """Open a pubmed Id into a browser tab

        :param Id: a valid pubmed Id in string or integer format.

        The URL is a concatenation of the pubmed URL
        http://www.ncbi.nlm.nih.gov/pubmed/ and the provided Id.

        """
        url = "http://www.ncbi.nlm.nih.gov/pubmed/"
        import webbrowser
        webbrowser.open(url + str(Id))

    def on_web(self, url):
        """Open a URL into a browser"""
        import webbrowser
        webbrowser.open(url)

    def save_str_to_image(self, data, filename):
        """Save string object into a file converting into binary"""
        with open(filename, 'wb') as f:
            import binascii
            try:
                # python3: a str must be encoded before base64-decoding
                newres = binascii.a2b_base64(bytes(data, "utf-8"))
            except TypeError:
                # BUGFIX: was a bare ``except:``; only the bytes/str mismatch
                # (data already bytes, or python2 str) should be handled here
                newres = binascii.a2b_base64(data)
            f.write(newres)
Example #24
0
def scoring(args=None):
    """This function is used by the standalone application called dreamscoring

    ::

        dreamscoring --help

    :param list args: command-line arguments; defaults to :attr:`sys.argv`
    """
    d = DevTools()

    # Default to the arguments of the current process
    if args is None:
        args = sys.argv[:]
    user_options = Options(prog="dreamtools")

    # No argument provided: show the help message and exit
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    if options.version is True:
        print("%s" % dreamtools.version)
        sys.exit()

    # Check on the challenge name
    if options.challenge is None:
        print_color('--challenge must be provided', red)
        sys.exit()
    else:
        # Challenge names are registered upper-case, except the 'dot' token
        options.challenge = options.challenge.upper()
        options.challenge = options.challenge.replace('DOT', 'dot')

        from dreamtools.admin.download_data import get_challenge_list
        if options.challenge not in get_challenge_list():
            print_color("This challenge %s is not registered in dreamtools." %
                    options.challenge, red)
            print("Here is the list of registered challenges: " +
                ", ".join(get_challenge_list()))
            sys.exit()

    # Check that the challenge can be loaded
    class_inst = get_challenge(options.challenge)
    try:
        this = class_inst.import_scoring_class()
    except NotImplementedError as err:
        print("\n"+str(err))
        sys.exit()
    else:
        # User may just request some information about the challenge.
        if options.info is True:
            print(this)
            sys.exit()
        elif options.onweb is True:
            this.onweb()
            sys.exit()

    # Checks name of the sub-challenges
    subchallenges = get_subchallenges(options.challenge)

    # A sub-challenge name is mandatory when the challenge defines some
    if len(subchallenges) and options.sub_challenge is None:
        txt = "This challenge requires a sub challenge name. "
        txt += "Please use --sub-challenge followed by one value in %s " % subchallenges
        print_color(txt, red)
        sys.exit(0)

    if options.sub_challenge is not None and len(subchallenges) != 0:
        try:
            d.check_param_in_list(options.sub_challenge, subchallenges)
        except ValueError as err:
            txt = "DREAMTools error: unknown sub challenge or not implemented"
            txt += "--->" + str(err)
            print_color(txt, red)
            sys.exit()

    # maybe users just need a template
    if options.download_template is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_template())
        else:
            print(class_inst.download_template(options.sub_challenge))
        return

    # similarly for the gold standard
    if options.download_goldstandard is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_goldstandard())
        else:
            print(class_inst.download_goldstandard(options.sub_challenge))
        return

    # finally, we need a submission
    if options.filename is None:
        txt = "---> filename not provided. You must provide a filename with correct format\n"
        txt += "You may get a template using --download-template \n"
        txt += "Alternatively, you can user either --info or --onweb option to get information about the challenge.\n"
        txt += "https://github.com/dreamtools/dreamtools, or http://dreamchallenges.org\n"
        print_color(txt, red)
        sys.exit()

    # filename
    # filename in general is a single string but could be a list of filenames
    # Because on the parser, we must convert the string into a single string
    # if the list has a length of 1
    for filename in options.filename:
        if os.path.exists(filename) is False:
            raise IOError("file %s does not seem to exists" % filename)
    if len(options.filename) == 1:
        options.filename = options.filename[0]

    print_color("DREAMTools scoring", purple, underline=True)
    print('Challenge %s (sub challenge %s)\n\n' % (options.challenge,
        options.sub_challenge))

    # Delegate the actual scoring to the challenge-specific implementation
    res = generic_scoring(options.challenge,
            options.filename,
            subname=options.sub_challenge,
            goldstandard=options.goldstandard)

    txt = "Solution for %s in challenge %s" % (options.filename,
            options.challenge)

    if options.sub_challenge is not None:
        txt += " (sub-challenge %s)" % options.sub_challenge
    txt += " is :\n"

    # Pretty-print each metric of the result dictionary, sorted by key
    for k in sorted(res.keys()):
        txt += darkgreen("     %s:\n %s\n" %(k, res[k]))
    print(txt)
Example #25
0
class KrakenAnalysis(object):
    """Run kraken on a set of FastQ files

    In order to run a Kraken analysis, we first need a local database.
    We provide a Toy example. The ToyDB is downloadable as follows (you will
    need to run the following code only once)::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download_kraken_toydb()

    .. seealso:: :class:`KrakenDownload`  for more database and
        :class:`sequana.kraken_builder.KrakenBuilder` to build your own
        databases

    The path to the database is required to run the analysis. It has been
    stored in the directory ./config/sequana/kraken_toydb under Linux platforms
    The following code should be platform independent::

        import os
        from sequana import sequana_config_path
        database = sequana_config_path + os.sep + "kraken_toydb"

    Finally, we can run the analysis on the toy data set::

        from sequana import sequana_data
        data = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
        ka = KrakenAnalysis(data, database=database)
        ka.run()

    This creates a file named *kraken.out*. It can be interpreted with
    :class:`KrakenResults`
    """
    def __init__(self, fastq, database, threads=4):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken

        """
        self._devtools = DevTools()
        # The database directory must exist before anything else
        self._devtools.check_exists(database)

        self.database = database
        self.threads = threads

        # Fastq input: a single filename means unpaired reads; a list is
        # considered paired only when it contains exactly two files
        if isinstance(fastq, str):
            self.paired = False
            self.fastq = [fastq]
        elif isinstance(fastq, list):
            if len(fastq) == 2:
                self.paired = True
            else:
                self.paired = False
            self.fastq = fastq
        else:
            raise ValueError(
                "Expected a fastq filename or list of 2 fastq filenames")

        for this in self.fastq:
            # BUGFIX: this loop used to re-check *database* for every fastq
            # file instead of checking the fastq file itself
            self._devtools.check_exists(this)

    def run(self,
            output_filename=None,
            output_filename_classified=None,
            output_filename_unclassified=None,
            only_classified_output=False):
        """Performs the kraken analysis

        :param str output_filename: if not provided, a temporary file is used
            and stored in :attr:`kraken_output`.
        :param str output_filename_classified: not compressed
        :param str output_filename_unclassified: not compressed
        :param bool only_classified_output: drop unclassified reads from the
            kraken output

        """
        if output_filename is None:
            self.kraken_output = TempFile().name
        else:
            self.kraken_output = output_filename

        params = {
            "database": self.database,
            "thread": self.threads,
            "file1": self.fastq[0],
            "kraken_output": self.kraken_output,
            "output_filename_unclassified": output_filename_unclassified,
            "output_filename_classified": output_filename_classified,
        }

        if self.paired:
            params["file2"] = self.fastq[1]

        # Build the kraken command line incrementally from the options
        command = "kraken -db %(database)s %(file1)s "

        if self.paired:
            command += " %(file2)s --paired"
        command += " --threads %(thread)s --out %(kraken_output)s"

        if output_filename_unclassified:
            command += " --unclassified-out %(output_filename_unclassified)s "

        if only_classified_output is True:
            command += " --only-classified-output"

        if output_filename_classified:
            command += " --classified-out %(output_filename_classified)s "

        command = command % params
        # Somehow there is an error using easydev.execute with pigz
        from snakemake import shell
        shell(command)
Example #26
0
class KrakenPipeline(object):
    """Used by the standalone application sequana_taxonomy

    This runs Kraken on a set of FastQ files, transform the results
    in a format compatible for Krona, and creates a Krona HTML report.

    ::

        from sequana import KrakenPipeline
        kt = KrakenPipeline(["R1.fastq.gz", "R2.fastq.gz"], database="krakendb")
        kt.run()
        kt.show()

    .. warning:: We do not provide Kraken database within sequana. You may
        either download a database from https://ccb.jhu.edu/software/kraken/
        or use this class to download a toy example that will
        be stored in e.g .config/sequana under Unix platforms.
        See :class:`KrakenDownload`.

    .. seealso:: We provide a standalone application of this class, which is
        called sequana_taxonomy and can be used within a command shell.

    """
    def __init__(self,
                 fastq,
                 database,
                 threads=4,
                 output_directory="kraken",
                 dbname=None):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: output filename of the Krona HTML page
        :param dbname: name used for the JSON/CSV summaries; defaults to the
            basename of *database*

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the corresponding
        lineage and scientific names to be stored within a Krona formatted file.
        KtImportTex is then used to create the Krona page.

        """
        # Set and create output directory
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        self.ka = KrakenAnalysis(fastq, database, threads)

        if dbname is None:
            self.dbname = os.path.basename(database)
        else:
            self.dbname = dbname

    def run(self,
            output_filename_classified=None,
            output_filename_unclassified=None,
            only_classified_output=False):
        """Run the analysis using Kraken and create the Krona output

        .. todo:: reuse the KrakenResults code to simplify this method.

        """
        # Run Kraken (KrakenAnalysis)
        kraken_results = self.output_directory + os.sep + "kraken.out"

        self.ka.run(output_filename=kraken_results,
                    output_filename_unclassified=output_filename_unclassified,
                    output_filename_classified=output_filename_classified,
                    only_classified_output=only_classified_output)

        # Translate kraken output to a format understood by Krona and save png
        # image
        self.kr = KrakenResults(kraken_results)

        # plot() draws the figure as a side effect; its return value is unused
        self.kr.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")

        prefix = self.output_directory + os.sep

        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = self.output_directory + os.sep + "kraken.html"
        status = self.kr.kraken_to_krona(output_filename=prefix +
                                         "kraken.out.summary")
        if status is True:
            shell("ktImportText %s -o %s" %
                  (prefix + "kraken.out.summary", kraken_html))
        else:
            # No Krona data: create an empty placeholder so downstream
            # reports can still rely on the file being present
            shell("touch {}".format(kraken_html))

    def show(self):
        """Open the Krona HTML page created by :meth:`run` in a browser"""
        from easydev import onweb
        # BUGFIX: self.output was never set anywhere (AttributeError);
        # open the Krona page generated in the output directory instead
        onweb(self.output_directory + os.sep + "kraken.html")
Example #27
0
def main(args=None):
    """Entry point of the sequana kraken standalone.

    :param args: command-line arguments; defaults to ``sys.argv``.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help (parse_args exits)
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.level

    # We put the import here to make the --help faster
    from sequana import KrakenPipeline
    from sequana.kraken import KrakenHierarchical
    devtools = DevTools()

    if options.download:
        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download(options.download)
        sys.exit()

    # Collect the input FASTQ file(s); at most two (paired-end data)
    fastq = []
    if options.file1:
        devtools.check_exists(options.file1)
        fastq.append(options.file1)
    if options.file2:
        devtools.check_exists(options.file2)
        fastq.append(options.file2)

    from sequana import sequana_config_path as scfg
    if options.databases is None:
        _log.critical("You must provide a database")
        sys.exit(1)

    # Resolve each requested database either inside the sequana config
    # path or as a local directory
    databases = []
    for database in options.databases:
        # aliases for the two standard databases
        if database == "toydb":
            database = "kraken_toydb"
        elif database == "minikraken":
            database = "minikraken_20141208"

        if os.path.exists(scfg + os.sep + database): # in Sequana path
            databases.append(scfg + os.sep + database)
        elif os.path.exists(database): # local database
            databases.append(database)
        else:
            msg = "Invalid database name (%s). Neither found locally "
            msg += "or in the sequana path %s; Use the --download option"
            raise ValueError(msg % (database, scfg))

    output_directory = options.directory + os.sep + "kraken"
    devtools.mkdirs(output_directory)

    # if there is only one database, use the pipeline else KrakenHierarchical
    if len(databases) == 1:
        _log.info("Using 1 database")
        k = KrakenPipeline(fastq, databases[0], threads=options.thread,
            output_directory=output_directory)

        _pathto = lambda x: "{}/kraken/{}".format(options.directory, x) if x else x
        k.run(output_filename_classified=_pathto(options.classified_out),
              output_filename_unclassified=_pathto(options.unclassified_out))
    else:
        _log.info("Using %s databases" % len(databases))
        k = KrakenHierarchical(fastq, databases, threads=options.thread,
            output_directory=output_directory+os.sep, force=True,
            keep_temp_files=options.keep_temp_files)
        k.run(output_prefix="kraken")

    # This statements sets the directory where HTML will be saved
    from sequana.utils import config
    config.output_dir = options.directory

    # output_directory first argument: the directory where to find the data
    # output_filename is relative to the config.output_dir defined above
    kk = KrakenModule(output_directory, output_filename="summary.html")

    _log.info("Open ./%s/summary.html" % options.directory)
    _log.info("or ./%s/kraken/kraken.html" % options.directory)

    if options.html is True:
        # NOTE(review): the original called ``ss.onweb()`` but *ss* was never
        # defined (NameError). Open the summary page created above instead.
        from easydev import onweb
        onweb(options.directory + os.sep + "summary.html")
Example #28
0
def scoring(args=None):
    """This function is used by the standalone application called dreamscoring

    ::

        dreamscoring --help

    :param args: command-line arguments; defaults to ``sys.argv``.
    """
    d = DevTools()

    if args is None:
        args = sys.argv[:]
    user_options = Options(prog="dreamtools")

    # No argument at all: show the help (parse_args exits afterwards)
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    if options.version is True:
        print("%s" % dreamtools.version)
        sys.exit()

    # Check on the challenge name
    if options.challenge is None:
        print_color('--challenge must be provided', red)
        sys.exit()
    else:
        # Challenge names are registered upper-case, except a literal 'dot'
        options.challenge = options.challenge.upper()
        options.challenge = options.challenge.replace('DOT', 'dot')

        from dreamtools.admin.download_data import get_challenge_list
        if options.challenge not in get_challenge_list():
            print_color(
                "This challenge %s is not registered in dreamtools." %
                options.challenge, red)
            print("Here is the list of registered challenges: " +
                  ", ".join(get_challenge_list()))
            sys.exit()

    # Check that the challenge can be loaded
    class_inst = get_challenge(options.challenge)
    try:
        this = class_inst.import_scoring_class()
    except NotImplementedError as err:
        print("\n" + str(err))
        sys.exit()
    else:
        # User may just request some information about the challenge.
        if options.info is True:
            print(this)
            sys.exit()
        elif options.onweb is True:
            this.onweb()
            sys.exit()

    # Checks name of the sub-challenges
    subchallenges = get_subchallenges(options.challenge)

    if len(subchallenges) and options.sub_challenge is None:
        txt = "This challenge requires a sub challenge name. "
        txt += "Please use --sub-challenge followed by one value in %s " % subchallenges
        print_color(txt, red)
        sys.exit(0)

    if options.sub_challenge is not None and len(subchallenges) != 0:
        try:
            # raises ValueError if the sub-challenge is not in the list
            d.check_param_in_list(options.sub_challenge, subchallenges)
        except ValueError as err:
            txt = "DREAMTools error: unknown sub challenge or not implemented"
            txt += "--->" + str(err)
            print_color(txt, red)
            sys.exit()

    # maybe users just need a template
    if options.download_template is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_template())
        else:
            print(class_inst.download_template(options.sub_challenge))
        return

    # similary for the GS
    if options.download_goldstandard is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_goldstandard())
        else:
            print(class_inst.download_goldstandard(options.sub_challenge))
        return

    # finally, we need a submission
    if options.filename is None:
        txt = "---> filename not provided. You must provide a filename with correct format\n"
        txt += "You may get a template using --download-template \n"
        txt += "Alternatively, you can user either --info or --onweb option to get information about the challenge.\n"
        txt += "https://github.com/dreamtools/dreamtools, or http://dreamchallenges.org\n"
        print_color(txt, red)
        sys.exit()

    # filename
    # filename in general is a single string but could be a list of filenames
    # Because on the parser, we must convert the string into a single string
    # if the list has a length of 1
    for filename in options.filename:
        if os.path.exists(filename) is False:
            raise IOError("file %s does not seem to exists" % filename)
    if len(options.filename) == 1:
        options.filename = options.filename[0]

    print_color("DREAMTools scoring", purple, underline=True)
    print('Challenge %s (sub challenge %s)\n\n' %
          (options.challenge, options.sub_challenge))

    # generic_scoring returns a dict-like mapping of metric name -> value
    res = generic_scoring(options.challenge,
                          options.filename,
                          subname=options.sub_challenge,
                          goldstandard=options.goldstandard)

    txt = "Solution for %s in challenge %s" % (options.filename,
                                               options.challenge)

    if options.sub_challenge is not None:
        txt += " (sub-challenge %s)" % options.sub_challenge
    txt += " is :\n"

    for k in sorted(res.keys()):
        txt += darkgreen("     %s:\n %s\n" % (k, res[k]))
    print(txt)
Example #29
0
def scoring(args=None):
    """This function is used by the standalone application called dreamscoring

    ::

        dreamscoring --help

    :param args: command-line arguments; defaults to ``sys.argv``.
    """
    d = DevTools()

    # identity check with `is None` (PEP 8), not `== None`
    if args is None:
        args = sys.argv[:]
    user_options = Options(prog="dreamtools")

    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    # Check on the challenge name
    if options.challenge is None:
        print_color("--challenge and --sub-challenge must be provided", red)
        sys.exit()
    else:
        options.challenge = options.challenge.upper()
        options.challenge = options.challenge.replace("DOT", "dot")

    # Check that the challenge can be loaded
    class_inst = get_challenge(options.challenge)
    try:
        class_inst.import_scoring_class()
    except NotImplementedError as err:
        # str(err) works on Python 2 and 3; err.message was removed in py3
        print("\n" + str(err))
        sys.exit()

    # Checks name of the sub-challenges
    subchallenges = get_subchallenges(options.challenge)

    if len(subchallenges) and options.sub_challenge is None:
        txt = "This challenge requires a sub challenge name."
        txt += "Please provide one amongst %s " % subchallenges
        print_color(txt, red)
        sys.exit(0)

    if options.sub_challenge is not None and len(subchallenges) != 0:
        try:
            d.check_param_in_list(options.sub_challenge, subchallenges)
        except ValueError as err:
            txt = "DreamTools error: unknown sub challenge or not implemented"
            # str(err), not err.message (py3 compatibility)
            txt += "--->" + str(err)
            print_color(txt, red)
            sys.exit()

    # maybe the user just needs a template
    if options.download_template is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_template())
        else:
            print(class_inst.download_template(options.sub_challenge))
        return

    # similary for the GS
    if options.download_goldstandard is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        if options.sub_challenge is None:
            print(class_inst.download_goldstandard())
        else:
            print(class_inst.download_goldstandard(options.sub_challenge))
        return

    if options.filename is None:
        txt = "---> filename not provided. You must provide a filename with correct format\n"
        txt += "You may get a template using --download-template option\n"
        txt += "https://github.com/dreamtools/dreamtools, or http://dreamchallenges.org\n"
        print_color(txt, red)
        sys.exit()

    # filename
    # filename in general is a single string but could be a list of filenames
    # Because on the parser, we must convert the string into a single string
    # if the list has a length of 1
    for filename in options.filename:
        if os.path.exists(filename) is False:
            raise IOError("file %s does not seem to exists" % filename)
    if len(options.filename) == 1:
        options.filename = options.filename[0]

    print_color("Dreamtools scoring", purple, underline=True)
    print("Challenge %s (sub challenge %s)\n\n" % (options.challenge, options.sub_challenge))

    res = "??"

    # D8C1 has dedicated scoring functions; everything else goes through
    # the generic scorer
    if options.challenge == "D8C1":
        if options.sub_challenge == "sc1a":
            res = d8c1_sc1a(options.filename, verbose=options.verbose)
        elif options.sub_challenge == "sc1b":
            res = d8c1_sc1b(options.filename, verbose=options.verbose)
        elif options.sub_challenge == "sc2a":
            res = d8c1_sc2a(options.filename, verbose=options.verbose)
        elif options.sub_challenge == "sc2b":
            res = d8c1_sc2b(options.filename, verbose=options.verbose)
    else:
        res = generic_scoring(
            options.challenge, options.filename, subname=options.sub_challenge, goldstandard=options.goldstandard
        )

    txt = "Solution for %s in challenge %s" % (options.filename, options.challenge)
    if options.sub_challenge is not None:
        txt += " (sub-challenge %s)" % options.sub_challenge
    txt += " is :\n"

    for k in sorted(res.keys()):
        txt += darkgreen("     %s:\n %s\n" % (k, res[k]))

    print(txt)
Example #30
0
class KrakenDownload(object):
    """Utility to download Kraken DB and place them in a local directory

    ::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download('toydb')
        kd.download('minikraken')

    A large database (8Gb) is available on synapse and has the following DOI::

        doi:10.7303/syn6171000

    It can be downloaded manually or if you have a Synapse login
    (https://www.synapse.org), you can use::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download("sequana_db1")
    """
    # shared helper used to create the output directories
    dv = DevTools()

    def download(self, name, verbose=True):
        """Download a database by name.

        :param str name: one of 'toydb', 'minikraken', 'sequana_db1'
        :param bool verbose: forwarded to the dedicated downloader
        :raises ValueError: if *name* is not recognised
        """
        if name == "minikraken":
            self._download_minikraken(verbose=verbose)
        elif name == "toydb":
            self._download_kraken_toydb(verbose=verbose)
        elif name == "sequana_db1":
            self._download_sequana_db1(verbose=verbose)
        else:
            raise ValueError(
                "name must be toydb or minikraken, or sequana_db1")

    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        # md5sums[i] corresponds to filenames[i]
        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)

    def _download_minikraken(self, verbose=True):
        """Download the minikraken archive (about 4Gb) unless a local copy
        with the expected md5 already exists."""
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(
                filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz",
                 filename)
        # unzipping. requires tar and gzip

    def _download_from_synapse(self, synid, target_dir):
        """Fetch a Synapse entity into *target_dir*, logging in on demand."""
        try:
            from synapseclient import Synapse
        except ImportError:
            raise ImportError(
                "Please install synapseclient using 'pip install synapseclient'"
            )
        try:
            # reuse an existing session if one was created previously
            self._synapse.get(synid, downloadLocation=target_dir)
        except Exception:
            # no session yet (AttributeError) or session expired: (re)login.
            # Narrowed from a bare except, which also swallowed KeyboardInterrupt.
            self._synapse = Synapse()
            self._synapse.login()
            self._synapse.get(synid, downloadLocation=target_dir)

    def _download_sequana_db1(self, verbose=True):
        """Download the large (8Gb) sequana_db1 database from Synapse.

        Each file is skipped when a local copy with the expected md5 exists.
        """
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info(
            "Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        from os.path import exists

        # NOTE: the original code built these paths without os.sep
        # (e.g. ``dir1 + "ena_list.txt"``), so the exists/md5 checks never
        # matched and every file was re-downloaded on each call.
        filename = dir1 + os.sep + "ena_list.txt"
        if exists(filename) and md5(
                filename) == "a9cc6268f3338d1632c4712a412593f2":
            pass
        else:
            self._download_from_synapse('syn6171700', dir1)

        # database.idx
        filename = dir1 + os.sep + "database.idx"
        if exists(filename) and md5(
                filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
            pass
        else:
            self._download_from_synapse('syn6171017', dir1)

        # database.kdb ; this one is large (8Gb)
        filename = dir1 + os.sep + "database.kdb"
        if exists(filename) and md5(
                filename) == "ff698696bfc88fe83bc201937cd9cbdf":
            pass
        else:
            self._download_from_synapse('syn6171107', dir1)

        # Then, the taxonomy directory. These files are downloaded into
        # *dir2*, so the existence checks must look there as well (the
        # original checked dir1).
        filename = dir2 + os.sep + "names.dmp"
        if exists(filename) and md5(
                filename) == "10bc7a63c579de02112d125a51fd65d0":
            pass
        else:
            self._download_from_synapse('syn6171286', dir2)

        filename = dir2 + os.sep + "nodes.dmp"
        if exists(filename) and md5(
                filename) == "a68af5a60434e2067c4a0a16df873980":
            pass
        else:
            self._download_from_synapse('syn6171289', dir2)

        filename = dir2 + os.sep + "taxons.txt"
        if exists(filename) and md5(
                filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
            pass
        else:
            self._download_from_synapse('syn6171290', dir2)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget(
            "https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Example #31
0
class Service(object):
    """Base class for WSDL and REST classes

    .. seealso:: :class:`REST`, :class:`WSDLService`
    """

    #: some useful response codes
    response_codes = {
        200: 'OK',
        201: 'Created',
        400: 'Bad Request. There is a problem with your input',
        404: 'Not found. The resource you requests does not exist',
        405: 'Method not allowed',
        406: "Not Acceptable. Usually headers issue",
        410: 'Gone. The resource you requested was removed.',
        415: "Unsupported Media Type",
        500: 'Internal server error. Most likely a temporary problem',
        503: 'Service not available. The server is being updated, try again later'
        }

    def __init__(self, name, url=None, verbose=True, requests_per_sec=3):
        """.. rubric:: Constructor

        :param str name: a name for this service
        :param str url: its URL
        :param bool verbose: prints informative messages if True (default is
            True)
        :param requests_per_sec: maximum number of requests per seconds
            are restricted to 3. You can change that value. If you reach the
            limit, an error is raise. The reason for this limitation is
            that some services (e.g.., NCBI) may black list you IP.
            If you need or can do more (e.g., ChEMBL does not seem to have
            restrictions), change the value. You can also have several instance
            but again, if you send too many requests at the same, your future
            requests may be retricted. Currently implemented for REST only


        All instances have an attribute called :attr:`~Service.logging` that
        is an instance of the :mod:`logging` module. It can be used to print
        information, warning, error messages::

            self.logging.info("informative message")
            self.logging.warning("warning message")
            self.logging.error("error message")

        The attribute :attr:`~Service.debugLevel`  can be used to set the behaviour
        of the logging messages. If the argument verbose is True, the debugLevel
        is set to INFO. If verbose if False, the debugLevel is set to WARNING.
        However, you can use the :attr:`debugLevel` attribute to change it to
        one of DEBUG, INFO, WARNING, ERROR, CRITICAL. debugLevel=WARNING means
        that only WARNING, ERROR and CRITICAL messages are shown.

        """
        super(Service, self).__init__()
        self.requests_per_sec = requests_per_sec

        self.name = name
        self.logging = Logging("bioservices:%s" % self.name, verbose)

        self._url = url
        try:
            # Warn early (non-fatal) if the service looks unreachable
            if self.url is not None:
                urlopen(self.url)
        except Exception:
            self.logging.warning("The URL (%s) provided cannot be reached." % self.url)
        self._easyXMLConversion = True

        # used by HGNC where some XML contains non-utf-8 characters !!
        # should be able to fix it with requests once HGNC works again
        #self._fixing_unicode = False
        #self._fixing_encoding = "utf-8"

        self.devtools = DevTools()
        self.settings = BioServicesConfig()

    def _get_caching(self):
        return self.settings.params['cache.on'][0]

    def _set_caching(self, caching):
        # only booleans are accepted
        self.devtools.check_param_in_list(caching, [True, False])
        self.settings.params['cache.on'][0] = caching
        # reset the session, which will be automatically created if we
        # access to the session attribute
        self._session = None
    CACHING = property(_get_caching, _set_caching)

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        # something more clever here to check the URL e.g. starts with http
        if url is not None:
            # normalise: no trailing slash
            url = url.rstrip("/")
            self._url = url
    url = property(_get_url, _set_url, doc="URL of this service")

    def _get_easyXMLConversion(self):
        return self._easyXMLConversion

    def _set_easyXMLConversion(self, value):
        if isinstance(value, bool) is False:
            raise TypeError("value must be a boolean value (True/False)")
        self._easyXMLConversion = value
    easyXMLConversion = property(_get_easyXMLConversion,
            _set_easyXMLConversion,
            doc="""If True, xml output from a request are converted to easyXML object (Default behaviour).""")

    def easyXML(self, res):
        """Use this method to convert a XML document into an
            :class:`~bioservices.xmltools.easyXML` object

        The easyXML object provides utilities to ease access to the XML
        tag/attributes.

        Here is a simple example starting from the following XML

        .. doctest::

            >>> from bioservices import *
            >>> doc = "<xml> <id>1</id> <id>2</id> </xml>"
            >>> s = Service("name")
            >>> res = s.easyXML(doc)
            >>> res.findAll("id")
            [<id>1</id>, <id>2</id>]

        """
        from bioservices import xmltools
        return xmltools.easyXML(res)

    def __str__(self):
        txt = "This is an instance of %s service" % self.name
        return txt

    def pubmed(self, Id):
        """Open a pubmed Id into a browser tab

        :param Id: a valid pubmed Id in string or integer format.

        The URL is a concatenation of the pubmed URL
        http://www.ncbi.nlm.nih.gov/pubmed/ and the provided Id.

        """
        url = "http://www.ncbi.nlm.nih.gov/pubmed/"
        import webbrowser
        webbrowser.open(url + str(Id))

    def on_web(self, url):
        """Open a URL into a browser"""
        import webbrowser
        webbrowser.open(url)

    def save_str_to_image(self, data, filename):
        """Save string object into a file converting into binary"""
        with open(filename, 'wb') as f:
            import binascii
            try:
                # Python 3: *data* is a str and must be encoded first
                newres = binascii.a2b_base64(bytes(data, "utf-8"))
            except TypeError:
                # *data* is already bytes (or Python 2 str): decode directly.
                # Narrowed from a bare except, which hid real base64 errors.
                newres = binascii.a2b_base64(data)
            f.write(newres)
Example #32
0
class KrakenBuilder():
    """This class will help you building a custom Kraken database


    You will need a few steps, and depending on the FASTA files you want to
    include lots of resources (memory and space wise). In the following example,
    we will be reasonable and use only viruses FASTA files.

    First, we need to create the data structure directory. Let us call it
    **virusdb**::

        from sequana import KrakenBuilder
        kb = KrakenBuilder("virusdb")

    We then need to download a large taxonomic database from NCBI. You may
    already have a local copy, in which case you would need to copy it in
    virusdb/taxonomy directory. If not, type::

        kb.download_taxonomy()

    The virusdb/taxonomy directory will contain about 8.5G of data.

    Note that this currently requires the unix tools **wget** and **tar**.

    Then, we need to add some fasta files. You may download specific FASTA files
    if you know the accession numbers using :meth:`download_accession`. However,
    we also provide a method to download all viruses from ENA::

        kb.download_viruses()

    This will take a while to download the more than 4500 FASTA files (10
    minutes on a good connection). You will end up with a data set of about 100
    Mb of FASTA files.

    If you wish to download other FASTA (e.g. all bacteria), you will need to
    use another class from the :mod:`sequana.databases`::

        from sequana.databases import ENADownload
        ena = ENADownload()
        ena.download_fasta("bacteria.txt", output_dir="virusdb/library/added")

    Please see the documentation for more options and list of species to
    download.

    It is now time to build the DB itself. This is based on the kraken tool.
    You may do it yourself in a shell::

        kraken-build  --rebuild -db virusdb --minimizer-len 10 --max-db-size 4 --threads 4
        --kmer-len 26 --jellyfish-hash-size 500000000

    Or you the KrakenBuilder. First you need to look at the :attr:`params`
    attribute. The most important key/value that affect the size of the DB are::

        kb.params['kmer_length']  (max value is 31)
        kb.params['max_db_size'] is tha max size of the DB files in Gb
        kb.params['minimizer_len']

    To create a small DB quickly, we set those values::

        kb.params['kmer_length']  = 26
        kb.params['minimizer_len'] = 10

    However, for production, we would recommend 31 and 13 (default)

    This takes about 2 minutes to build and the final DB is about 800Mb.

    Lots of useless files are in the direcory and can be removed using kraken
    itself. However we do a little bit more and therefore have our own
    cleaning function::

        kb.clean_db()

    Kraken-build uses jellyfish. The **hash_size** parameter is the jellyfish
    hash_size parameter. If you set it to 6400M, the memory required is about
    6.9bytes times 6400M that is 40Gb of memory. The default value used here
    means 3.5Gb are required.

    The size to store the DB itself should be

    :math:

        sD + 8 (4^M)

    where **s** is about 12 bytes (used to store a kmer/taxon pair, D is the
    number of kmer in the final database, which cannot be estimated before
    hand, and M the length minimiser parameter.


    The quick way:
    =====================

        kb = KrakenBuilder("virusdb")
        kb.run(['virus']) # use only viruses from ENA list

    Here, you may want to re-run the analysis with different parameters
    for the database built. If you require the virus DB, it has been
    downloaded already so this step will be skip. The Taxon DB does not
    need to be downloaded again, so set download_taxonomy to False.

    Before, let us change the parameter to build a full database::

        kb.params['kmer_length']  = 31
        kb.params['minimizer_len'] = 13

    We have here instead of 800Mb DB a new DB of 1.5Gb but it should
    take more or less the same time to build it

    Finally if you do not need to test it anymore, you may clean the DB once for
    all. This will remove useless files. The directory's name is the name of the
    DB that should be used in e.g. the quality_control pipeline. To clean the
    data directory, type::

        kb.clean_db()

    """
    def __init__(self, dbname):
        """.. rubric:: Constructor

        :param str dbname: directory in which the Kraken DB is created
        """
        # See databases.py module
        self.dbname = dbname

        # ENA downloader; its metadata keys are the recognised DB shortcuts
        self.enadb = ENADownload()
        self.valid_dbs = self.enadb._metadata.keys()

        # Default kraken-build parameters. For reference, mini_kraken uses
        # minimizer-length=13, max_db=4 and defaults otherwise
        # (kmer-len=31, default hash size).
        self.params = dict(
            dbname=self.dbname,
            minimizer_len=10,
            max_db_size=4,
            threads=4,
            kmer_length=26,
            hash_size=500000000,
        )

        self.init()

    def init(self):
        """Create the database directory layout (library/added, taxonomy)."""
        self.library_path = self.dbname + os.sep + "library"
        self.taxon_path = self.dbname + os.sep + "taxonomy"
        self.fasta_path = self.library_path + os.sep + "added"

        self._devtools = DevTools()
        # parent directory first, then the sub-directories
        for directory in (self.dbname, self.library_path,
                          self.fasta_path, self.taxon_path):
            self._devtools.mkdir(directory)

    def download_accession(self, acc):
        """Download a specific FASTA file from ENA given its accession number.

        Note that if you want to add specific FASTA from ENA, you must use
        that function to make sure the header will be understood by Kraken;
        The header must use a GI number (not ENA)

        :param acc: a valid ENA accession number
        """
        # FASTA files must land in library/added to be seen by kraken-build.
        # (A stray no-op string literal in the middle of the original body
        # was folded into this docstring.)
        output = self.dbname + os.sep + "library" + os.sep + "added"
        self.enadb.download_accession(acc, output=output)

    def download_viruses(self):
        """Download all virus FASTA files from ENA into the library/added
        directory created by :meth:`init`."""
        self.enadb.download_fasta("virus.txt", output_dir=self.fasta_path)

    def run(self, dbs=None, download_taxon=True):
        """Create the Custom Kraken DB

        #. download taxonomy files
        #. Load the DBs (e.g. viruses)
        #. Build DB with kraken-build
        #. Clean it up

        :param dbs: list of database shortcuts or local files (see
            :meth:`_download_dbs`); defaults to an empty list.
        :param bool download_taxon: NOTE(review): currently unused; the
            taxonomy is always downloaded (the download itself skips files
            already present with the expected md5).
        """
        # avoid the mutable-default-argument pitfall (was ``dbs=[]``)
        if dbs is None:
            dbs = []

        # Start with the FASTA
        self._download_dbs(dbs)

        self.download_taxonomy()

        # search for taxon file. If not found, error
        required = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"

        if required not in glob.glob(self.taxon_path + os.sep + "*"):
            raise IOError("Taxon file not found")

        print(
            "\nDepending on the input, this step may take a few hours to finish"
        )
        self._build_kraken()

    def download_taxonomy(self, force=False):
        """Download kraken data

        The downloaded file is large (1.3Gb) and the unzipped file is about 9Gb.

        If already present, do not download the file except if the *force*
        parameter is set to True.

        :param bool force: NOTE(review): currently unused; files are
            re-downloaded only when missing or when the md5 does not match.
        """
        # NCBI FTP host, defined up-front so both download branches can use
        # it (it used to be set inside the first branch only, raising a
        # NameError when only the second file needed downloading).
        FTP = "ftp.ncbi.nih.gov"

        # If the requested file exists, nothing to do
        expected_filename = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"
        expected_md5 = "8c182ac2df452d836206ad13275cd8af"
        print(
            '\nDownloading taxonomy files. Takes a while depending on your connection'
        )

        if os.path.exists(expected_filename) is False or \
                md5(expected_filename) != expected_md5:
            # download taxonomy
            # We could use kraken-build --download-taxonomy + a subprocess but
            # even simpler to get the file via ftp
            execute(
                "wget %s/pub/taxonomy/gi_taxid_nucl.dmp.gz --directory-prefix %s"
                % (FTP, self.taxon_path))
            # Unzip the files
            execute('unpigz %s/gi_taxid_nucl.dmp.gz' % self.taxon_path)
        else:
            print("Found local expected file %s " % expected_filename)

        expected_filename = self.taxon_path + os.sep + "names.dmp"
        expected_md5 = "90d88912ad4c94f6ac07dfab0443da9b"
        if os.path.exists(expected_filename) is False or \
                md5(expected_filename) != expected_md5:

            execute(
                "wget %s/pub/taxonomy/taxdump.tar.gz --directory-prefix %s" %
                (FTP, self.taxon_path))

            execute('tar xvfz %s/taxdump.tar.gz -C %s' %
                    (self.taxon_path, self.taxon_path))
        else:
            print("Found local expected file %s " % expected_filename)

    def _download_dbs(self, dbs=None):
        """Download the FASTA files for the requested databases.

        :param list dbs: iterable of ENA set names (must be in
            :attr:`valid_dbs`) or paths to local files listing ENA
            accessions. Defaults to an empty list.
        :raises ValueError: if an entry is neither a known ENA set nor an
            existing local file.
        """
        # Avoid the mutable-default-argument pitfall: create the list here.
        if dbs is None:
            dbs = []
        print("Downloading all Fasta files for %s" % dbs)
        # Download the DBs in it.
        # (removed an unused `from .databases import ENADownload`; the
        # downloader instance is already available as self.ena)
        for db in dbs:
            # a db is valid if registered in ENA or if it is a local file
            if db not in self.valid_dbs and os.path.exists(db) is False:
                msg = "db must be a local file with a list of ENA or one of"
                for this in self.ena._metadata.keys():
                    msg += " - %s" % this
                raise ValueError(msg)
            self.ena.download_fasta(db, output_dir=self.fasta_path)

    def _build_kraken(self):
        """Invoke kraken-build to (re)build the database.

        All build options are read from :attr:`params`; the jellyfish hash
        size is coerced to an integer beforehand.
        """
        print('Building the kraken db ')
        # jellyfish requires an integer hash size
        self.params['hash_size'] = int(self.params["hash_size"])

        template = """kraken-build  --rebuild -db %(dbname)s \
            --minimizer-len %(minimizer_len)s\
            --max-db-size %(max_db_size)s \
            --threads %(threads)s\
            --kmer-len %(kmer_length)s \
            --jellyfish-hash-size %(hash_size)s"""

        # again, kraken-build prints on stderr so we cannot use easydev.shellcmd
        execute(template % self.params)

    def clean_db(self):
        """Once called, you will not be able to append more FASTA files

        Shrinks the taxonomy files down to the taxons actually present in
        the library, then runs ``kraken-build --clean``.
        """
        # Now we can clean the kraken db:
        print('Cleaning the kraken db ')
        # Clean the nodes.dmp and names.dmp
        print('Identifying the GI numbers')

        # resolve every library FASTA file to a GI, then each GI to a taxon
        gis = self.get_gis()
        taxons = self.get_taxons_from_gis(gis)
        print("")

        self.gis = gis
        self.taxons = taxons

        # Filter nodes.dmp and names.dmp down to the taxons in use. This must
        # be done before kraken-build --clean since that step removes the
        # gi_taxid_nucl.dmp file the filtering depends on.
        prefix = self.taxon_path + os.sep
        names_file = prefix + "names.dmp"
        nodes_file = prefix + "nodes.dmp"
        names_file_temp = prefix + "names_temp.dmp"
        nodes_file_temp = prefix + "nodes_temp.dmp"

        reader = NCBITaxonReader(names=names_file, nodes=nodes_file,
                                 verbose=True)
        print("Filtering")
        reader.filter_nodes_dmp_file(nodes_file, nodes_file_temp,
                                     taxons=taxons)
        reader.filter_names_dmp_file(names_file, names_file_temp,
                                     taxons=taxons)

        # replace the originals with the filtered versions
        os.rename(names_file_temp, names_file)
        os.rename(nodes_file_temp, nodes_file)

        # Finally, the kraken cleaning itself
        execute("kraken-build --clean --db %s" % self.params['dbname'])

    def get_gis(self, extensions=('fa',)):
        """Scan the library FASTA files and return their GI numbers.

        :param extensions: iterable of file extensions to search for
            (default ``('fa',)``).
        :return: list of GI numbers as integers; also stored in :attr:`gis`,
            with the scanned paths stored in :attr:`filenames`.
        :raises ValueError: if a scanned file does not start with a FASTA
            header (``>``) line.
        """
        self.filenames = []
        root = self.dbname
        # NOTE(review): without recursive=True, "**" behaves like "*", so the
        # two passes below cover exactly two directory depths under library/.
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/*%s" % (root, extension))))
        for extension in extensions:
            self.filenames.extend(
                list(glob.iglob("%s/library/**/**/*%s" % (root, extension))))

        N = len(self.filenames)
        pb = Progress(N)
        gis = []
        for i, filename in enumerate(self.filenames):
            # only the header line is needed; the context manager closes the
            # handle (the original leaked one open file per FASTA)
            with open(filename, "r") as fin:
                line = fin.readline()
            if line.startswith('>'):
                assert "gi" in line, "expected >gi to be found at the beginning"
                gi = line[1:].split("|")[1]
            else:
                raise ValueError(
                    "This file %s does not seem to be a FASTA file" % filename)
            gis.append(gi)
            pb.animate(i + 1)
        print()
        gis = [int(x) for x in gis]
        self.gis = gis

        assert len(gis) == len(self.filenames)
        return gis

    def get_taxons_from_gis(self, gis, filename="gi_taxid_nucl.dmp"):
        """Map GI numbers to taxon identifiers using the NCBI dump file.

        :param list gis: GI numbers (integers) to resolve.
        :param str filename: name of the GI->taxid dump file inside
            :attr:`taxon_path`.
        :return: list of taxons aligned with *gis*; entries not found in the
            dump are left as 32644 (NCBI "unidentified").
        """
        filename = self.taxon_path + os.sep + filename
        data = pd.read_csv(filename, chunksize=1000000, sep='\t', header=None)
        N = 560  # with time this number will be deprecated but good for now

        local_gis = gis[:]

        # GIs are found in an order different from the input list, so we
        # need to keep track of each one's position in *gis*
        found_gis = []
        taxons = [32644] * len(gis)  # 32644 means unidentified
        # we search for the unique gis. Once found, we remove them from the
        # vector and keep going until the vector is empty or there is no more
        # chunks. A good sanity check is that the final gis vector should be
        # empty meaning all have been found. We do not care about the order
        # of the final taxons vector as compare to the GI vector

        print("Scanning %s to look for %s GI numbers" % (filename, len(gis)))
        pb = Progress(N)
        for i, chunk in enumerate(data):
            chunk.set_index(0, inplace=True)
            # .ix was deprecated and removed in pandas >= 1.0;
            # reindex + dropna reproduces the original label selection
            # (rows for requested GIs, missing ones dropped)
            chunk = chunk.reindex(local_gis).dropna()

            # keep the GI and Taxon
            found_gis.extend([int(x) for x in list(chunk.index)])

            # update the remaining GIs and the taxons
            for gi, tax in zip(chunk.index, chunk.values):
                local_gis.remove(gi)
                index = gis.index(gi)
                taxons[index] = tax

            # no need to carry on if all GIs were found
            if len(local_gis) == 0:
                break
            pb.animate(i + 1)
        print("")

        taxons = [int(x) for x in taxons]
        return taxons