Ejemplo n.º 1
0
class Tools(object):
    """Helper class to simplify the code that follows.

    Each colour method passes ``txt`` through the module-level function of
    the same name (``purple``, ``red``, ``green``, ``blue``) and prints the
    result, but only when the instance is verbose or the caller forces it.

    :param verbose: when truthy, every message is printed; otherwise only
        messages sent with ``force=True`` are printed.
    """
    # Shared DevTools instance used by :meth:`mkdir`.
    dv = DevTools()

    def __init__(self, verbose=True):
        self.verbose = verbose

    def purple(self, txt, force=False):
        """Print ``purple(txt)`` if verbose or if ``force`` is True."""
        if not self.verbose and force is not True:
            return
        print(purple(txt))

    def red(self, txt, force=False):
        """Print ``red(txt)`` if verbose or if ``force`` is True."""
        if not self.verbose and force is not True:
            return
        print(red(txt))

    def green(self, txt, force=False):
        """Print ``green(txt)`` if verbose or if ``force`` is True."""
        if not self.verbose and force is not True:
            return
        print(green(txt))

    def blue(self, txt, force=False):
        """Print ``blue(txt)`` if verbose or if ``force`` is True."""
        if not self.verbose and force is not True:
            return
        print(blue(txt))

    def mkdir(self, name):
        """Delegate the creation of directory ``name`` to DevTools."""
        self.dv.mkdir(name)
Ejemplo n.º 2
0
    def __init__(self, organism="Homo sapiens", cache=False, verbose=True):
        """.. rubric:: Constructor

        :param str organism: the organism to look at. Homo sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.
        :param bool cache: forwarded to the IntactComplex web service.
        :param verbose: forwarded to the IntactComplex web service. It is
            appended after ``cache`` so that existing positional calls keep
            working (the name was previously used without being defined).

        """
        self.logging = Logging()

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        # Retrieve every entry once; the full table is kept on the instance.
        df = self.webserv.search("*", frmt="pandas")
        self.df = df

        organism_names = list(set(df["organismName"]))

        #: organism names truncated at the first ';' of each database value
        self.valid_organisms = [x.split(";")[0] for x in organism_names]

        #: unique, unmodified organismName values found in the database
        self.organisms = organism_names
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # These will be populated on request as a cache/buffer
        self._details = None
        self._complexes = None
Ejemplo n.º 3
0
    def __init__(self, organism='Homo sapiens', verbose=True, cache=False):
        """.. rubric:: Constructor

        :param str organism: the organism to look at. Homo sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.
        :param str verbose: a verbose level in ERROR/DEBUG/INFO/WARNING
            compatible with those used in BioServices.
        :param bool cache: forwarded to the IntactComplex web service.

        """
        super(Complexes, self).__init__(level=verbose)

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        # Retrieve every entry once; the full table is kept on the instance.
        df = self.webserv.search('*', frmt='pandas')
        self.df = df

        organism_names = list(set(df['organismName']))

        #: organism names truncated at the first ';' of each database value
        self.valid_organisms = [x.split(';')[0] for x in organism_names]

        #: unique, unmodified organismName values found in the database
        self.organisms = organism_names
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # These will be populated on request as a cache/buffer
        self._details = None
        self._complexes = None
Ejemplo n.º 4
0
    def __init__(self,
                 fastq,
                 database,
                 threads=4,
                 output_directory="kraken",
                 dbname=None):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken
        :param output_directory: directory created at construction time and
            stored as :attr:`output_directory`
        :param dbname: label stored as :attr:`dbname`; when None, the base
            name of *database* is used instead

        Description: internally, once Kraken has performed an analysis, reads
        are associated to a taxon (or not). We then find the corresponding
        lineage and scientific names to be stored within a Krona formatted file.
        KtImportTex is then used to create the Krona page.

        """
        # The output directory must exist before the analysis is configured
        self._devtools = DevTools()
        self.output_directory = output_directory
        self._devtools.mkdir(output_directory)
        self.ka = KrakenAnalysis(fastq, database, threads)

        # Fall back on the last component of the database path
        self.dbname = os.path.basename(database) if dbname is None else dbname
Ejemplo n.º 5
0
    def __init__(self, fastq, database, threads=4):
        """.. rubric:: Constructor

        :param fastq: either a fastq filename or a list of 2 fastq filenames
        :param database: the path to a valid Kraken database
        :param threads: number of threads to be used by Kraken

        :raises ValueError: if *fastq* is neither a string nor a list.

        The existence of the database and of every fastq file is verified
        with ``DevTools.check_exists``.

        """
        self._devtools = DevTools()
        self._devtools.check_exists(database)

        self.database = database
        self.threads = threads

        # Fastq input: always stored as a list; only a list of exactly two
        # files is considered paired.
        if isinstance(fastq, str):
            self.paired = False
            self.fastq = [fastq]
        elif isinstance(fastq, list):
            self.paired = len(fastq) == 2
            self.fastq = fastq
        else:
            raise ValueError(
                "Expected a fastq filename or list of 2 fastq filenames")

        # Check each input file (the database was already checked above)
        for this in self.fastq:
            self._devtools.check_exists(this)
Ejemplo n.º 6
0
    def __init__(self,
                 pattern="**/summary.json",
                 output_filename=None,
                 verbose=True,
                 **kargs):
        """Collect the summary files matching *pattern* and build the report.

        :param pattern: glob pattern (searched recursively) used to find the
            summary files.
        :param output_filename: passed to :meth:`create_html`.
        :param verbose: if False, the sequana logger level is set to
            WARNING; otherwise it is set to INFO.
        :param kargs: accepted but not used here.
        """
        super().__init__()

        from sequana import logger
        logger.level = "INFO"
        if verbose is False:
            logger.level = "WARNING"

        logger.info(
            "Sequana Summary is still a tool in progress and have been " +
            "  tested with the quality_control pipeline only for now.")
        self.title = "Sequana multiple summary"
        self.devtools = DevTools()

        self.filenames = list(glob.iglob(pattern, recursive=True))
        self.summaries = [ReadSummary(filename) for filename in self.filenames]
        # Reuse the summaries parsed above rather than reading each file again
        self.projects = [summary.data['project'] for summary in self.summaries]
        self.create_report_content()
        self.create_html(output_filename)
Ejemplo n.º 7
0
    def __init__(self, name, url=None, verbose=True, requests_per_sec=10):
        """.. rubric:: Constructor

        :param str name: a name for this service
        :param str url: its URL
        :param bool verbose: prints informative messages if True (default is
            True)
        :param requests_per_sec: maximum number of requests per second
            (default is 10). You can change that value. If you reach the
            limit, an error is raised. The reason for this limitation is
            that some services (e.g., NCBI) may blacklist your IP.
            If you need or can do more (e.g., ChEMBL does not seem to have
            restrictions), change the value. You can also have several
            instances but again, if you send too many requests at the same
            time, your future requests may be restricted. Currently
            implemented for REST only


        All instances have an attribute called :attr:`~Service.logging` that
        is an instance of the :mod:`logging` module. It can be used to print
        information, warning, error messages::

            self.logging.info("informative message")
            self.logging.warning("warning message")
            self.logging.error("error message")

        The attribute :attr:`~Service.debugLevel` can be used to set the
        behaviour of the logging messages. If the argument verbose is True,
        the debugLevel is set to INFO. If verbose is False, the debugLevel is
        set to WARNING. However, you can use the :attr:`debugLevel` attribute
        to change it to one of DEBUG, INFO, WARNING, ERROR, CRITICAL.
        debugLevel=WARNING means that only WARNING, ERROR and CRITICAL
        messages are shown.

        """
        super(Service, self).__init__()
        self.requests_per_sec = requests_per_sec
        self.name = name
        self.logging = Logging("bioservices:%s" % self.name, verbose)

        self._url = url
        # Best-effort reachability probe: any failure is only reported as a
        # warning. The response is closed immediately so that the connection
        # is not left open.
        try:
            if self.url is not None:
                urlopen(self.url).close()
        except Exception:
            self.logging.warning("The URL (%s) provided cannot be reached." %
                                 self.url)
        self._easyXMLConversion = True

        # used by HGNC where some XML contains non-utf-8 characters !!
        # should be able to fix it with requests once HGNC works again
        #self._fixing_unicode = False
        #self._fixing_encoding = "utf-8"

        self.devtools = DevTools()
        self.settings = BioServicesConfig()

        self._last_call = 0
Ejemplo n.º 8
0
    def init(self):
        """Define the database sub-directory paths and create them.

        The paths are built from :attr:`dbname` and stored as
        :attr:`library_path`, :attr:`taxon_path` and :attr:`fasta_path`.
        """
        self.library_path = os.sep.join([self.dbname, "library"])
        self.taxon_path = os.sep.join([self.dbname, "taxonomy"])
        self.fasta_path = os.sep.join([self.library_path, "added"])

        self._devtools = DevTools()
        # Parents first: dbname, then library and its "added" sub-directory
        for directory in (self.dbname, self.library_path, self.fasta_path,
                          self.taxon_path):
            self._devtools.mkdir(directory)
Ejemplo n.º 9
0
    def _download_minikraken(self, verbose=True):
        """Fetch minikraken.tgz into the sequana configuration directory.

        The archive is downloaded unless a file with the expected md5
        checksum is already present. ``verbose`` is currently unused.
        """
        expected_md5 = "30eab12118158d0b31718106785195e2"

        helper = DevTools()
        root = sequana_config_path + os.sep + ""
        for directory in (root, root + os.sep + "taxonomy"):
            helper.mkdir(directory)

        logger.info("Downloading minikraken (4Gb)")

        archive = root + os.sep + "minikraken.tgz"
        up_to_date = os.path.exists(archive) and md5(archive) == expected_md5
        if not up_to_date:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", archive)
        else:
            logger.warning("%s already present" % archive)
Ejemplo n.º 10
0
    def __init__(self):
        """Create the busco directory and list the known dataset names.

        :attr:`base` is the ``busco`` sub-directory of the sequana
        configuration path; :attr:`filenames` is the sorted list of names.
        """
        self.base = sequana_config_path + os.sep + "busco"
        DevTools().mkdir(self.base)

        # Names sharing the "_odb9" suffix, listed without it
        odb9_lineages = (
            "bacteria", "proteobacteria", "rhizobiales",
            "betaproteobacteria", "gammaproteobacteria",
            "enterobacteriales", "deltaepsilonsub",
            "actinobacteria", "cyanobacteria", "firmicutes",
            "clostridia", "lactobacillales", "bacillales",
            "bacteroidetes", "spirochaetes", "tenericutes",
            "eukaryota", "fungi", "microsporidia",
            "dikarya", "ascomycota", "pezizomycotina",
            "eurotiomycetes", "sordariomyceta",
            "saccharomyceta", "saccharomycetales",
            "basidiomycota", "metazoa", "nematoda",
            "arthropoda", "insecta", "endopterygota",
            "hymenoptera", "diptera", "vertebrata",
            "actinopterygii", "tetrapoda", "aves",
            "mammalia", "euarchontoglires", "laurasiatheria",
            "embryophyta",
        )
        names = [lineage + "_odb9" for lineage in odb9_lineages]
        names.extend(["protists_ensembl", "alveolata_stramenophiles_ensembl"])
        names.sort()
        self.filenames = names
Ejemplo n.º 11
0
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Each file is fetched unless a local copy with the expected md5
        checksum already exists. About 32Mb of data. ``verbose`` is
        currently unused.
        """
        # (path relative to the database directory, expected md5 checksum)
        expected_files = (
            ("database.idx", "28661f8baf0514105b0c6957bec0fc6e"),
            ("database.kdb", "97a39d44ed86cadea470352d6f69748d"),
            ("taxonomy/names.dmp", "d91a0fcbbc0f4bbac918755b6400dea6"),
            ("taxonomy/nodes.dmp", "c8bae69565af2170ece194925b5fdeb9"),
        )

        helper = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        helper.mkdir(base)
        helper.mkdir(base + os.sep + "taxonomy")

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        for relative_name, checksum in expected_files:
            url = baseurl + "kraken_toydb/%s" % relative_name
            target = base + os.sep + relative_name
            if not os.path.exists(target) or md5(target) != checksum:
                logger.info("Downloading %s" % url)
                wget(url, target)
            else:
                logger.warning("%s already present" % target)
Ejemplo n.º 12
0
def main(args=None):
    """Command-line entry point: run Kraken on fastq files and build a report.

    :param args: list of command-line arguments, program name first. When
        None, a copy of ``sys.argv`` is used.
    :raises ValueError: if a requested database is found neither in the
        sequana configuration path nor locally.

    Depending on the options, the function may instead update the local
    taxonomy file or download a database, and then exit.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help. ``options`` is bound
    # on both paths so that it can never be undefined below.
    if len(args) == 1:
        options = user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.level

    if options.update_taxonomy is True:
        from sequana.taxonomy import Taxonomy
        tax = Taxonomy()
        from sequana import sequana_config_path as cfg
        logger.info(
            "Will overwrite the local database taxonomy.dat in {}".format(cfg))
        tax.download_taxonomic_file(overwrite=True)
        sys.exit(0)

    # We put the import here to make the --help faster
    from sequana import KrakenPipeline
    from sequana.kraken import KrakenSequential
    devtools = DevTools()

    if options.download:
        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download(options.download)
        sys.exit()

    # Collect the input files that were provided, checking that they exist
    fastq = []
    if options.file1:
        devtools.check_exists(options.file1)
        fastq.append(options.file1)
    if options.file2:
        devtools.check_exists(options.file2)
        fastq.append(options.file2)

    from sequana import sequana_config_path as scfg
    if options.databases is None:
        logger.critical("You must provide a database")
        sys.exit(1)

    databases = []
    for database in options.databases:
        # Short aliases for the downloadable databases
        if database == "toydb":
            database = "kraken_toydb"
        elif database == "minikraken":
            database = "minikraken_20141208"

        if os.path.exists(scfg + os.sep + database):  # in Sequana path
            databases.append(scfg + os.sep + database)
        elif os.path.exists(database):  # local database
            databases.append(database)
        else:
            msg = "Invalid database name (%s). Neither found locally "
            msg += "or in the sequana path %s; Use the --download option"
            raise ValueError(msg % (database, scfg))

    output_directory = options.directory + os.sep + "kraken"
    devtools.mkdirs(output_directory)

    def _pathto(name):
        # Prefix a non-empty filename with the kraken output directory;
        # empty/None values are returned unchanged.
        return "{}/kraken/{}".format(options.directory, name) if name else name

    # if there is only one database, use the pipeline else KrakenSequential
    if len(databases) == 1:
        logger.info("Using 1 database")
        k = KrakenPipeline(fastq,
                           databases[0],
                           threads=options.thread,
                           output_directory=output_directory,
                           confidence=options.confidence)

        k.run(output_filename_classified=_pathto(options.classified_out),
              output_filename_unclassified=_pathto(options.unclassified_out))
    else:
        logger.info("Using %s databases" % len(databases))
        k = KrakenSequential(fastq,
                             databases,
                             threads=options.thread,
                             output_directory=output_directory + os.sep,
                             force=True,
                             keep_temp_files=options.keep_temp_files,
                             output_filename_unclassified=_pathto(
                                 options.unclassified_out),
                             confidence=options.confidence)
        k.run(output_prefix="kraken")

    # This statements sets the directory where HTML will be saved
    from sequana.utils import config
    config.output_dir = options.directory

    # output_directory first argument: the directory where to find the data
    # output_filename is relative to the config.output_dir defined above
    kk = KrakenModule(output_directory, output_filename="summary.html")

    logger.info("Open ./%s/summary.html" % options.directory)
    logger.info("or ./%s/kraken/kraken.html" % options.directory)

    if options.html is True:
        # The previous code called ``ss.onweb()`` but no ``ss`` exists in
        # this function (NameError). Open the summary page created above.
        # NOTE(review): if KrakenModule provides its own browser helper,
        # it could be used on ``kk`` instead -- confirm.
        import webbrowser
        from pathlib import Path
        summary = Path(options.directory) / "summary.html"
        webbrowser.open(summary.resolve().as_uri())
Ejemplo n.º 13
0
# source
# http://nbviewer.ipython.org/github/tritemio/notebooks/blob/master/Mixture_Model_Fitting.ipynb

from easydev import DevTools
devtools = DevTools()
from scipy.optimize import minimize, show_options
import scipy.stats as ss
import numpy as np
import pylab
from easydev import AttrDict

from . import criteria

import numpy as np

# Precomputed 0.5 * ln(2 * pi). Presumably the constant term of a Gaussian
# log-density -- TODO confirm against the models that use it.
half_log_two_pi = 0.5 * np.log(2 * np.pi)


class Model(object):
    """New base model

    Every method other than the constructor raises NotImplementedError.
    """
    def __init__(self):
        """Nothing to initialise in the base class."""

    def log_density(self, data):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def estimate(self, data, weights):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def generate(self):
        """Not implemented in the base class."""
        raise NotImplementedError()
Ejemplo n.º 14
0
class KrakenDownload(object):
    """Utility to download Kraken DB and place them in a local directory

    ::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download('toydb')
        kd.download('minikraken')

    A large database (8Gb) is available on synapse and has the following DOI::

        doi:10.7303/syn6171000

    It can be downloaded manually or if you have a Synapse login
    (https://www.synapse.org), you can use::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download("sequana_db1")
    """
    # Shared helper used to create the target directories
    dv = DevTools()

    def download(self, name, verbose=True):
        """Download the database called *name*.

        :param name: one of ``toydb``, ``minikraken`` or ``sequana_db1``.
        :param verbose: forwarded to the specific download method.
        :raises ValueError: if *name* is not one of the names above.
        """
        if name == "minikraken":
            self._download_minikraken(verbose=verbose)
        elif name == "toydb":
            self._download_kraken_toydb(verbose=verbose)
        elif name == "sequana_db1":
            self._download_sequana_db1(verbose=verbose)
        else:
            raise ValueError(
                "name must be toydb or minikraken, or sequana_db1")

    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        self.dv.mkdir(base)
        self.dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)

    def _download_minikraken(self, verbose=True):
        """Download minikraken.tgz into the sequana configuration directory.

        Nothing is downloaded if the archive is already present with the
        expected md5 checksum. The archive is not unpacked here.
        """
        # NOTE(review): the empty suffix means files go directly into the
        # sequana configuration directory -- confirm this is intended.
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        self.dv.mkdir(base)
        self.dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(
                filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz",
                 filename)
        # unzipping. requires tar and gzip

    def _download_from_synapse(self, synid, target_dir):
        """Download the Synapse entity *synid* into *target_dir*.

        :raises ImportError: if synapseclient is not installed.
        """
        try:
            from synapseclient import Synapse
        except ImportError:
            raise ImportError(
                "Please install synapseclient using 'pip install synapseclient'"
            )
        try:
            self._synapse.get(synid, downloadLocation=target_dir)
        except Exception:
            # No client yet (AttributeError on first call) or the existing
            # client failed: log in again and retry once. KeyboardInterrupt
            # and SystemExit are deliberately not intercepted.
            self._synapse = Synapse()
            self._synapse.login()
            self._synapse.get(synid, downloadLocation=target_dir)

    def _download_sequana_db1(self, verbose=True):
        """Download the sequana_db1 database from Synapse, plus annotations.

        Each file is skipped when a local copy with the expected md5
        checksum already exists in the directory it is downloaded into.
        """
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info(
            "Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        # (target directory, filename, expected md5, Synapse identifier).
        # The checked path must be the one the file is downloaded to:
        # it needs the path separator, and the taxonomy files live in dir2.
        targets = [
            (dir1, "ena_list.txt", "a9cc6268f3338d1632c4712a412593f2",
             'syn6171700'),
            (dir1, "database.idx", "2fa4a99a4f52f2f04c5a965adb1534ac",
             'syn6171017'),
            # database.kdb ; this one is large (8Gb)
            (dir1, "database.kdb", "ff698696bfc88fe83bc201937cd9cbdf",
             'syn6171107'),
            # Then, the taxonomy directory
            (dir2, "names.dmp", "10bc7a63c579de02112d125a51fd65d0",
             'syn6171286'),
            (dir2, "nodes.dmp", "a68af5a60434e2067c4a0a16df873980",
             'syn6171289'),
            (dir2, "taxons.txt", "e78fbb43b3b41cbf4511d6af16c0287f",
             'syn6171290'),
        ]
        for directory, basename, md5sum, synid in targets:
            filename = directory + os.sep + basename
            if os.path.exists(filename) and md5(filename) == md5sum:
                continue
            self._download_from_synapse(synid, directory)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget(
            "https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Ejemplo n.º 15
0
def scoring(args=None):
    """Entry point of the standalone application called dreamscoring

    ::

        dreamscoring --help

    :param args: command-line arguments, program name first. When None, a
        copy of ``sys.argv`` is used.

    The function validates the challenge and sub-challenge names, serves the
    informational requests (version, info, onweb, template, gold standard)
    and otherwise scores the submission file(s) and prints the result.
    """
    d = DevTools()

    if args is None:
        args = sys.argv[:]
    user_options = Options(prog="dreamtools")

    if len(args) != 1:
        options = user_options.parse_args(args[1:])
    else:
        user_options.parse_args(["prog", "--help"])

    if options.version is True:
        print("%s" % dreamtools.version)
        sys.exit()

    # Check on the challenge name
    if options.challenge is None:
        print_color('--challenge must be provided', red)
        sys.exit()

    options.challenge = options.challenge.upper()
    options.challenge = options.challenge.replace('DOT', 'dot')

    from dreamtools.admin.download_data import get_challenge_list
    if options.challenge not in get_challenge_list():
        print_color(
            "This challenge %s is not registered in dreamtools." %
            options.challenge, red)
        print("Here is the list of registered challenges: " +
              ", ".join(get_challenge_list()))
        sys.exit()

    # Check that the challenge can be loaded
    class_inst = get_challenge(options.challenge)
    try:
        this = class_inst.import_scoring_class()
    except NotImplementedError as err:
        print("\n" + str(err))
        sys.exit()

    # User may just request some information about the challenge.
    if options.info is True:
        print(this)
        sys.exit()
    elif options.onweb is True:
        this.onweb()
        sys.exit()

    # Checks name of the sub-challenges
    subchallenges = get_subchallenges(options.challenge)

    if len(subchallenges) and options.sub_challenge is None:
        txt = ("This challenge requires a sub challenge name. " +
               "Please use --sub-challenge followed by one value in %s " % subchallenges)
        print_color(txt, red)
        sys.exit(0)

    if options.sub_challenge is not None and len(subchallenges) != 0:
        try:
            d.check_param_in_list(options.sub_challenge, subchallenges)
        except ValueError as err:
            txt = ("DREAMTools error: unknown sub challenge or not implemented"
                   "--->" + str(err))
            print_color(txt, red)
            sys.exit()

    # The sub-challenge name is only passed along when one was provided
    if options.sub_challenge is None:
        sub_args = ()
    else:
        sub_args = (options.sub_challenge,)

    # maybe users just need a template
    if options.download_template is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        print(class_inst.download_template(*sub_args))
        return

    # similarly for the GS
    if options.download_goldstandard is True:
        c = Challenge(options.challenge)
        class_inst = c.import_scoring_class()
        print(class_inst.download_goldstandard(*sub_args))
        return

    # finally, we need a submission
    if options.filename is None:
        txt = (
            "---> filename not provided. You must provide a filename with correct format\n"
            "You may get a template using --download-template \n"
            "Alternatively, you can user either --info or --onweb option to get information about the challenge.\n"
            "https://github.com/dreamtools/dreamtools, or http://dreamchallenges.org\n"
        )
        print_color(txt, red)
        sys.exit()

    # filename
    # The parser yields a list of filenames; every file must exist, and a
    # list of length 1 is converted into a single string.
    for filename in options.filename:
        if not os.path.exists(filename):
            raise IOError("file %s does not seem to exists" % filename)
    if len(options.filename) == 1:
        options.filename = options.filename[0]

    print_color("DREAMTools scoring", purple, underline=True)
    print('Challenge %s (sub challenge %s)\n\n' %
          (options.challenge, options.sub_challenge))

    res = generic_scoring(options.challenge,
                          options.filename,
                          subname=options.sub_challenge,
                          goldstandard=options.goldstandard)

    # Build the final report: a header followed by one entry per result key
    pieces = ["Solution for %s in challenge %s" % (options.filename,
                                                   options.challenge)]
    if options.sub_challenge is not None:
        pieces.append(" (sub-challenge %s)" % options.sub_challenge)
    pieces.append(" is :\n")
    for k in sorted(res.keys()):
        pieces.append(darkgreen("     %s:\n %s\n" % (k, res[k])))
    txt = "".join(pieces)
    print(txt)