Exemple #1
0
    def _download_sequana_db1(self, verbose=True):
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info("Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        from os.path import exists
        filename = dir1 + "ena_list.txt"
        if exists(filename) and md5(filename) == "a9cc6268f3338d1632c4712a412593f2":
            pass
        else:
            self._download_from_synapse('syn6171700', dir1)

        # database.idx
        filename = dir1 + "database.idx"
        if exists(filename) and md5(filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
            pass
        else:
            self._download_from_synapse('syn6171017', dir1)

        # database.kdb ; this one is large (8Gb)
        filename = dir1 + "database.kdb"
        if exists(filename) and md5(filename) == "ff698696bfc88fe83bc201937cd9cbdf":
            pass
        else:
            self._download_from_synapse('syn6171107', dir1)

        # Then, the taxonomy directory
        filename = dir1 + "names.dmp"
        if exists(filename) and md5(filename) == "10bc7a63c579de02112d125a51fd65d0":
            pass
        else:
            self._download_from_synapse('syn6171286', dir2)

        filename = dir1 + "nodes.dmp"
        if exists(filename) and md5(filename) == "a68af5a60434e2067c4a0a16df873980":
            pass
        else:
            self._download_from_synapse('syn6171289', dir2)

        filename = dir1 + "taxons.txt"
        if exists(filename) and md5(filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
            pass
        else:
            self._download_from_synapse('syn6171290', dir2)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Exemple #2
0
    def _download_sequana_db1(self, verbose=True):
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info("Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        from os.path import exists
        filename = dir1 + "ena_list.txt"
        if exists(filename) and md5(filename) == "a9cc6268f3338d1632c4712a412593f2":
            pass
        else:
            self._download_from_synapse('syn6171700', dir1)

        # database.idx
        filename = dir1 + "database.idx"
        if exists(filename) and md5(filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
            pass
        else:
            self._download_from_synapse('syn6171017', dir1)

        # database.kdb ; this one is large (8Gb)
        filename = dir1 + "database.kdb"
        if exists(filename) and md5(filename) == "ff698696bfc88fe83bc201937cd9cbdf":
            pass
        else:
            self._download_from_synapse('syn6171107', dir1)

        # Then, the taxonomy directory
        filename = dir1 + "names.dmp"
        if exists(filename) and md5(filename) == "10bc7a63c579de02112d125a51fd65d0":
            pass
        else:
            self._download_from_synapse('syn6171286', dir2)

        filename = dir1 + "nodes.dmp"
        if exists(filename) and md5(filename) == "a68af5a60434e2067c4a0a16df873980":
            pass
        else:
            self._download_from_synapse('syn6171289', dir2)

        filename = dir1 + "taxons.txt"
        if exists(filename) and md5(filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
            pass
        else:
            self._download_from_synapse('syn6171290', dir2)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Exemple #3
0
    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Exemple #4
0
    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Exemple #5
0
    def download(self, uncompress=True):
        """Download the datasets (tar.gz) and uncompress them

        :param bool uncompress: if True, uncompress the tar.gz and delete it
        """
        url = "http://busco.ezlab.org/v2/datasets"
        for filename in self.filenames:
            basename = filename + ".tar.gz"
            target = self.base + "/" + basename
            print(url + "/" + basename)
            wget(url + "/" + basename, target)
            # TODO untar datasets and cleanup the tar.gz
            if uncompress:
                execute("tar xvfz %s -C %s" % (target, self.base))
                execute("rm -f %s" % (target))
Exemple #6
0
    def download(self, uncompress=True):
        """Download the datasets (tar.gz) and uncompress them

        :param bool uncompress: if True, uncompress the tar.gz and delete it
        """
        url = "http://busco.ezlab.org/v2/datasets"
        for filename in self.filenames:
            basename = filename + ".tar.gz"
            target = self.base + "/" + basename
            print(url + "/" + basename)
            wget(url + "/" + basename, target)
            # TODO untar datasets and cleanup the tar.gz 
            if uncompress:
                execute("tar xvfz %s -C %s" % (target, self.base))
                execute("rm -f %s" % ( target))
def krakendb():
    # todo
    try:
        taxonomy.main([prog, '--download', 'toydb'])
    except TypeError:  # Fails on travis so we download manually (appdirs returns
        # none instead of the expected user config path
        HOME = os.getenv('HOME')
        from sequana.misc import wget
        baseurl = "https://github.com/sequana/data/raw/master/kraken_toydb/"
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]
        for filename in filenames:
            from easydev import mkdirs
            mkdirs(HOME + os.sep + "database/taxonomy")
            wget(baseurl + os.sep + filename,
                 os.sep.join([HOME, "database", filename]))
    except SystemExit:
        pass
Exemple #8
0
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
Exemple #9
0
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"]
        filenames = [
            "database.idx",
            "database.kdb",
            "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
Exemple #10
0
    def __init__(self, names, nodes):
        """

        :param names: can be a local file or URL
        :param nodes: can be a local file or URL

        """
        # Path to existing files
        logger.info("Reading input files")
        self.names = names
        self.nodes = nodes

        # First, the nodes
        if os.path.exists(nodes):
            self.df_nodes = pd.read_csv(nodes, sep="|", header=None)
        else:
            with TempFile() as fout_nodes:
                logger.info("Loading nodes.dmp from an URL {}".format(nodes))
                wget(nodes, fout_nodes.name)
                self.df_nodes = pd.read_csv(fout_nodes.name,
                                            sep="|",
                                            header=None)
        for i, _type in enumerate(self.df_nodes.dtypes):
            if _type == "O":
                self.df_nodes[i] = self.df_nodes[i].str.strip('\t')
        """
        tax_id                  -- node id in GenBank taxonomy database
        parent tax_id               -- parent node id in GenBank taxonomy database
        rank                    -- rank of this node (superkingdom, kingdom, ...) 
        embl code               -- locus-name prefix; not unique
        division id             -- see division.dmp file
        inherited div flag  (1 or 0)        -- 1 if node inherits division from parent
        genetic code id             -- see gencode.dmp file
        inherited GC  flag  (1 or 0)        -- 1 if node inherits genetic code from parent
        mitochondrial genetic code id       -- see gencode.dmp file
        inherited MGC flag  (1 or 0)        -- 1 if node inherits mitochondrial gencode from parent
        GenBank hidden flag (1 or 0)            -- 1 if name is suppressed in
        GenBank entry lineage
        hidden subtree root flag (1 or 0)       -- 1 if this subtree has no sequence data yet
        comments                -- free-text comments and citations
        """

        try:
            self.df_nodes.columns = [
                "taxid", "parent", "rank", 4, 5, "gc_id", "mt_id", 7, 8, 9, 10,
                11, 12, 13
            ]
            del self.df_nodes[13]
        except:
            self.df_nodes.columns = ["taxid", "parent", "rank", 4, 5]
            del self.df_nodes[5]

        # make sure they are ordered by taxon ID
        self.df_nodes.sort_values("taxid", inplace=True)
        self.df_nodes.set_index("taxid", inplace=True)

        # now we read the names
        if os.path.exists(names):
            self.df_names = pd.read_csv(names, sep="|", header=None)
        else:
            with TempFile() as fout_names:
                logger.info("Loading names.dmp from an URL {}".format(names))
                wget(names, fout_names.name)
                self.df_names = pd.read_csv(fout_names.name,
                                            sep="|",
                                            header=None)

        for i, _type in enumerate(self.df_names.dtypes):
            if _type == "O":
                self.df_names[i] = self.df_names[i].str.strip('\t')
        del self.df_names[4]
        self.df_names.columns = ['taxid', 'name', 'unique_name', 'key']
        self.df_names.set_index("taxid", inplace=True)