def _download_sequana_db1(self, verbose=True):
    """Download the sequana_db1 kraken database (~8 Gb) from Synapse.

    Files are fetched into ``<sequana_config_path>/sequana_db1`` (and its
    ``taxonomy`` sub-directory) only when absent or when their MD5 checksum
    does not match the expected value. Finally, the annotations CSV is
    fetched from the sequana data repository on github.

    :param bool verbose: kept for backward compatibility (unused here)
    """
    from easydev import md5

    dbname = "sequana_db1"
    dir1 = sequana_config_path + os.sep + dbname
    dir2 = dir1 + os.sep + "taxonomy"
    self.dv.mkdir(dir1)
    self.dv.mkdir(dir2)

    logger.info("Downloading about 8Gb of data (if not already downloaded)"
                " from Synapse into %s" % dir1)

    def _fetch(basename, md5sum, synapse_id, target_dir):
        # Download one file unless already present with the right checksum.
        # BUGFIX: the original built paths without os.sep (e.g.
        # ".../sequana_db1ena_list.txt") and checked the taxonomy files in
        # dir1 while downloading them into dir2 -- so the cache test never
        # matched and everything was re-downloaded on each call.
        filename = target_dir + os.sep + basename
        if os.path.exists(filename) and md5(filename) == md5sum:
            pass  # already downloaded and valid
        else:
            self._download_from_synapse(synapse_id, target_dir)

    _fetch("ena_list.txt", "a9cc6268f3338d1632c4712a412593f2",
           'syn6171700', dir1)
    # database.idx
    _fetch("database.idx", "2fa4a99a4f52f2f04c5a965adb1534ac",
           'syn6171017', dir1)
    # database.kdb ; this one is large (8Gb)
    _fetch("database.kdb", "ff698696bfc88fe83bc201937cd9cbdf",
           'syn6171107', dir1)
    # Then, the taxonomy directory
    _fetch("names.dmp", "10bc7a63c579de02112d125a51fd65d0",
           'syn6171286', dir2)
    _fetch("nodes.dmp", "a68af5a60434e2067c4a0a16df873980",
           'syn6171289', dir2)
    _fetch("taxons.txt", "e78fbb43b3b41cbf4511d6af16c0287f",
           'syn6171290', dir2)
    logger.info('done. You should have a kraken DB in %s' % dir1)

    # The annotations
    wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
         dir1 + os.sep + "annotations.csv")
def _download_minikraken(self, verbose=True):
    """Download the minikraken database (~4 Gb) into the sequana config path.

    The tarball is skipped when already present on disk with the expected
    MD5 checksum.

    :param bool verbose: kept for backward compatibility (unused here)
    """
    # BUGFIX: md5 was used below without being imported anywhere in this
    # function (other methods import it locally), raising a NameError as
    # soon as the tarball was already present on disk.
    from easydev import md5

    dv = DevTools()
    # NOTE(review): the empty-string suffix means the tarball lands directly
    # in sequana_config_path; presumably a sub-directory name (e.g.
    # "minikraken") was intended -- confirm with callers before changing,
    # since they may rely on the current layout.
    base = sequana_config_path + os.sep + ""
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    logger.info("Downloading minikraken (4Gb)")

    filename = base + os.sep + "minikraken.tgz"
    if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
        logger.warning("%s already present" % filename)
    else:
        wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
def download(self, uncompress=True):
    """Download the datasets (tar.gz) and uncompress them

    :param bool uncompress: if True, uncompress the tar.gz and delete it
    """
    url = "http://busco.ezlab.org/v2/datasets"
    for name in self.filenames:
        archive = name + ".tar.gz"
        source = url + "/" + archive
        target = self.base + "/" + archive
        print(source)
        wget(source, target)
        # TODO untar datasets and cleanup the tar.gz
        if uncompress:
            # unpack into self.base, then drop the archive itself
            execute("tar xvfz %s -C %s" % (target, self.base))
            execute("rm -f %s" % (target))
def download(self, uncompress=True):
    """Download the datasets (tar.gz) and uncompress them

    :param bool uncompress: if True, uncompress the tar.gz and delete it
    """
    url = "http://busco.ezlab.org/v2/datasets"
    for fname in self.filenames:
        tarball = fname + ".tar.gz"
        remote = "/".join([url, tarball])
        local = "/".join([self.base, tarball])
        print(remote)
        wget(remote, local)
        # TODO untar datasets and cleanup the tar.gz
        if uncompress:
            # extract alongside the other datasets, then remove the tarball
            execute("tar xvfz %s -C %s" % (local, self.base))
            execute("rm -f %s" % local)
def krakendb():
    """Ensure the kraken toy database is available (test fixture helper).

    Tries the sequana taxonomy entry point first; on the TravisCI failure
    mode (appdirs returning None instead of the expected user config path)
    it falls back to downloading the four database files by hand into
    ``$HOME/database``. A SystemExit from the entry point is ignored.
    """
    # todo
    try:
        taxonomy.main([prog, '--download', 'toydb'])
    except TypeError:
        # Fails on travis so we download manually (appdirs returns
        # none instead of the expected user config path)
        from easydev import mkdirs
        from sequana.misc import wget
        HOME = os.getenv('HOME')
        baseurl = "https://github.com/sequana/data/raw/master/kraken_toydb/"
        filenames = [
            "database.idx",
            "database.kdb",
            "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"]
        # BUGFIX: the URL was built with os.sep (a platform path separator,
        # wrong on Windows and redundant since baseurl already ends with
        # '/'); also hoisted the imports and the loop-invariant mkdirs call
        # out of the download loop.
        mkdirs(HOME + os.sep + "database/taxonomy")
        for filename in filenames:
            wget(baseurl + filename,
                 os.sep.join([HOME, "database", filename]))
    except SystemExit:
        pass
def _download_kraken_toydb(self, verbose=True):
    """Download the kraken DB toy example from sequana_data into
    .config/sequana directory. Checks the md5 checksums. About 32Mb of data
    """
    dv = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)

    # expected MD5 checksum for each file, keyed by its relative path
    expected = [
        ("database.idx", "28661f8baf0514105b0c6957bec0fc6e"),
        ("database.kdb", "97a39d44ed86cadea470352d6f69748d"),
        ("taxonomy/names.dmp", "d91a0fcbbc0f4bbac918755b6400dea6"),
        ("taxonomy/nodes.dmp", "c8bae69565af2170ece194925b5fdeb9")]

    for relpath, md5sum in expected:
        url = baseurl + "kraken_toydb/%s" % relpath
        local = base + os.sep + relpath
        if os.path.exists(local) and md5(local) == md5sum:
            logger.warning("%s already present" % local)
        else:
            logger.info("Downloading %s" % url)
            wget(url, local)
def _download_kraken_toydb(self, verbose=True):
    """Download the kraken DB toy example from sequana_data into
    .config/sequana directory. Checks the md5 checksums. About 32Mb of data
    """
    dv = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)

    # relative path -> expected MD5 checksum (dict preserves this order)
    checksums = {
        "database.idx": "28661f8baf0514105b0c6957bec0fc6e",
        "database.kdb": "97a39d44ed86cadea470352d6f69748d",
        "taxonomy/names.dmp": "d91a0fcbbc0f4bbac918755b6400dea6",
        "taxonomy/nodes.dmp": "c8bae69565af2170ece194925b5fdeb9"}

    for relname in checksums:
        url = baseurl + "kraken_toydb/%s" % relname
        destination = base + os.sep + relname
        if not (os.path.exists(destination)
                and md5(destination) == checksums[relname]):
            logger.info("Downloading %s" % url)
            wget(url, destination)
        else:
            logger.warning("%s already present" % destination)
def __init__(self, names, nodes):
    """Load an NCBI taxonomy dump (names.dmp / nodes.dmp pair).

    :param names: can be a local file or URL
    :param nodes: can be a local file or URL
    """
    # Path to existing files
    logger.info("Reading input files")
    self.names = names
    self.nodes = nodes

    # First, the nodes: read from disk, or download to a temporary file
    # when given a URL.
    if os.path.exists(nodes):
        self.df_nodes = pd.read_csv(nodes, sep="|", header=None)
    else:
        with TempFile() as fout_nodes:
            logger.info("Loading nodes.dmp from an URL {}".format(nodes))
            wget(nodes, fout_nodes.name)
            self.df_nodes = pd.read_csv(fout_nodes.name, sep="|", header=None)
    # dmp fields are tab-padded around the '|' separators; strip the tabs
    # from every string (object-dtype) column.
    for i, _type in enumerate(self.df_nodes.dtypes):
        if _type == "O":
            self.df_nodes[i] = self.df_nodes[i].str.strip('\t')

    # nodes.dmp fields (NCBI taxonomy dump format):
    #   tax_id        -- node id in GenBank taxonomy database
    #   parent tax_id -- parent node id in GenBank taxonomy database
    #   rank          -- rank of this node (superkingdom, kingdom, ...)
    #   embl code     -- locus-name prefix; not unique
    #   division id   -- see division.dmp file
    #   inherited div flag (1 or 0)  -- 1 if node inherits division from parent
    #   genetic code id              -- see gencode.dmp file
    #   inherited GC flag (1 or 0)   -- 1 if node inherits genetic code
    #                                   from parent
    #   mitochondrial genetic code id -- see gencode.dmp file
    #   inherited MGC flag (1 or 0)  -- 1 if node inherits mitochondrial
    #                                   gencode from parent
    #   GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank
    #                                   entry lineage
    #   hidden subtree root flag (1 or 0) -- 1 if this subtree has no
    #                                   sequence data yet
    #   comments      -- free-text comments and citations
    try:
        self.df_nodes.columns = [
            "taxid", "parent", "rank", 4, 5, "gc_id", "mt_id", 7, 8, 9,
            10, 11, 12, 13
        ]
        del self.df_nodes[13]
    except ValueError:
        # BUGFIX: was a bare ``except:`` that also swallowed SystemExit and
        # KeyboardInterrupt. A shorter dump (fewer fields) makes pandas
        # raise ValueError on the column-length mismatch; fall back to the
        # reduced layout in that case.
        self.df_nodes.columns = ["taxid", "parent", "rank", 4, 5]
        del self.df_nodes[5]

    # make sure they are ordered by taxon ID
    self.df_nodes.sort_values("taxid", inplace=True)
    self.df_nodes.set_index("taxid", inplace=True)

    # now we read the names (same local-file-or-URL logic as above)
    if os.path.exists(names):
        self.df_names = pd.read_csv(names, sep="|", header=None)
    else:
        with TempFile() as fout_names:
            logger.info("Loading names.dmp from an URL {}".format(names))
            wget(names, fout_names.name)
            self.df_names = pd.read_csv(fout_names.name, sep="|", header=None)
    for i, _type in enumerate(self.df_names.dtypes):
        if _type == "O":
            self.df_names[i] = self.df_names[i].str.strip('\t')
    # drop the trailing empty column produced by the final '|' separator
    del self.df_names[4]
    self.df_names.columns = ['taxid', 'name', 'unique_name', 'key']
    self.df_names.set_index("taxid", inplace=True)