def to_js(self, output="krona.html", onweb=False):
    """Create the Krona HTML page from the Kraken results.

    :param str output: name of the HTML file created by ``ktImportText``
    :param bool onweb: if True, open *output* in a web browser
        (requires :mod:`easydev`)

    If the intermediate Krona text file has not been created yet,
    :meth:`kraken_to_krona` is called first.
    """
    # Lazily build the Krona input the first time this method is called.
    # (idiom fix: was "== False"; the unused "status" binding was dropped)
    if not self._data_created:
        self.kraken_to_krona()
    execute("ktImportText %s -o %s" % (self.output_filename, output))
    if onweb is True:
        import easydev
        easydev.onweb(output)
def _build_kraken(self):
    """Run ``kraken-build --rebuild`` using the options in ``self.params``.

    Reads dbname, minimizer_len, max_db_size, threads, kmer_length and
    hash_size from ``self.params`` and interpolates them into the
    kraken-build command line.
    """
    print('Building the kraken db ')
    # kraken-build expects an integer for --jellyfish-hash-size; the stored
    # value may be a string/float, so normalise it first.
    self.params['hash_size'] = int(self.params["hash_size"])
    # NOTE: the backslashes are continuations *inside* the triple-quoted
    # string, so the final command is a single line.
    cmd = """kraken-build --rebuild -db %(dbname)s \
 --minimizer-len %(minimizer_len)s\
 --max-db-size %(max_db_size)s \
 --threads %(threads)s\
 --kmer-len %(kmer_length)s \
 --jellyfish-hash-size %(hash_size)s""" % self.params
    # again, kraken-build prints on stderr so we cannot use easydev.shellcmd
    execute(cmd)
def download(self, uncompress=True):
    """Download the datasets (tar.gz) and uncompress them

    :param bool uncompress: if True, uncompress the tar.gz and delete it
    """
    url = "http://busco.ezlab.org/v2/datasets"
    for name in self.filenames:
        archive = name + ".tar.gz"
        destination = self.base + "/" + archive
        remote = url + "/" + archive
        print(remote)
        wget(remote, destination)
        # TODO untar datasets and cleanup the tar.gz
        if uncompress:
            execute("tar xvfz %s -C %s" % (destination, self.base))
            execute("rm -f %s" % (destination))
def download(self, uncompress=True):
    """Download the datasets (tar.gz) and uncompress them

    :param bool uncompress: if True, uncompress the tar.gz and delete it
    """
    base_url = "http://busco.ezlab.org/v2/datasets"
    for fname in self.filenames:
        tarball = fname + ".tar.gz"
        local_path = self.base + "/" + tarball
        # show the URL being fetched, then download it next to the others
        print(base_url + "/" + tarball)
        wget(base_url + "/" + tarball, local_path)
        # TODO untar datasets and cleanup the tar.gz
        if uncompress:
            execute("tar xvfz %s -C %s" % (local_path, self.base))
            execute("rm -f %s" % (local_path))
def clean_db(self): """Once called, you will not be able to append more FASTA files """ # Now we can clean the kraken db: print('Cleaning the kraken db ') # Clean the nodes.dmp and names.dmp print('Identifying the GI numbers') gis = self.get_gis() taxons = self.get_taxons_from_gis(gis) print("") self.gis = gis self.taxons = taxons # This cleans the nodes.dmp and names.dmp. This must be done # before kraken-build --clean since it requires the gi_taxid_nucl.dmp # file names_file = self.taxon_path + os.sep + "names.dmp" nodes_file = self.taxon_path + os.sep + "nodes.dmp" names_file_temp = self.taxon_path + os.sep + "names_temp.dmp" nodes_file_temp = self.taxon_path + os.sep + "nodes_temp.dmp" taxon_file_reader = NCBITaxonReader(names=names_file, nodes=nodes_file, verbose=True) print("Filtering") taxon_file_reader.filter_nodes_dmp_file(nodes_file, nodes_file_temp, taxons=taxons) taxon_file_reader.filter_names_dmp_file(names_file, names_file_temp, taxons=taxons) # mv the new files into the old ones os.rename(names_file_temp, names_file) os.rename(nodes_file_temp, nodes_file) # Finally, the kraken cleaning itself cmd = "kraken-build --clean --db %s" % self.params['dbname'] execute(cmd)
def download_taxonomy(self, force=False):
    """Download kraken data, once for all instead of doing it for each build.

    The downloaded file is large (1.3Gb) and the unzipped file is about
    9Gb. If already present, do not download the file except if the
    *force* parameter is set to True.

    .. note:: *force* is currently not consulted in the body below —
       the decision is based on file presence + md5 only. TODO confirm
       whether force should bypass the md5 check.
    """
    # valid with kraken 1.1 urls
    # ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
    # ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_wgs.accession2taxid.gz
    # ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

    # BUGFIX: FTP was previously defined inside the first branch only,
    # raising a NameError when the first file was already present but the
    # second one still had to be downloaded. Define it up-front instead.
    FTP = "ftp.ncbi.nih.gov"

    # If the requested file exists (and matches its md5), nothing to do
    expected_filename = self.taxon_path + os.sep + "gi_taxid_nucl.dmp"
    expected_md5 = "8c182ac2df452d836206ad13275cd8af"
    print('\nDownloading taxonomy files. Takes a while depending on your connection')
    if os.path.exists(expected_filename) is False or \
            md5(expected_filename) != expected_md5:
        # download taxonomy
        # We could use kraken-build --download-taxonomy + a subprocess but
        # even simpler to get the file via ftp
        execute("wget %s/pub/taxonomy/gi_taxid_nucl.dmp.gz --directory-prefix %s"
            % (FTP, self.taxon_path))
        # Unzip the files
        execute('unpigz %s/gi_taxid_nucl.dmp.gz' % self.taxon_path)
    else:
        print("Found local expected file %s " % expected_filename)

    expected_filename = self.taxon_path + os.sep + "names.dmp"
    expected_md5 = "90d88912ad4c94f6ac07dfab0443da9b"
    if os.path.exists(expected_filename) is False or \
            md5(expected_filename) != expected_md5:
        execute("wget %s/pub/taxonomy/taxdump.tar.gz --directory-prefix %s"
            % (FTP, self.taxon_path))
        execute('tar xvfz %s/taxdump.tar.gz -C %s'
            % (self.taxon_path, self.taxon_path))
    else:
        print("Found local expected file %s " % expected_filename)
def main(args=None):
    """Entry point for the compressor standalone.

    :param list args: command-line arguments; defaults to :data:`sys.argv`

    Validates the options, writes a temporary YAML config for the
    ``compressor`` Snakemake module, builds the ``snakemake`` command
    line and executes it.
    """
    user_options = Options(prog="sequana")
    if args is None:
        args = sys.argv

    # If --help or no options provided, show the help
    if len(args) == 1:
        # parse_args on --help prints usage and exits the process
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    if options.version:
        import sequana
        print(sequana.version)
        sys.exit()

    # job limit guard, can be bypassed explicitly
    if options.jobs > 20 and options.bypass is False:
        raise ValueError('The number of jobs is limited to 20. You can ' +
            'force this limit by using --bypass-job-limit')

    # On the Institut Pasteur TARS cluster a scheduler option is mandatory
    # (except when only unlocking the working directory)
    if misc.on_cluster("tars-") and options.unlock is False:
        if options.cluster is None:
            raise ValueError("You are on TARS (Institut Pasteur). You " +
                " must use --cluster option to provide the scheduler " +
                " options (typically ' --cluster 'sbatch --qos normal' )")

    # valid codecs: fastq/fq with no compression or bz2/gz/dsrc.
    # rstrip(".") turns e.g. "fastq." (empty codec) into plain "fastq".
    valid_extensions = [("fastq." + ext2).rstrip(".")
                        for ext2 in ['', 'bz2', 'gz', 'dsrc']]
    valid_extensions += [("fq." + ext2).rstrip(".")
                         for ext2 in ['', 'bz2', 'gz', 'dsrc']]
    # any ordered pair of distinct extensions is a valid conversion
    valid_combos = [(x, y) for x in valid_extensions
                    for y in valid_extensions
                    if x != y]
    if (options.source, options.target) not in valid_combos:
        raise ValueError("""--target and --source combo not valid.
Must be one of fastq, fastq.gz, fastq.bz2 or fastq.dsrc""")

    # Create the config file locally
    module = Module("compressor")
    with TempFile(suffix=".yaml", dir=".") as temp:
        cfg = SequanaConfig(module.config)
        cfg.config.compressor.source = options.source
        cfg.config.compressor.target = options.target
        cfg.config.compressor.recursive = options.recursive
        cfg.config.compressor.verbose = options.verbose
        cfg.config.compressor.threads = options.threads
        cfg._update_yaml()
        cfg.save(filename=temp.name)

        # The Snakefile can stay in its original place:
        rule = module.path + os.sep + "compressor.rules"

        # Run the snakemake command itself.
        cmd = 'snakemake -s %s --configfile %s -j %s ' % \
            (rule, temp.name, options.jobs)
        if options.dryrun:
            cmd += " --dryrun "
        if options.verbose is False:
            cmd += " --quiet "
        else:
            cmd += " -p "
        # for slurm only: --cores-per-socket
        if options.cluster:
            cluster = ' --cluster "%s" ' % options.cluster
            cmd += cluster
        if options.snakemake:
            # -s and -j are already set above; forbid overriding them
            if " -s " in options.snakemake or " -j " in options.snakemake:
                raise ValueError("-s or -j cannot be used in " +
                    " --snakemake-options (already used internally")
            cmd += options.snakemake
        if options.unlock:
            cmd += " --unlock "

        if options.verbose:
            print(cmd)

        # On travis, snakemake.shell command from snakemake fails.
        # Most probably because travis itself uses a subprocess.
        # excute from easydev uses pexpect.spawn, which seems to work well
        from easydev import execute
        execute(cmd, showcmd=False)
def test_only(qtbot):
    """Smoke test: launch sequanix with the testing flags and no splash screen.

    NOTE(review): ``qtbot`` is not used in the body; presumably the
    pytest-qt fixture is requested only to provide a Qt context — confirm.
    """
    from easydev import execute
    execute("sequanix --no-splash --testing")
def download_list(self):
    """Download all standard lists of accession numbers from ENA.

    For each entry of ``self._metadata``, fetches
    ``http://www.ebi.ac.uk/genomes/<name>`` into a local file of the same
    name (wget: quiet, up to 3 retries).
    """
    # Only the values are used; the previous .items() loop left the key
    # unused, so iterate over .values() directly.
    for values in self._metadata.values():
        execute("wget -q -t 3 http://www.ebi.ac.uk/genomes/%s -O %s"
            % (values[0], values[0]))