def kraken2_full_build_hash(taxonomy, path_output, p):
    """Build the full (no-binning) kraken2 hash table under *path_output*.

    Links *taxonomy* into the DB folder as the "taxonomy" entry that
    ``kraken2-build`` expects, then runs ``kraken2-build --build`` with the
    k-mer / minimizer parameters taken from *p* (keys 'k', 'l', 's').

    :param taxonomy: path to the NCBI taxonomy folder (nodes.dmp, names.dmp)
    :param path_output: kraken2 database folder (library already added)
    :param p: dict of classifier parameters with keys 'k', 'l', 's'
    """
    # (Re)create the "taxonomy" symlink inside the DB folder; kraken2-build
    # requires it and a stale link would point at an old taxonomy.
    taxon_link = osp.join(path_output, "taxonomy")
    if osp.islink(taxon_link):
        logger.debug(f"removing existing link at {taxon_link}")
        os.unlink(taxon_link)
    os.symlink(taxonomy, taxon_link)

    # Subprocess arguments must be strings: explicitly stringify the
    # k/l/s parameters like f"{main.cores}" is, instead of passing them
    # through raw (str() is a no-op if they are already strings).
    cmd = [
        "kraken2-build", "--build",
        "--threads", f"{main.cores}",
        "--db", path_output,
        "--kmer-len", str(p['k']),
        "--minimizer-len", str(p['l']),
        "--minimizer-spaces", str(p['s']),
    ]
    bash_process(
        cmd,
        f"Launching CMD to build KRAKEN2 Hash, will take lots of time and memory: "
    )
def kraken2_clean(path_bins_hash, n_clusters):
    """ Use of kraken2-build --clean option to remove temporary files.
        No cleaning by default because the library is the same
        for various values of k, l and s
    """
    # Both code paths announce the same thing, so log once up front.
    logger.info(
        f"kraken2-build --clean, for all the hashes under {path_bins_hash}")
    if n_clusters <= 1:
        # Single full database: clean it directly.
        bash_process(
            ["kraken2-build", "--clean", "--threads", f"{main.cores}",
             "--db", path_bins_hash],
            "Launching cleaning with kraken2-build --clean")
    else:
        # One database per bin: clean each bin's folder in turn.
        for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
            db_folder = osp.join(path_bins_hash, f"{cluster}/")
            bash_process(
                ["kraken2-build", "--clean", "--threads", f"{main.cores}",
                 "--db", db_folder],
                "Launching cleaning with kraken2-build --clean")
    logger.info(f"Cleaning done")
def kraken2_full_add_lib(path_refseq, path_output):
    """ Build the hash table with the same genomes, but without binning, for comparison """
    # todo: adapt for centrifuge as well
    delete_folder_if_exists(path_output)
    create_path(path_output)
    add_file_with_parameters(
        path_output, add_description=f"no binning database for comparison")
    logger.warning(
        f"DO NOT INTERRUPT this process, you will have restart from scratches."
    )

    # Feed every RefSeq genome (*.fna) into the kraken2 library, skipping
    # anything that is not a directory and any folder the user wants omitted.
    for entry in os.scandir(path_refseq):
        if not osp.isdir(entry.path):
            continue
        if any(skip in entry.name for skip in main.omit_folders):
            logger.info(f"skipping {entry.name}")
            continue
        # Shell pipeline: find the .fna files and fan them out to
        # parallel kraken2-build --add-to-library calls via xargs.
        shell_cmd = " ".join([
            "find", entry.path, "-name", "'*.fna'", "-print0", "|",
            "xargs", "-P", f"{main.cores}", "-0", "-I{}", "-n1",
            "kraken2-build", "--add-to-library", "{}", "--db", path_output,
        ])
        bash_process(shell_cmd, "adding genomes for kraken2 libraries")
def centrifuge(self, fastq_input, folder_hash, arg="unknown"):
    """ Run centrifuge classification on one fastq file.

        Centrifuge calls
        https://ccb.jhu.edu/software/centrifuge/manual.shtml#command-line

        :param fastq_input: path of the reads file to classify
        :param folder_hash: folder containing the cf_index.{1,2,3}.cf index
        :param arg: label suffixing the output when classifying per-bin DBs
        :raises FileNotFoundError: if the centrifuge index is missing
    """
    # A centrifuge index is split over three .cf files sharing one root name.
    hashes_file = [osp.join(folder_hash, f"cf_index.{i}.cf") for i in range(1, 4)]
    hash_root = osp.join(folder_hash, "cf_index")
    # Explicit raise instead of `assert ..., FileNotFoundError(...)`:
    # asserts are stripped under `python -O`, and the original assert's
    # "message" was an un-raised FileNotFoundError instance — raising that
    # exception was clearly the intent.
    if not osp.isfile(hashes_file[0]):
        raise FileNotFoundError(f"Hash table not found ! {hash_root}*")
    self.hash_size[arg] = sum(osp.getsize(f) for f in hashes_file)
    self.logger.info(f'start to classify reads from file ({f_size(fastq_input)}) {fastq_input}')
    self.logger.info(f'with centrifuge, {arg}. hash table is ({f_size(self.hash_size[arg])}) {hash_root}*')
    # Per-bin databases get one output file per bin, suffixed by `arg`.
    out_path = f"{self.path_out}.{arg}" if self.db_type == "bins" else f"{self.path_out}"
    out_file = f"{out_path}.out"
    self.logger.info(f'output is {out_file}')
    self.cmd = [
        "centrifuge",
        "-x", hash_root,
        "-U", fastq_input,
        "-S", out_file,
        "--report-file", f"{out_path}.centrifuge-report.tsv",
        "--time",
        "--threads", f"{THREADS}",
    ]
    if self.dry_run:
        self.logger.debug(" ".join(self.cmd))
    else:
        bash_process(self.cmd, f"launching centrifuge classification on {fastq_input}")
        # Then do the kraken2 report. A shell string is needed here
        # because of the ">" output redirection.
        cmd2 = ["centrifuge-kreport", "-x", hash_root, out_file, ">", f"{out_path}.report"]
        bash_process(" ".join(cmd2), f"launching centrifuge kreport on {fastq_input}")
def add_library(path_refseq_binned, path_bins_hash, n_clusters, classifier):
    """ launch kraken2-build add-to-library. DELETE EXISTING FOLDER !!
        https://htmlpreview.github.io/?https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.html#custom-databases

        Populates one library per bin under *path_bins_hash*:
        - kraken2: runs `kraken2-build --add-to-library` on each bin's .fna
          files, reusing (symlinking) a library already built by a sibling
          parameter set when one exists.
        - centrifuge: concatenates each bin's .fna files into one
          library.fna (centrifuge-build takes a single fasta input).

        :param path_refseq_binned: folder with one sub-folder of .fna per bin
        :param path_bins_hash: destination classifier folder (one sub-folder per bin)
        :param n_clusters: number of bins
        :param classifier: classifier name, must contain "kraken2" or "centrifuge"
        :raises NotImplementedError: for any other classifier name
    """
    create_n_folders(path_bins_hash, n_clusters)
    add_file_with_parameters(path_bins_hash,
                             add_description=f"cluster number = {n_clusters}")
    logger.info(
        f"{classifier} add_to_library, {n_clusters} clusters, under {path_bins_hash} "
    )
    for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
        bin_id = f"{cluster}/"
        if "kraken2" in classifier:
            # if library exist in another folder (other classifier parameters,
            # but same binning param), make a link to it !
            existing_lib = glob(
                f"{osp.dirname(path_bins_hash)}/*/{bin_id}/library")
            path_new_lib = osp.join(path_bins_hash, bin_id, "library")
            # If library has already been done, skip it
            if osp.isdir(path_new_lib):
                logger.debug(
                    f"Library {bin_id} already existing. Delete folder if reinstall needed: {path_new_lib}"
                )
            # If done with other parameters, k25, can reuse it
            elif len(existing_lib) > 0:
                os.symlink(existing_lib[0], path_new_lib)
            else:
                # find | xargs fans the bin's .fna files out to parallel
                # `kraken2-build --add-to-library` calls (run via a shell).
                cmd = [
                    "find", osp.join(path_refseq_binned, bin_id),
                    "-name", "'*.fna'", "-print0", "|",
                    "xargs", "-P", f"{main.cores}", "-0", "-I{}", "-n1",
                    "kraken2-build", "--add-to-library", "{}",
                    "--db", osp.join(path_bins_hash, bin_id)
                ]
                bash_process(" ".join(cmd), "Adding genomes to kraken2 library")
        elif "centrifuge" in classifier:
            # Concat all .fna files in a bin into one file.
            path_fnas = osp.join(path_bins_hash, bin_id, "library.fna")
            # An existing library.fna is assumed complete — delete it to redo.
            if osp.isfile(path_fnas):
                logger.info(
                    f"Library file for centrifuge, bin {cluster} exists, skipping step"
                )
                continue
            with open(path_fnas, 'w') as concatenated_fna:
                logger.debug(
                    f"for centrifuge library, concatenated fna files into {path_fnas}"
                )
                # NOTE(review): each genome is read fully into memory before
                # being written; fine for typical genome sizes, but a chunked
                # copy would be safer for very large .fna files.
                for path in tqdm(Path(path_refseq_binned, bin_id).rglob("*.fna"),
                                 leave=False):
                    with open(path) as fna:
                        concatenated_fna.write(fna.read())
        else:
            raise NotImplementedError(f"classifier unsupported {classifier}")
def kraken2(self, fastq_input, folder_hash, arg="unknown"):
    """ Run kraken2 classification of *fastq_input* against the hash table
        found in *folder_hash* (which may also point at the hash.k2d file).

        :param fastq_input: path of the reads file to classify
        :param folder_hash: kraken2 DB folder, or a path containing hash.k2d
        :param arg: label suffixing the output when classifying per-bin DBs
    """
    # Accept a path to the hash file itself and fall back to its folder.
    if "hash.k2d" in folder_hash:
        folder_hash = osp.dirname(folder_hash)
    hash_file = osp.join(folder_hash, "hash.k2d")
    assert osp.isfile(hash_file), FileNotFoundError(f"Hash table not found ! {hash_file}")
    self.hash_size[arg] = osp.getsize(hash_file)
    self.logger.info(f'start to classify reads from file ({f_size(fastq_input)}) {fastq_input}')
    self.logger.info(f'with kraken2, {arg}. hash table is ({f_size(hash_file)}) {hash_file}')
    # Per-bin databases get one output file per bin, suffixed by `arg`.
    if self.db_type == "bins":
        formatted_out = f"{self.path_out}.{arg}"
    else:
        formatted_out = f"{self.path_out}"
    self.logger.info(f'output is {formatted_out}.out')
    self.cmd = [
        "kraken2",
        "--threads", f"{THREADS}",
        "--db", folder_hash,
        fastq_input,
        "--output", f"{formatted_out}.out",
        "--report", f"{formatted_out}.report",
    ]
    # Dry runs only display the command that would have been launched.
    if self.dry_run:
        self.logger.debug(" ".join(self.cmd))
        return
    bash_process(self.cmd, f"launching kraken2 classification on {fastq_input}")
def build_indexes(path_taxonomy, path_classifier, n_clusters, p):
    """ launch kraken build on each bin
        https://htmlpreview.github.io/?https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.html#custom-databases
        Skip skipping by checking if folder exists: **check_step NO FOLDER CHECK** (DON'T REMOVE)

        Builds one index per bin under *path_classifier*, skipping bins whose
        index already looks complete (resumable after an interruption).

        :param path_taxonomy: folder containing nodes.dmp / names.dmp
        :param path_classifier: classifier folder with one sub-folder per bin
        :param n_clusters: number of bins
        :param p: parameter dict with keys 'name' (classifier) and 'k', 'l', 's'
    """
    # NOTE(review): the assert "message" is logger.error(...), which returns
    # None and is only evaluated when the assert fails; also stripped under -O.
    assert osp.isdir(path_taxonomy), logger.error(
        f"Path to taxonomy doesn't seem to be a directory: {path_taxonomy}")
    add_file_with_parameters(
        path_classifier,
        add_description=f"cluster = {n_clusters} \ntaxonomy = {path_taxonomy}")
    logger.info(
        f"{p['name']} build its {n_clusters} indexes, will take lots of time. Under: {path_classifier}"
    )
    for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
        bin_id = f"{cluster}/"
        if "kraken2" in p['name']:
            # check if hash has already been done: hash.k2d present and no
            # leftover *.tmp files (which would mean an interrupted build).
            path_kraken2 = osp.join(path_classifier, bin_id)
            path_kraken2_hash = osp.join(path_kraken2, "hash.k2d")
            if osp.isfile(path_kraken2_hash) and not any(
                    [fname.endswith('.tmp') for fname in os.listdir(path_kraken2)]):
                logger.debug(
                    f"Hash table already created, skipping bin {cluster}")
                continue
            # add link to taxonomy (replace a stale link if one exists)
            taxon_in_cluster = osp.join(path_classifier, bin_id, "taxonomy")
            if osp.islink(taxon_in_cluster):
                logger.debug(f"removing existing link at {taxon_in_cluster}")
                os.unlink(taxon_in_cluster)
            os.symlink(path_taxonomy, taxon_in_cluster)
            # Build
            # NOTE(review): p['k'], p['l'], p['s'] are passed unconverted;
            # presumably already strings — confirm, subprocess args must be str.
            cmd = [
                "kraken2-build", "--build", "--threads", f"{main.cores}",
                "--db", path_kraken2,
                "--kmer-len", p['k'],
                "--minimizer-len", p['l'],
                "--minimizer-spaces", p['s'],
            ]
            bash_process(cmd, "launching kraken2-build")
        elif "centrifuge" in p['name']:
            path_bin = osp.join(path_classifier, bin_id)
            # NOTE(review): the seqid2taxid.map is taken from a hard-coded
            # sibling kraken2 build (k35_l31_s7) — that build must exist first.
            p_seqtxid = Path(path_classifier).parent.joinpath(
                "kraken2/k35_l31_s7", bin_id, "seqid2taxid.map").as_posix()
            path_lib = osp.join(path_bin, "library.fna")
            path_cf = osp.join(path_bin, "cf_index")
            # if one cf_index.1.cf exists, and there's no more *.sa files,
            # and all *.cf files are not empty...
            if osp.isfile(f"{path_cf}.1.cf") and not list(Path(path_bin).rglob("*.sa")) \
                    and all([f.stat().st_size > 0 for f in Path(path_bin).rglob("*.cf")]):
                logger.info(
                    f"index has already been generated, skipping bin {cluster}"
                )
                continue
            cmd = [
                "centrifuge-build", "-p", f"{main.cores}",
                "--conversion-table", p_seqtxid,
                "--taxonomy-tree", osp.join(path_taxonomy, "nodes.dmp"),
                "--name-table", osp.join(path_taxonomy, "names.dmp"),
                path_lib, path_cf,
            ]
            bash_process(
                cmd,
                "launching centrifuge-build. Expect very long run time (in hours)"
            )
    # NOTE(review): "{path_bins_hash}" below is literal text (this string is
    # not an f-string, and no such variable exists here) — presumably meant
    # as a command template for the user; confirm it shouldn't interpolate
    # path_classifier.
    logger.info(f"{p['name']} finished building hash tables. " + (
        "You can clean the intermediate files with: kraken2-build --clean {path_bins_hash}/<bin number>"
        if "kraken2" in p['name']
        else "All files, except the index *.[123].cf, can be removed"))