Example #1
def kraken2_full_build_hash(taxonomy, path_output, p):

    # Build hash table
    taxon_link = osp.join(path_output, "taxonomy")
    if osp.islink(taxon_link):
        logger.debug(f"removing existing link at {taxon_link}")
        os.unlink(taxon_link)
    os.symlink(taxonomy, taxon_link)
    cmd = [
        "kraken2-build",
        "--build",
        "--threads",
        f"{main.cores}",
        "--db",
        path_output,
        "--kmer-len",
        p['k'],
        "--minimizer-len",
        p['l'],
        "--minimizer-spaces",
        p['s'],
    ]
    bash_process(
        cmd,
        "Launching kraken2-build to build the KRAKEN2 hash; this will take a lot of time and memory"
    )
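
Every example in this listing shells out through a bash_process helper whose implementation is not shown. A minimal sketch of such a helper, assuming it logs the message and then runs the command with subprocess, using the shell only when the command is passed as a single pre-joined string (as in the find | xargs pipelines further down):

import logging
import subprocess

logger = logging.getLogger(__name__)

def bash_process(cmd, msg=""):
    """Hypothetical stand-in for the bash_process helper used throughout
    this listing; the real implementation may differ."""
    printable = cmd if isinstance(cmd, str) else " ".join(map(str, cmd))
    logger.info(f"{msg}: {printable}")
    # A plain list is executed directly; a pre-joined string goes through the
    # shell, because some callers build pipelines (find ... | xargs ...).
    subprocess.run(cmd, shell=isinstance(cmd, str), check=True)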
Example #2
def kraken2_clean(path_bins_hash, n_clusters):
    """ Use of kraken2-build --clean option to remove temporary files.
        No cleaning by default because the library is the same for various values of k, l and s
    """
    if n_clusters <= 1:
        logger.info(
            f"kraken2-build --clean for the single hash under {path_bins_hash}"
        )
        cmd = [
            "kraken2-build", "--clean", "--threads", f"{main.cores}", "--db",
            path_bins_hash
        ]
        bash_process(cmd, "Launching cleaning with kraken2-build --clean")

    else:
        logger.info(
            f"kraken2-build --clean, for all the hashes under {path_bins_hash}"
        )
        for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
            bin_id = f"{cluster}/"
            cmd = [
                "kraken2-build", "--clean", "--threads", f"{main.cores}",
                "--db",
                osp.join(path_bins_hash, bin_id)
            ]
            bash_process(cmd, "Launching cleaning with kraken2-build --clean")
    logger.info(f"Cleaning done")
Example #3
def kraken2_full_add_lib(path_refseq, path_output):
    """ Build the hash table with the same genomes, but without binning, for comparison """
    # todo: adapt for centrifuge as well
    delete_folder_if_exists(path_output)
    create_path(path_output)
    add_file_with_parameters(
        path_output, add_description="no binning database for comparison")

    logger.warning(
        "DO NOT INTERRUPT this process, or you will have to restart from scratch."
    )
    # Add genomes to the kraken2 library
    for folder in os.scandir(path_refseq):
        if not osp.isdir(folder.path):
            continue
        if any([to_omit in folder.name for to_omit in main.omit_folders]):
            logger.info(f"skipping {folder.name}")
            continue
        else:
            cmd = [
                "find", folder.path, "-name", "'*.fna'", "-print0", "|",
                "xargs", "-P", f"{main.cores}", "-0", "-I{}", "-n1",
                "kraken2-build", "--add-to-library", "{}", "--db", path_output
            ]
            bash_process(" ".join(cmd), "adding genomes for kraken2 libraries")
Example #4
 def centrifuge(self, fastq_input, folder_hash, arg="unknown"):
     """ Centrifuge calls
         https://ccb.jhu.edu/software/centrifuge/manual.shtml#command-line
     """
     hashes_file = [osp.join(folder_hash, f"cf_index.{i}.cf") for i in range(1, 4)]
     hash_root = osp.join(folder_hash, "cf_index")
     assert osp.isfile(hashes_file[0]), FileNotFoundError(f"Hash table not found ! {hash_root}*")
     self.hash_size[arg] = sum(osp.getsize(f) for f in hashes_file)
     self.logger.info(f'starting to classify reads from file ({f_size(fastq_input)}) {fastq_input}')
     self.logger.info(f'with centrifuge, {arg}. Hash table is ({f_size(self.hash_size[arg])}) {hash_root}*')
     out_path = f"{self.path_out}.{arg}" if self.db_type == "bins" else f"{self.path_out}"
     out_file = f"{out_path}.out"
     self.logger.info(f'output is {out_file}')
     self.cmd = [
         "centrifuge", "-x", hash_root, "-U", fastq_input,
         "-S", out_file, "--report-file", f"{out_path}.centrifuge-report.tsv",
         "--time", "--threads", f"{THREADS}",
     ]
     if self.dry_run:
         self.logger.debug(" ".join(self.cmd))
     else:
         bash_process(self.cmd, f"launching centrifuge classification on {fastq_input}")
         # Then do the kraken2 report
         cmd2 = ["centrifuge-kreport", "-x", hash_root, out_file, ">", f"{out_path}.report"]
         bash_process(" ".join(cmd2), f"launching centrifuge kreport on {fastq_input}")
Example #5
def add_library(path_refseq_binned, path_bins_hash, n_clusters, classifier):
    """ launch kraken2-build add-to-library. DELETE EXISTING FOLDER !!
        https://htmlpreview.github.io/?https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.html#custom-databases
    """
    create_n_folders(path_bins_hash, n_clusters)
    add_file_with_parameters(path_bins_hash,
                             add_description=f"cluster number = {n_clusters}")

    logger.info(
        f"{classifier} add_to_library, {n_clusters} clusters, under {path_bins_hash} "
    )
    for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
        bin_id = f"{cluster}/"

        if "kraken2" in classifier:
            # if the library already exists in another folder (other classifier parameters, same binning parameters), link to it
            existing_lib = glob(
                f"{osp.dirname(path_bins_hash)}/*/{bin_id}/library")
            path_new_lib = osp.join(path_bins_hash, bin_id, "library")

            # If the library has already been built, skip it
            if osp.isdir(path_new_lib):
                logger.debug(
                    f"Library {bin_id} already exists. Delete the folder if a reinstall is needed: {path_new_lib}"
                )
            # If it was built with other parameters (e.g. k=25), reuse it through a symlink
            elif len(existing_lib) > 0:
                os.symlink(existing_lib[0], path_new_lib)
            else:
                cmd = [
                    "find",
                    osp.join(path_refseq_binned,
                             bin_id), "-name", "'*.fna'", "-print0", "|",
                    "xargs", "-P", f"{main.cores}", "-0", "-I{}", "-n1",
                    "kraken2-build", "--add-to-library", "{}", "--db",
                    osp.join(path_bins_hash, bin_id)
                ]
                bash_process(" ".join(cmd),
                             "Adding genomes to kraken2 library")

        elif "centrifuge" in classifier:
            # Concat all .fna files in a bin into one file.
            path_fnas = osp.join(path_bins_hash, bin_id, "library.fna")
            if osp.isfile(path_fnas):
                logger.info(
                    f"Library file for centrifuge, bin {cluster} exists, skipping step"
                )
                continue
            with open(path_fnas, 'w') as concatenated_fna:
                logger.debug(
                    f"concatenating .fna files into {path_fnas} for the centrifuge library"
                )
                for path in tqdm(Path(path_refseq_binned,
                                      bin_id).rglob("*.fna"),
                                 leave=False):
                    with open(path) as fna:
                        concatenated_fna.write(fna.read())
        else:
            raise NotImplementedError(f"classifier unsupported {classifier}")
Example #6
 def kraken2(self, fastq_input, folder_hash, arg="unknown"):
     if "hash.k2d" in folder_hash: folder_hash = osp.dirname(folder_hash)
     hash_file = osp.join(folder_hash, "hash.k2d")
     assert osp.isfile(hash_file), FileNotFoundError(f"Hash table not found ! {hash_file}")
     self.hash_size[arg] = osp.getsize(hash_file)
     self.logger.info(f'starting to classify reads from file ({f_size(fastq_input)}) {fastq_input}')
     self.logger.info(f'with kraken2, {arg}. Hash table is ({f_size(hash_file)}) {hash_file}')
     formatted_out = f"{self.path_out}.{arg}" if self.db_type == "bins" else f"{self.path_out}"
     self.logger.info(f'output is {formatted_out}.out')
     self.cmd = [
         "kraken2", "--threads", f"{THREADS}",
         "--db", folder_hash,
         fastq_input,
         "--output", f"{formatted_out}.out",
         "--report", f"{formatted_out}.report",
     ]
     if self.dry_run:
         self.logger.debug(" ".join(self.cmd))
     else:
         bash_process(self.cmd, f"launching kraken2 classification on {fastq_input}")
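
The kraken2 and centrifuge methods above belong to a classifier wrapper class that is not shown. A minimal sketch of the attributes they read and write, inferred only from the two snippets (the class name and constructor arguments are hypothetical):

import logging

class ReadClassifier:
    """Sketch of the host object assumed by the kraken2()/centrifuge() methods above."""
    def __init__(self, path_out, db_type="full", dry_run=True):
        self.path_out = path_out    # prefix for the .out / .report output files
        self.db_type = db_type      # "bins" appends the bin id to output names
        self.dry_run = dry_run      # if True, only log the command instead of running it
        self.logger = logging.getLogger("classify")
        self.hash_size = {}         # filled in with the index size per database
        self.cmd = None             # last command built by a classification method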
Example #7
def build_indexes(path_taxonomy, path_classifier, n_clusters, p):
    """ launch kraken build on each bin
        https://htmlpreview.github.io/?https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.html#custom-databases
        Skip skipping by checking if folder exists: **check_step NO FOLDER CHECK** (DON'T REMOVE)
    """
    assert osp.isdir(path_taxonomy), logger.error(
        f"Path to taxonomy doesn't seem to be a directory: {path_taxonomy}")
    add_file_with_parameters(
        path_classifier,
        add_description=f"cluster = {n_clusters} \ntaxonomy = {path_taxonomy}")

    logger.info(
        f"{p['name']} building its {n_clusters} indexes, this will take a lot of time. Under: {path_classifier}"
    )
    for cluster in tqdm(range(n_clusters), dynamic_ncols=True):
        bin_id = f"{cluster}/"

        if "kraken2" in p['name']:
            # check if hash has already been done
            path_kraken2 = osp.join(path_classifier, bin_id)
            path_kraken2_hash = osp.join(path_kraken2, "hash.k2d")
            if osp.isfile(path_kraken2_hash) and not any(
                [fname.endswith('.tmp')
                 for fname in os.listdir(path_kraken2)]):
                logger.debug(
                    f"Hash table already created, skipping bin {cluster}")
                continue

            # add link to taxonomy
            taxon_in_cluster = osp.join(path_classifier, bin_id, "taxonomy")
            if osp.islink(taxon_in_cluster):
                logger.debug(f"removing existing link at {taxon_in_cluster}")
                os.unlink(taxon_in_cluster)
            os.symlink(path_taxonomy, taxon_in_cluster)

            # Build
            cmd = [
                "kraken2-build",
                "--build",
                "--threads",
                f"{main.cores}",
                "--db",
                path_kraken2,
                "--kmer-len",
                p['k'],
                "--minimizer-len",
                p['l'],
                "--minimizer-spaces",
                p['s'],
            ]
            bash_process(cmd, "launching kraken2-build")

        elif "centrifuge" in p['name']:
            path_bin = osp.join(path_classifier, bin_id)
            p_seqtxid = Path(path_classifier).parent.joinpath(
                "kraken2/k35_l31_s7", bin_id, "seqid2taxid.map").as_posix()
            path_lib = osp.join(path_bin, "library.fna")
            path_cf = osp.join(path_bin, "cf_index")

            # if cf_index.1.cf exists, no *.sa files are left over, and every *.cf file is non-empty, the index is complete
            if osp.isfile(f"{path_cf}.1.cf") and not list(Path(path_bin).rglob("*.sa")) \
                    and all([f.stat().st_size > 0 for f in Path(path_bin).rglob("*.cf")]):
                logger.info(
                    f"index has already been generated, skipping bin {cluster}"
                )
                continue

            cmd = [
                "centrifuge-build",
                "-p",
                f"{main.cores}",
                "--conversion-table",
                p_seqtxid,
                "--taxonomy-tree",
                osp.join(path_taxonomy, "nodes.dmp"),
                "--name-table",
                osp.join(path_taxonomy, "names.dmp"),
                path_lib,
                path_cf,
            ]
            bash_process(
                cmd,
                "launching centrifuge-build. Expect very long run time (in hours)"
            )

    logger.info(f"{p['name']} finished building hash tables. " + (
        "You can clean the intermediate files with: kraken2-build --clean {path_bins_hash}/<bin number>"
        if "kraken2" in p['name'] else
        "All files, except the index *.[123].cf, can be removed"))