Beispiel #1
0
def mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, outfile):
    """
    Build a pangenome file from the output of an mmseqs clustering.

    Steps:

    - run ``mmseqs createtsv`` to write ``<mmseqclust>.tsv``
    - hand over to ``mmseqs_tsv_to_pangenome`` to produce the pangenome file

    Parameters
    ----------
    mmseqdb : str
        path to base filename of output of mmseqs createdb
    mmseqclust : str
        path to base filename of output of mmseqs cluster
    logmmseq : str
        path to the log file; stdout and stderr of the mmseqs command are
        appended to it
    outfile : str
        pangenome filename

    Returns
    -------
    dict
        families, as returned by ``mmseqs_tsv_to_pangenome``
        ({fam_num: [all members]})
    """
    tsv_path = f"{mmseqclust}.tsv"
    createtsv_cmd = f"mmseqs createtsv {mmseqdb} {mmseqdb} {mmseqclust} {tsv_path}"
    failure_msg = "Problem while trying to convert mmseq result file to tsv file"
    logger.details(f"MMseqs command: {createtsv_cmd}")
    # Append to the log so that output of earlier mmseqs steps is kept
    with open(logmmseq, "a") as log_handle:
        utils.run_cmd(createtsv_cmd, failure_msg, eof=True,
                      stdout=log_handle, stderr=log_handle)
    # Second step: tsv file -> 'pangenome' file (one line per family)
    return mmseqs_tsv_to_pangenome(mmseqclust, logmmseq, outfile)
Beispiel #2
0
def run_mmseqs_clust(args):
    """
    Launch ``mmseqs cluster`` with the given settings.

    Parameters
    ----------
    args : tuple
        (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode), with:

            * mmseqdb: path to base filename (output created by mmseq db)
            * mmseqclust: path to base filename for output of mmseq clustering
            * tmpdir : path to folder which will contain mmseq temporary files
            * logmmseq : path to file where logs must be written (opened in append mode)
            * min_id : min percentage of identity to be considered in the same family
              (between 0 and 1)
            * threads : max number of threads to use
            * clust_mode : [0, 1, 2], 0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'

    """
    (mmseqdb, mmseqclust, tmpdir, logmmseq,
     min_id, threads, clust_mode) = args
    cluster_opts = f"--min-seq-id {min_id} --threads {threads} --cluster-mode {clust_mode}"
    cluster_cmd = f"mmseqs cluster {mmseqdb} {mmseqclust} {tmpdir} {cluster_opts}"
    logger.details(f"MMseqs command: {cluster_cmd}")
    failure_msg = f"Problem while clustering proteins with mmseqs. See log in {logmmseq}"
    # stdout and stderr of mmseqs both go to the same log file
    with open(logmmseq, "a") as log_handle:
        utils.run_cmd(cluster_cmd, failure_msg, eof=False,
                      stdout=log_handle, stderr=log_handle)
Beispiel #3
0
def run_fastme(alignfile, boot, write_boot, threads, model, outdir, quiet):
    """
    Run fastME on the given alignment.

    Output files are created in `outdir` and named after the basename of
    `alignfile`: ``<name>.fastme_tree.nwk`` (tree), ``<name>.fastme.log`` (log)
    and, if requested, ``<name>.fastme_bootstraps.nwk`` (bootstrap pseudo-trees).

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    write_boot: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    threads: int
        Maximum number of threads to use
    model: str or None
        DNA substitution model chosen by user. None if default one ("T")
    outdir: str
        output directory to save all results
    quiet: bool
        True if nothing must be printed to stderr/stdout, False otherwise
    """
    logger.info("Running FastME...")

    # Optional parts of the command line: left empty when not requested
    bootinfo = f"-b {boot}" if boot else ""
    threadinfo = f"-T {threads}" if threads else ""

    # Output filenames
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fastme.log")
    treefile = os.path.join(outdir, align_name + ".fastme_tree.nwk")
    # If bootstrap pseudo-trees must be written, define the filename here
    outboot = ""
    if write_boot:
        outboot = "-B " + os.path.join(outdir,
                                       align_name + ".fastme_bootstraps.nwk")
    # Put default model if not given
    if not model:
        model = "T"
    cmd = (f"fastme -i {alignfile} -d{model} -nB -s {threadinfo} {bootinfo} "
           f"-o {treefile} -I {logfile} {outboot}")
    logger.details(cmd)
    error = ("Problem while running FastME. See log file ({}) for "
             "more information.").format(logfile)
    # In quiet mode, stdout/stderr of fastme are discarded. The devnull handle
    # is closed whatever happens in run_cmd (it used to be leaked).
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        utils.run_cmd(cmd,
                      error,
                      stdout=fnull,
                      eof=True,
                      logger=logger,
                      stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
Beispiel #4
0
def create_mmseqs_db(mmseqdb, prt_path, logmmseq):
    """
    Create the mmseqs database of the protein bank (prt_path), unless a complete
    one is already there.

    - all expected database files exist: they are reused, nothing is run
    - `mmseqdb` exists but some associated files are missing: the files found
      are deleted and the database is created again
    - `mmseqdb` does not exist: the database is created

    Parameters
    ----------
    mmseqdb : str
         path to base filename for output of mmseqs createdb
    prt_path : str
        path to the file containing all proteins to cluster
    logmmseq : str
         path to file where logs must be written (overwritten)


    Returns
    -------
    bool
        True if mmseqs db just created, False if already existed
    """
    # Suffixes of all the files expected for a complete database
    suffixes = ["", ".index", ".dbtype", ".lookup", "_h", "_h.index", "_h.dbtype"]
    found = []
    if os.path.isfile(mmseqdb):
        found = [path for path in (mmseqdb + sfx for sfx in suffixes)
                 if os.path.isfile(path)]
        if len(found) == len(suffixes):
            # Complete database: nothing to do
            logger.warning(
                f"mmseqs database {mmseqdb} already exists. The program will "
                "use it.")
            return False
        # Incomplete database: clean up what is left before recreating it
        logger.warning(
            f"mmseqs database {mmseqdb} already exists, but at least 1 associated "
            "file (.dbtype, .index etc). is missing. The program will "
            "remove existing files and recreate the database.")
        for stale in found:
            os.remove(stale)
            logger.details(f"Removing '{stale}'.")
        # Everything that was found has been deleted
        found = []
    logger.debug("Existing files: {}".format(len(found)))
    logger.debug("Expected extensions: {}".format(len(suffixes)))
    createdb_cmd = f"mmseqs createdb {prt_path} {mmseqdb}"
    failure_msg = (f"Problem while trying to convert database {prt_path} to mmseqs "
                   "database format.")
    logger.details(f"MMseqs command: {createdb_cmd}")
    with open(logmmseq, "w") as log_handle:
        utils.run_cmd(createdb_cmd, failure_msg, eof=True,
                      stdout=log_handle, stderr=log_handle)
    return True
Beispiel #5
0
def compare_all(out_msh, matrix, npz_matrix, mash_log, threads):
    """
    Compare all pairs of genomes that have already been sketched in the given file.

    If either the text matrix or the npz matrix already exists, nothing is
    computed. Otherwise ``mash dist`` is run on ``<out_msh>.msh`` against itself:
    its stdout is written to `matrix` and its stderr is appended to `mash_log`.

    Parameters
    ----------
    out_msh : str
        output of mash (base filename, without the ``.msh`` extension)
    matrix : str
        File to put generated matrix of pairwise distances between all genomes
    npz_matrix : str
        matrix of pairwise distances saved in a binary file
    mash_log : str
        mash logfile
    threads :
        max number of threads to use

    Returns
    -------
    int
        return code: always 0 when the function returns
    """
    # Text matrix is checked first, then the npz one. The warning names the
    # file that was actually found (the npz case used to report `matrix`).
    for existing in (matrix, npz_matrix):
        if os.path.isfile(existing):
            logger.warning(
                "Matrix file {} already exists. The program will use this distance matrix "
                "to filter all genomes according to their distances.".format(
                    existing))
            return 0
    logger.info("Computing pairwise distances between all genomes")
    cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh"
    logger.details(cmd_dist)
    error_dist = (
        "Error while trying to estimate pairwise distances between all genomes. "
        f"See {mash_log}.")
    # matfile receives the matrix; mash_log is opened in append mode to add the
    # log of 'mash dist' to the log of 'mash sketch'. Both handles are closed
    # even if run_cmd raises or exits.
    with open(matrix, "w") as matfile, open(mash_log, "a") as outf:
        utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf)
    return 0
Beispiel #6
0
def run_fasttree(alignfile, boot, outdir, model, quiet):
    """
    Run FastTree on given alignment

    The tree is read from the stdout of FastTreeMP and written to
    ``<outdir>/<alignfile basename>.fasttree_tree.nwk``; the FastTree log goes to
    ``<outdir>/<alignfile basename>.fasttree.log``.

    Parameters
    ----------
    alignfile: str
        Path to file containing all families aligned, grouped by genome
    boot: int or None
        Number of bootstraps to calculate (None if no bootstrap asked)
    outdir: str
        Directory where the tree file and the log file are created
    model: str
        DNA substitution model
    quiet: bool
        True if nothing must be printed to stderr, False otherwise
        (stdout is always redirected to the tree file)
    """
    logger.info("Running FasttreeMP...")
    bootinfo = f"-boot {boot}" if boot else "-nosupport"
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fasttree.log")
    treefile = os.path.join(outdir, align_name + ".fasttree_tree.nwk")
    cmd = f"FastTreeMP -nt {model} -noml -nocat {bootinfo} -log {logfile} {alignfile}"
    logger.details("Fasttree command: " + cmd)
    error = ("Problem while running Fasttree. See log file ({}) for "
             "more information.").format(logfile)
    # Both handles used to be left open; they are now closed whatever happens
    # in run_cmd.
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        with open(treefile, "w") as tree_out:
            utils.run_cmd(cmd,
                          error,
                          stdout=tree_out,
                          eof=True,
                          logger=logger,
                          stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
Beispiel #7
0
def run_quicktree(alignfile, boot, outdir):
    """
    Run quicktree on the given alignment.

    The tree is read from the stdout of quicktree and written to
    ``<outdir>/<alignfile basename>.quicktree_tree.nwk``; its stderr is written to
    ``<outdir>/<alignfile basename>.quicktree.log``.

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome,
        in Stockholm format
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    outdir: str
        Directory where the tree file and the log file are created
    """
    logger.info("Running Quicktree...")

    # Bootstrap option: left empty when no bootstrap is asked
    bootinfo = f"-boot {boot}" if boot else ""
    # Get output filename and logfile name
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".quicktree.log")
    treefile = os.path.join(outdir, align_name + ".quicktree_tree.nwk")
    cmd = f"quicktree -in a -out t {bootinfo} {alignfile}"
    error = (f"Problem while running quicktree. See log file ({logfile}) for "
             "more information.")
    logger.details(cmd)
    # Handles used to be left open; the context manager closes them even if
    # run_cmd raises or exits.
    with open(treefile, "w") as outfile, open(logfile, "w") as logfilef:
        utils.run_cmd(cmd,
                      error,
                      stdout=outfile,
                      eof=True,
                      logger=logger,
                      stderr=logfilef)
Beispiel #8
0
def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
    """
    Backtranslate protein alignment to nucleotides

    Runs the ``prt2codon.awk`` script (located next to this module) on the protein
    alignment and the nucleotide sequences, and writes its stdout to `btr_file`.

    Parameters
    ----------
    num_fam : int
        current family number. Used for log messages
    mafft_file : str
        path to file containing protein alignments by mafft
    gen_file : str
        path to file containing all sequences, not aligned, in nucleotides. It is used to
        convert the alignment in proteins into a nucleotide alignment
    btr_file : str
        path to the file that will contain the nucleotide alignment
    nbfal : int
        number of sequences aligned for the family by mafft
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if the back-translation command failed (`btr_file` is removed)
        - otherwise, the result of ``check_nb_seqs`` on `btr_file` (documented
          as the number of sequences in btr file if everything went well)
    """
    logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
    curpath = os.path.dirname(os.path.abspath(__file__))
    awk_script = os.path.join(curpath, "prt2codon.awk")
    cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
    error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"
    # The context manager closes btr_file even if run_cmd raises
    with open(btr_file, "w") as stdout:
        ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)
    # run_cmd may give back either a return code or a process-like object
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        # Do not keep a partial/invalid back-translation
        os.remove(btr_file)
        return False
    message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
               f"back-translated in {btr_file}")
    # Check number of sequences in btr file, and return True/False according to it
    # It should contain the same number of sequences as the mafft file.
    # (The check used to be run on mafft_file itself, which cannot detect a
    # truncated back-translation.)
    return check_nb_seqs(btr_file, nbfal, logger, message)
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.

    Train prodigal on the genome 'gpath', and write the result to
    ``<annot_folder>/<genome basename>.trn``. This file will be used for the
    annotation of all next sequences. If the training file already exists,
    prodigal is not run again.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of train file (will be used to annotate all next genomes)
        If problem, returns empty string
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)             # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn") # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                     "It will be used to annotate all genomes.")
        return gpath_train
    # stdout and stderr of prodigal go to 2 separate files, next to the train file
    prodigal_logfile = gpath_train + "-prodigal-train.log"
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = (f"prodigal -i {gpath} -t {gpath_train}")
    error = (f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}.")
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    # Context manager: both log files are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    # Other callers of utils.run_cmd in this file guard against it returning a
    # plain int instead of a process-like object; do the same here so that such
    # a result is reported as a failure instead of raising AttributeError.
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        return ""
    logger.log(utils.detail_lvl(), f"End training on {gpath}")
    return gpath_train
Beispiel #10
0
def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
    """
    Align all proteins of the given family with mafft

    stdout of mafft (the alignment) is written to `mafft_file`, and its stderr
    to ``<mafft_file>.log``.

    Parameters
    ----------
    num_fam : int
        current family number
    prt_file : str
        path to file containing all proteins extracted
    mafft_file : str
        path to file which will contain proteins alignment
    nbfprt : int
        number of proteins extracted in prt file
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool
        False if the mafft command failed (`mafft_file` is removed); otherwise
        the result of ``check_nb_seqs`` on `mafft_file` (True if no problem:
        same number of proteins extracted and aligned; False otherwise)
    """
    logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
    cmd = f"mafft --auto {prt_file}"
    error = f"Problem while trying to align fam {num_fam}"
    logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
    # The stderr log handle used to be left open; both handles are now closed,
    # even if run_cmd raises.
    with open(mafft_file, "w") as stdout, open(mafft_file + ".log", "w") as stderr:
        ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)
    # run_cmd may give back either a return code or a process-like object
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        # Do not keep a partial/invalid alignment
        os.remove(mafft_file)
        return False
    message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
               f"aligned in {mafft_file}")
    return check_nb_seqs(mafft_file, nbfprt, logger, message)
Beispiel #11
0
def run_tree(alignfile, boot, outdir, quiet, threads, **kwargs):
    """
    Run IQtree for the given alignment file and options

    Option names depend on the software given in ``kwargs["s"]``: "iqtree" uses
    the 1.x option names, any other value uses the 2.x option names.

    Parameters
    ----------
    alignfile: str
        path to file containing all persistent families aligned, and grouped by genome
    boot: int or None
        number of bootstraps to calculate, None if no bootstrap asked
    outdir: str
        Directory used to build the output prefix
        (``<outdir>/<alignfile basename>.iqtree_tree``)
    quiet: bool
        True if nothing must be printed to stderr, False otherwise
    threads: int
        Maximum number of threads to use
    kwargs["model"]: str
        DNA substitution model chosen by user
    kwargs["wb"]: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    kwargs["mem"]: str
        Maximal RAM usage in GB | MB | % - Only for iqtree
    kwargs["s"]: str
        soft to use (iqtree or iqtree2)
    kwargs["f"]: bool
        True to add the ``-fast`` option to the command line

    Raises
    ------
    KeyError
        if one of the keyword arguments listed above is missing
    """
    # Get optional arguments (all of them are required keys of kwargs)
    model = kwargs["model"]
    write_boot = kwargs["wb"]
    memory = kwargs["mem"]
    soft = kwargs["s"]
    fast = "-fast" if kwargs["f"] else ""

    logger.info("Running IQtree...")

    # Syntax changes according to IQtree version 1.x or 2.x
    v1_syntax = soft == "iqtree"

    # Non mandatory arguments: left empty when not requested
    bootinfo = ""
    if boot:
        bootinfo = f"-bb {boot}" if v1_syntax else f"-B {boot}"
    wb_info = ""
    if write_boot:
        wb_info = "-wbt" if v1_syntax else "--boot-trees"
    mem_info = ""
    if memory:
        mem_info = f"-mem {memory}" if v1_syntax else f"--mem {memory}"
    threadinfo = ""
    if threads:
        threadinfo = f"-nt {threads}" if v1_syntax else f"-T {threads}"

    # IQtree is always run quietly, and always on DNA sequences
    qu = "-quiet" if v1_syntax else "--quiet"
    seqtype = "-st DNA" if v1_syntax else "--seqtype DNA"

    # Output prefix and name of the log file given in the error message
    align_name = os.path.basename(alignfile)
    # NOTE(review): the error message points to '<name>.iqtree.log' while the
    # output prefix is '<name>.iqtree_tree' - confirm which log file IQ-TREE
    # really writes.
    logfile = os.path.join(outdir, align_name + ".iqtree.log")
    treefile = os.path.join(outdir, align_name + ".iqtree_tree")
    prefix = f"-pre {treefile}" if v1_syntax else f"--prefix {treefile}"
    cmd = (
        f"{soft} -s {alignfile} {threadinfo} -m {model} {mem_info} {bootinfo} {wb_info} "
        f"{seqtype} {prefix} {qu} {fast}")
    logger.details("IQtree command: " + cmd)
    error = (f"Problem while running IQtree. See log file ({logfile}) for "
             "more information.")
    # In quiet mode stderr is discarded. The devnull handle is closed whatever
    # happens in run_cmd (it used to be leaked).
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        utils.run_cmd(cmd, error, eof=True, logger=logger, stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def to_database(outdir, section):
    """
    Copy .fna.gz files to 'Database_init' folder, and uncompress them.

    Genomes are expected in ``<outdir>/<section>/bacteria/<genome folder>/*.fna.gz``.
    A genome folder containing no, or several, ``.fna.gz`` files is ignored (with
    a warning), as well as a genome whose file cannot be uncompressed. The program
    exits with code 1 if the download folder is missing or empty.

    Parameters
    ----------
    outdir : str
        directory where all results are (for now, refseq/genbank folders, assembly
        summary and log)
    section : str
        refseq (default) or genbank

    Returns
    -------
    tuple
        (nb_gen, db_dir) with:

        * nb_gen : number of genomes copied and uncompressed
        * db_dir : directory where are all fna files downloaded from refseq/genbank
    """
    # Copy .gz files in a new folder, and Unzip them in this new folder
    logger.info("Uncompressing genome files.")
    # Folder where are .gz files
    download_dir = os.path.join(outdir, section, "bacteria")
    # If no folder output/refseq/bacteria: error, no genome found
    # (or output/genbank/bacteria)
    if not os.path.exists(download_dir):
        logger.error(f"The folder containing genomes downloaded from NCBI {section} "
                     f"({download_dir}) does not exist. Check that you really downloaded "
                     "sequences (fna.gz) and that they are in this folder.")
        sys.exit(1)
    # If folder output/<refseq or genbank>/bacteria empty: error, no genome found
    list_downloads = os.listdir(download_dir)
    if not list_downloads:
        logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} "
                     f"({download_dir}) exists but is empty. Check that you really downloaded "
                     "sequences (fna.gz).")
        sys.exit(1)
    # Create directory to put uncompressed genomes
    db_dir = os.path.join(outdir, "Database_init")
    os.makedirs(db_dir, exist_ok=True)
    nb_gen = 0
    # For each subfolder of download dir, copy the .gz file it contains (if possible)
    # to the new database folder. The listing done above is reused.
    for g_folder in list_downloads:
        fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))
        # No .gz file in folder
        if len(fasta) == 0:
            logger.warning("Problem with genome in {}: no compressed fasta file downloaded. "
                           "This genome will be ignored.".format(g_folder))
            continue
        # Several gz files in folder
        if len(fasta) > 1:
            logger.warning("Problem with genome in {}: several compressed fasta files found. "
                           "This genome will be ignored.".format(g_folder))
            continue
        # Copy gz file to new folder
        fasta_file = os.path.basename(fasta[0])
        fasta_out = os.path.join(db_dir, fasta_file)
        shutil.copy(fasta[0], fasta_out)
        # Uncompress file copied
        cmd = f"gunzip {fasta_out} -f"
        error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored."
        call = utils.run_cmd(cmd, error)
        # Other callers of utils.run_cmd in this file guard against it returning a
        # plain int instead of a process-like object; do the same here so that the
        # genome is ignored instead of raising AttributeError.
        retcode = call if isinstance(call, int) else call.returncode
        # Problem with uncompressing: genome ignored (remove gz file from new folder)
        if retcode != 0:
            os.remove(fasta_out)
            continue
        nb_gen += 1
    return nb_gen, db_dir
Beispiel #13
0
def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log,
               threads):
    """
    Sketch all genomes to a combined archive.

    The paths of the sequences to sketch are first written to `list_reps`. If
    ``<out_msh>.msh`` already exists, it is reused: `list_reps` is removed and mash
    is not run. Otherwise ``mash sketch`` is run, with its stdout and stderr
    written to `mash_log`. The program exits with code 1 if `outdir` is not an
    existing directory.

    Parameters
    ----------
    genomes : dict
        {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    sorted_genomes: list
        list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by
        decreasing quality
    outdir : str
        path to directory where all results are saved
    list_reps : str
        file with list of genomes to sketch. File will be emptied if it contain something, and
        filled with the informations from 'genomes'.
    out_msh : str
        output of mash (base filename, without the ``.msh`` extension)
    mash_log : str
        mash logfile (overwritten)
    threads :
        max number of threads to use

    Returns
    -------
    int
        return value: always 0 when the function returns

    """
    # If given outdir does not exist, stop here
    if not os.path.isdir(outdir):
        logger.error(f"Your output directory '{outdir}' does not exist.")
        sys.exit(1)
    # Empty list_reps file
    open(list_reps, "w").close()
    # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2]
    file_paths = [genomes[g][2] for g in sorted_genomes]
    # Write list of genomes to compare to a file
    utils.write_list(file_paths, list_reps)
    # Sketch all genome sequences if not already done
    if os.path.isfile(out_msh + ".msh"):
        logger.warning(
            f"Mash sketch file {out_msh}.msh already exists. PanACoTA will "
            "use it for next step.")
        os.remove(list_reps)
        return 0
    logger.info("Sketching all genomes...")
    cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4"
    logger.details(cmd_sketch)
    error_sketch = (
        f"Error while trying to sketch {len(sorted_genomes)} genomes to combined "
        "archive. Maybe some genome sequences in "
        "'tmp_files' are missing! Check logfile: "
        f"{mash_log}")

    # Context manager: the log file is closed even if run_cmd raises or exits
    with open(mash_log, "w") as outf:
        utils.run_cmd(cmd_sketch,
                      error_sketch,
                      eof=True,
                      stdout=outf,
                      stderr=outf,
                      logger=logger)
    return 0
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * threads: how many cores can use prodigal (not used in this function)
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome (not used in this function)
        * gpath_train: path to the prodigal training file, or the string
          "small option" to run prodigal with '-p meta' instead of a training file
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if eveything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocess
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # Training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If training file does not exist
    # and prodigal result directory neither, return False
    # We cannot annotate using nothing.
    # Happens if there was a problem while training
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                     "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                                           "Formatting step will use already generated results of "
                                           "Prodigal in {}. If you want to re-run Prodigal, first "
                                           "remove this result folder, or use '-F' or '--force' "
                                           "option.".format(prodigal_dir))

            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok

    # We are sure prodigal result dir does not exist yet, because either:
    #     - never existed
    #     - removed because user asked to force
    #     - exists but left function, so does not go until this line
    #        -> either if files inside are ok or not
    # So make prodigal_dir (not automatically created by prodigal)
    os.makedirs(prodigal_dir)

    # Prodigal_directory is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, and error to write if problem.
    error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)

    # stdout and stderr of prodigal go to 2 separate files. Context manager:
    # both are closed even if run_cmd raises.
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    # Other callers of utils.run_cmd in this file guard against it returning a
    # plain int instead of a process-like object; do the same here so that such
    # a result is reported as a failure instead of raising AttributeError.
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        return False
    logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
    return True
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, threads, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * threads: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if eveything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    # (a devnull handle used to be opened here: it was never used nor closed)
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    # If result dir already exists, check if we can use it or next step or not
    if os.path.isdir(prok_dir) and not force:
        logger.warning(f"Prokka results folder {prok_dir} already exists.")
        ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
        # If everything ok in the result dir, do not rerun prokka,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prokka did not run again, "
                       "formatting step used already generated results of "
                       f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option if you want to rerun prokka for all genomes.")
            logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           "({}). Please check it, or remove it to "
                           "re-annotate.".format(prok_dir))
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # If result dir exists but user wants to force, remove this result dir
    elif os.path.isdir(prok_dir) and force:
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")
    # Now that we checked and solved those cases:
    #     - outdir exists (problems or not, we returned appropriate boolean)
    #     - if outdir exists exists but force, remove this outdir.
    # So, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = (f"Error while trying to run prokka on {name} from {gpath}")
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    # stderr of prokka is the log file. Context manager: it is closed even if
    # run_cmd raises.
    with open(prok_logfile, "w") as prokf:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    # Other callers of utils.run_cmd in this file guard against it returning a
    # plain int instead of a process-like object; do the same here so that such
    # a result is reported as a failure instead of raising AttributeError.
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok