def mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, outfile):
    """
    Turn the result of a mmseqs clustering into a pangenome file.

    Two steps are chained:

    - export the mmseqs clustering to a tsv file (``<mmseqclust>.tsv``)
    - convert that tsv file to a pangenome with `mmseqs_tsv_to_pangenome`

    Parameters
    ----------
    mmseqdb : str
        path to base filename of output of mmseqs createdb
    mmseqclust : str
        path to base filename of output of mmseqs cluster
    logmmseq : str
        path to the file to which the mmseqs logs are appended
    outfile : str
        pangenome filename

    Returns
    -------
    dict
        families, as returned by `mmseqs_tsv_to_pangenome`
        ({fam_num: [all members]})
    """
    tsv_cmd = f"mmseqs createtsv {mmseqdb} {mmseqdb} {mmseqclust} {mmseqclust}.tsv"
    logger.details(f"MMseqs command: {tsv_cmd}")
    failure_msg = "Problem while trying to convert mmseq result file to tsv file"
    # Append mode: the log of the previous mmseqs steps is kept
    with open(logmmseq, "a") as log_handle:
        utils.run_cmd(tsv_cmd, failure_msg, eof=True,
                      stdout=log_handle, stderr=log_handle)
    # Convert the tsv file to a 'pangenome' file: one line per family
    return mmseqs_tsv_to_pangenome(mmseqclust, logmmseq, outfile)
def run_mmseqs_clust(args):
    """
    Launch the mmseqs clustering step.

    Parameters
    ----------
    args : tuple
        (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode), with:

        * mmseqdb: path to base filename (output created by mmseq db)
        * mmseqclust: path to base filename for output of mmseq clustering
        * tmpdir : path to folder which will contain mmseq temporary files
        * logmmseq : path to file where logs must be written (opened in append mode)
        * min_id : min percentage of identity to be considered in the same family
          (between 0 and 1)
        * threads : max number of threads to use
        * clust_mode : [0, 1, 2], 0 for 'set cover', 1 for 'single-linkage',
          2 for 'CD-Hit'
    """
    (mmseqdb, mmseqclust, tmpdir, logmmseq,
     min_id, threads, clust_mode) = args
    cluster_cmd = (f"mmseqs cluster {mmseqdb} {mmseqclust} {tmpdir} "
                   f"--min-seq-id {min_id} --threads {threads} "
                   f"--cluster-mode {clust_mode}")
    logger.details(f"MMseqs command: {cluster_cmd}")
    failure_msg = f"Problem while clustering proteins with mmseqs. See log in {logmmseq}"
    # Both output streams of mmseqs are appended to the same log file
    with open(logmmseq, "a") as log_handle:
        utils.run_cmd(cluster_cmd, failure_msg, eof=False,
                      stdout=log_handle, stderr=log_handle)
def run_fastme(alignfile, boot, write_boot, threads, model, outdir, quiet):
    """
    Run fastME on the given alignment.

    The tree is written to ``<outdir>/<alignment name>.fastme_tree.nwk`` and
    the FastME information file to ``<outdir>/<alignment name>.fastme.log``.

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    write_boot: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    threads: int
        Maximum number of threads to use
    model: str or None
        DNA substitution model chosen by user. None if default one ("T" is then used)
    outdir: str
        output directory to save all results
    quiet: bool
        True if nothing must be printed to stderr/stdout, False otherwise
    """
    logger.info("Running FastME...")
    # Optional parts of the command line stay empty when not requested
    bootinfo = f"-b {boot}" if boot else ""
    threadinfo = f"-T {threads}" if threads else ""

    # Get output filenames
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fastme.log")
    treefile = os.path.join(outdir, align_name + ".fastme_tree.nwk")
    # If bootstrap pseudo-trees must be written, define the filename here
    outboot = ""
    if write_boot:
        outboot = "-B " + os.path.join(outdir, align_name + ".fastme_bootstraps.nwk")
    # Put default model if not given
    if not model:
        model = "T"
    cmd = (f"fastme -i {alignfile} -d{model} -nB -s {threadinfo} {bootinfo} "
           f"-o {treefile} -I {logfile} {outboot}")
    logger.details(cmd)
    error = (f"Problem while running FastME. See log file ({logfile}) for "
             "more information.")
    # In quiet mode, both streams of FastME are sent to the null device.
    # The handle is closed whatever happens in run_cmd (it used to be leaked).
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        utils.run_cmd(cmd, error, stdout=fnull, eof=True, logger=logger, stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def create_mmseqs_db(mmseqdb, prt_path, logmmseq):
    """
    Build the mmseqs database of the protein bank (prt_path), unless a complete
    one is already there.

    If the database and all its associated files already exist, a warning
    tells the user that they will be reused and nothing is run. If the
    database exists but is incomplete, the files found are removed and the
    database is created again.

    Parameters
    ----------
    mmseqdb : str
        path to base filename for output of mmseqs createdb
    prt_path : str
        path to the file containing all proteins to cluster
    logmmseq : str
        path to file where logs must be written (overwritten)

    Returns
    -------
    bool
        True if mmseqs db just created, False if already existed
    """
    suffixes = ["", ".index", ".dbtype", ".lookup", "_h", "_h.index", "_h.dbtype"]
    found = []
    # Only look for associated files when the main database file is present
    if os.path.isfile(mmseqdb):
        found = [mmseqdb + suffix for suffix in suffixes
                 if os.path.isfile(mmseqdb + suffix)]
        if len(found) == len(suffixes):
            logger.warning(
                f"mmseqs database {mmseqdb} already exists. The program will "
                "use it.")
            return False
        logger.warning(
            f"mmseqs database {mmseqdb} already exists, but at least 1 associated "
            "file (.dbtype, .index etc). is missing. The program will "
            "remove existing files and recreate the database.")
        for stale in found:
            os.remove(stale)
            logger.details(f"Removing '{stale}'.")
        # Everything found has been deleted
        found = []
    logger.debug(f"Existing files: {len(found)}")
    logger.debug(f"Expected extensions: {len(suffixes)}")
    createdb_cmd = f"mmseqs createdb {prt_path} {mmseqdb}"
    failure_msg = (f"Problem while trying to convert database {prt_path} to mmseqs "
                   "database format.")
    logger.details(f"MMseqs command: {createdb_cmd}")
    with open(logmmseq, "w") as log_handle:
        utils.run_cmd(createdb_cmd, failure_msg, eof=True,
                      stdout=log_handle, stderr=log_handle)
    return True
def compare_all(out_msh, matrix, npz_matrix, mash_log, threads):
    """
    Compare all pairs of genomes that have already been sketched in the given file.

    Nothing is computed if either the text matrix or the npz matrix already
    exists: a warning naming the existing file is logged instead.

    Parameters
    ----------
    out_msh : str
        output of mash (the sketch file read is ``<out_msh>.msh``)
    matrix : str
        File to put generated matrix of pairwise distances between all genomes
    npz_matrix : str
        matrix of pairwise distances saved in a binary file
    mash_log : str
        mash logfile (the stderr of 'mash dist' is appended to it)
    threads : int
        max number of threads to use

    Returns
    -------
    int
        always 0
    """
    already_there = ("Matrix file {} already exists. The program will use this distance matrix "
                     "to filter all genomes according to their distances.")
    # txt matrix already exists
    if os.path.isfile(matrix):
        logger.warning(already_there.format(matrix))
        return 0
    # npz matrix already exists: report the npz file (the message used to name
    # the txt matrix, which does not exist in this branch)
    if os.path.isfile(npz_matrix):
        logger.warning(already_there.format(npz_matrix))
        return 0
    logger.info("Computing pairwise distances between all genomes")
    cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh"
    logger.details(cmd_dist)
    error_dist = (
        "Error while trying to estimate pairwise distances between all genomes. "
        f"See {mash_log}.")
    # The matrix receives the stdout of 'mash dist'; its stderr is added to the
    # log of 'mash sketch'. Context managers close both files even if run_cmd
    # raises or exits.
    with open(matrix, "w") as matfile, open(mash_log, "a") as outf:
        utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf)
    return 0
def run_fasttree(alignfile, boot, outdir, model, quiet):
    """
    Run FastTree on given alignment

    The tree (stdout of FastTreeMP) is written to
    ``<outdir>/<alignment name>.fasttree_tree.nwk`` and the FastTree log to
    ``<outdir>/<alignment name>.fasttree.log``.

    Parameters
    ----------
    alignfile: str
        Path to file containing all families aligned, grouped by genome
    boot: int or None
        Number of bootstraps to calculate (None if no bootstrap asked)
    outdir: str
        Directory in which the tree file and the log file are created
    model: str
        DNA substitution model
    quiet: bool
        True if nothing must be printed to stderr, False otherwise
    """
    logger.info("Running FasttreeMP...")
    bootinfo = f"-boot {boot}" if boot else "-nosupport"
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fasttree.log")
    treefile = os.path.join(outdir, align_name + ".fasttree_tree.nwk")
    cmd = f"FastTreeMP -nt {model} -noml -nocat {bootinfo} -log {logfile} {alignfile}"
    logger.details("Fasttree command: " + cmd)
    error = (f"Problem while running Fasttree. See log file ({logfile}) for "
             "more information.")
    # stderr goes to the null device in quiet mode; stdout always goes to the
    # tree file. Both handles are now closed whatever happens in run_cmd
    # (neither of them used to be closed).
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        with open(treefile, "w") as tree_out:
            utils.run_cmd(cmd, error, stdout=tree_out, eof=True, logger=logger,
                          stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def run_quicktree(alignfile, boot, outdir):
    """
    Run quicktree on the given alignment.

    The tree (stdout of quicktree) is written to
    ``<outdir>/<alignment name>.quicktree_tree.nwk`` and its stderr to
    ``<outdir>/<alignment name>.quicktree.log``.

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome,
        in Stockholm format
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    outdir: str
        Directory in which the tree file and the log file are created
    """
    logger.info("Running Quicktree...")
    # Get bootstrap information
    bootinfo = f"-boot {boot}" if boot else ""
    # Get output filename and logfile name
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".quicktree.log")
    treefile = os.path.join(outdir, align_name + ".quicktree_tree.nwk")
    cmd = f"quicktree -in a -out t {bootinfo} {alignfile}"
    error = (f"Problem while running quicktree. See log file ({logfile}) for "
             "more information.")
    # Context managers guarantee that the tree and log files are flushed and
    # closed (they used to be left open).
    with open(treefile, "w") as outfile, open(logfile, "w") as logfilef:
        logger.details(cmd)
        utils.run_cmd(cmd, error, stdout=outfile, eof=True, logger=logger,
                      stderr=logfilef)
def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
    """
    Backtranslate protein alignment to nucleotides

    Runs the ``prt2codon.awk`` script (located next to this module) on the
    protein alignment and the nucleotide sequences, and writes its output to
    `btr_file`.

    Parameters
    ----------
    num_fam : int
        current family number. Used for log messages
    mafft_file : str
        path to file containing protein alignments by mafft
    gen_file : str
        path to file containing all sequences, not aligned, in nucleotides. It is used to
        convert the alignment in proteins into a nucleotide alignment
    btr_file : str
        path to the file that will contain the nucleotide alignment
    nbfal : int
        number of sequences aligned for the family by mafft
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if the back-translation command failed (`btr_file` is then removed)
        - otherwise, the result of ``check_nb_seqs`` on `btr_file`
          (presumably False if the number of sequences differs from `nbfal`,
          and the number of sequences otherwise -- confirm in check_nb_seqs)
    """
    logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
    curpath = os.path.dirname(os.path.abspath(__file__))
    awk_script = os.path.join(curpath, "prt2codon.awk")
    cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
    error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"
    # The output file is closed (hence flushed) before being checked or removed
    with open(btr_file, "w") as stdout:
        ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)
    # run_cmd can return either an int or an object with a returncode
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        os.remove(btr_file)
        return False
    message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
               f"back-translated in {btr_file}")
    # Check number of sequences in btr file, and return True/False according to it.
    # It should contain the same number of sequences as the mafft file.
    # (The check used to be run on mafft_file itself, so a truncated
    # back-translation could not be detected.)
    return check_nb_seqs(btr_file, nbfal, logger, message)
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.

    Train prodigal on the genome 'gpath', and write the result to
    '<annot_folder>/<genome>.trn', a file which will be used for the
    annotation of all next sequences. If this training file already exists,
    it is reused and prodigal is not run.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of train file (will be used to annotate all next genomes)
        If problem, returns empty string
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)  # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn")  # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                    "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = f"prodigal -i {gpath} -t {gpath_train}"
    error = f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}."
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    # Log files are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr,
                            stdout=prodigalf, logger=logger)
    # run_cmd may hand back a plain int instead of a process object
    # (back_translate and mafft_align in this file guard against it):
    # normalise before comparing, instead of failing on '.returncode'.
    retcode = ret if isinstance(ret, int) else ret.returncode
    if retcode != 0:
        return ""
    logger.log(utils.detail_lvl(), f"End training on {gpath}")
    return gpath_train
def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
    """
    Align all proteins of the given family with mafft

    The alignment (stdout of mafft) is written to `mafft_file`, and the stderr
    of mafft to ``<mafft_file>.log``.

    Parameters
    ----------
    num_fam : int
        current family number
    prt_file : str
        path to file containing all proteins extracted
    mafft_file : str
        path to file which will contain proteins alignment
    nbfprt : int
        number of proteins extracted in prt file
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if the mafft command failed (`mafft_file` is then removed)
        - otherwise, the result of ``check_nb_seqs`` on `mafft_file`
          (presumably falsy when the number of aligned proteins differs from
          `nbfprt` -- confirm in check_nb_seqs)
    """
    logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
    cmd = f"mafft --auto {prt_file}"
    error = f"Problem while trying to align fam {num_fam}"
    # Both files are closed when leaving the block; the '.log' handle used to
    # be left open for each family aligned.
    with open(mafft_file, "w") as stdout, open(mafft_file + ".log", "w") as stderr:
        logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
        ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)
    # run_cmd can return either an int or an object with a returncode
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        os.remove(mafft_file)
        return False
    message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
               f"aligned in {mafft_file}")
    return check_nb_seqs(mafft_file, nbfprt, logger, message)
def run_tree(alignfile, boot, outdir, quiet, threads, **kwargs):
    """
    Run IQtree for the given alignment file and options

    Option names depend on the software used: the IQ-TREE 1.x syntax is
    used when ``kwargs["s"]`` is "iqtree", and the 2.x syntax otherwise.

    Parameters
    ----------
    alignfile: str
        path to file containing all persistent families aligned, and grouped by genome
    boot: int or None
        number of bootstraps to calculate, None if no bootstrap asked
    outdir: str
        Directory in which output files are created (they are all prefixed by
        ``<outdir>/<alignment name>.iqtree_tree``)
    quiet: bool
        True if nothing must be printed to stderr, False otherwise
    threads: int
        Maximum number of threads to use
    kwargs["model"]: str
        DNA substitution model chosen by user
    kwargs["wb"]: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    kwargs["mem"]: str
        Maximal RAM usage in GB | MB | % - Only for iqtree
    kwargs["s"]: str
        soft to use (iqtree or iqtree2)
    kwargs["f"]: bool
        True to add the '-fast' option to the command line
    """
    # Get optional arguments (all of them are required keys of kwargs)
    model = kwargs["model"]
    write_boot = kwargs["wb"]
    memory = kwargs["mem"]
    soft = kwargs["s"]
    fast = "-fast" if kwargs["f"] else ""
    logger.info("Running IQtree...")

    # Syntax changes according to IQtree version 1.x or 2.x
    is_v1 = soft == "iqtree"
    # Non mandatory arguments stay empty when not requested
    bootinfo = ""
    wb_info = ""
    mem_info = ""
    threadinfo = ""
    if boot:
        bootinfo = f"-bb {boot}" if is_v1 else f"-B {boot}"
    if write_boot:
        wb_info = "-wbt" if is_v1 else "--boot-trees"
    if memory:
        mem_info = f"-mem {memory}" if is_v1 else f"--mem {memory}"
    if threads:
        threadinfo = f"-nt {threads}" if is_v1 else f"-T {threads}"
    # IQtree is always run quietly
    qu = "-quiet" if is_v1 else "--quiet"
    seqtype = "-st DNA" if is_v1 else "--seqtype DNA"

    # Define output file names
    align_name = os.path.basename(alignfile)
    # NOTE(review): logfile is only used in the error message below; check that
    # it matches the log actually written by IQtree for the given prefix.
    logfile = os.path.join(outdir, align_name + ".iqtree.log")
    treefile = os.path.join(outdir, align_name + ".iqtree_tree")
    prefix = f"-pre {treefile}" if is_v1 else f"--prefix {treefile}"

    cmd = (f"{soft} -s {alignfile} {threadinfo} -m {model} {mem_info} {bootinfo} {wb_info} "
           f"{seqtype} {prefix} {qu} {fast}")
    logger.details("IQtree command: " + cmd)
    error = (f"Problem while running IQtree. See log file ({logfile}) for "
             "more information.")
    # In quiet mode stderr is sent to the null device; the handle is closed
    # whatever happens in run_cmd (it used to be leaked).
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        utils.run_cmd(cmd, error, eof=True, logger=logger, stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def to_database(outdir, section):
    """
    Copy .fna.gz files to 'Database_init' folder, and uncompress them.

    Each subfolder of ``<outdir>/<section>/bacteria`` is expected to contain
    exactly one ``*.fna.gz`` file. Subfolders with none or several are ignored
    (with a warning), as well as files which cannot be uncompressed.
    The program exits (status 1) if the download folder is missing or empty.

    Parameters
    ----------
    outdir : str
        directory where all results are (for now, refseq/genbank folders,
        assembly summary and log)
    section : str
        refseq (default) or genbank

    Returns
    -------
    tuple
        (nb_gen, db_dir) with:

        - nb_gen : number of genomes successfully uncompressed
        - db_dir : directory where are all fna files downloaded from refseq/genbank
    """
    # Copy .gz files in a new folder, and Unzip them in this new folder
    logger.info("Uncompressing genome files.")
    # Folder where are .gz files
    download_dir = os.path.join(outdir, section, "bacteria")
    # If no folder output/refseq/bacteria: error, no genome found
    # (or output/genbank/bacteria)
    if not os.path.exists(download_dir):
        logger.error(f"The folder containing genomes downloaded from NCBI {section} "
                     f"({download_dir}) does not exist. Check that you really downloaded "
                     "sequences (fna.gz) and that they are in this folder.")
        sys.exit(1)
    # If folder output/<refseq or genbank>/bacteria empty: error, no genome found
    list_downloads = os.listdir(download_dir)
    if not list_downloads:
        logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} "
                     f"({download_dir}) exists but is empty. Check that you really downloaded "
                     "sequences (fna.gz).")
        sys.exit(1)
    # Create directory to put uncompressed genomes
    db_dir = os.path.join(outdir, "Database_init")
    os.makedirs(db_dir, exist_ok=True)
    nb_gen = 0
    # For each subfolder of download dir, copy the .gz file it contains (if possible)
    # to the new database folder. The listing done above is reused, so that the
    # emptiness check and the loop see the same content.
    for g_folder in list_downloads:
        fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))
        # No .gz file in folder
        if not fasta:
            logger.warning(f"Problem with genome in {g_folder}: no compressed fasta file "
                           "downloaded. This genome will be ignored.")
            continue
        # Several gz files in folder
        if len(fasta) > 1:
            logger.warning(f"Problem with genome in {g_folder}: several compressed fasta "
                           "files found. This genome will be ignored.")
            continue
        # Copy gz file to new folder
        fasta_out = os.path.join(db_dir, os.path.basename(fasta[0]))
        shutil.copy(fasta[0], fasta_out)
        # Uncompress file copied
        cmd = f"gunzip {fasta_out} -f"
        error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored."
        call = utils.run_cmd(cmd, error)
        # run_cmd may hand back a plain int instead of a process object
        # (back_translate and mafft_align in this file guard against it):
        # normalise before comparing, instead of failing on '.returncode'.
        retcode = call if isinstance(call, int) else call.returncode
        # Problem with uncompressing: genome ignored (remove gz file from new folder)
        if retcode != 0:
            os.remove(fasta_out)
            continue
        nb_gen += 1
    return nb_gen, db_dir
def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log, threads):
    """
    Sketch all genomes to a combined archive.

    If ``<out_msh>.msh`` already exists, it is kept as is, `list_reps` is
    removed and nothing is sketched.

    Parameters
    ----------
    genomes : dict
        {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    sorted_genomes: list
        list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by
        decreasing quality
    outdir : str
        path to directory where all results are saved
    list_reps : str
        file with list of genomes to sketch. File will be emptied if it contains
        something, and filled with the information from 'genomes'.
    out_msh : str
        output of mash (without the '.msh' extension)
    mash_log : str
        mash logfile (overwritten)
    threads : int
        max number of threads to use

    Returns
    -------
    int
        always 0 (the program exits with status 1 if `outdir` does not exist)
    """
    # If given outdir does not exist, exit
    if not os.path.isdir(outdir):
        logger.error(f"Your output directory '{outdir}' does not exist.")
        sys.exit(1)
    # Empty list_reps file
    with open(list_reps, "w"):
        pass
    # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2]
    file_paths = [genomes[genome][2] for genome in sorted_genomes]
    # Write list of genomes to compare to a file
    utils.write_list(file_paths, list_reps)
    # Sketch all genome sequences if not already done
    if os.path.isfile(out_msh + ".msh"):
        logger.warning(
            f"Mash sketch file {out_msh}.msh already exists. PanACoTA will "
            "use it for next step.")
        os.remove(list_reps)
        return 0
    logger.info("Sketching all genomes...")
    cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4"
    logger.details(cmd_sketch)
    error_sketch = (
        f"Error while trying to sketch {len(sorted_genomes)} genomes to combined "
        "archive. Maybe some genome sequences in "
        "'tmp_files' are missing! Check logfile: "
        f"{mash_log}")
    # The log file is flushed and closed even if run_cmd raises or exits
    with open(mash_log, "w") as outf:
        utils.run_cmd(cmd_sketch, error_sketch, eof=True, stdout=outf, stderr=outf,
                      logger=logger)
    return 0
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * threads: how many cores can use prodigal (not used by this function)
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome (not used by this function)
        * gpath_train: path to the prodigal training file, or the string
          "small option" to run prodigal with '-p meta' instead of a training file
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (prodigal ran without error, or the already
        existing result folder passed `check_prodigal`). False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocess
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')

    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # Training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If training file does not exist
    # and prodigal result directory neither, return False:
    # we cannot annotate using nothing.
    # Happens if there was a problem while training
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                   "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                       "Formatting step will use already generated results of "
                       f"Prodigal in {prodigal_dir}. If you want to re-run Prodigal, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option.")
            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok

    # From here, prodigal result dir does not exist: it either never existed, or
    # was removed because user asked to force.
    # So make prodigal_dir (not automatically created by prodigal)
    os.makedirs(prodigal_dir)
    # Prodigal_directory is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, stderr and stdout files, and error to write if problem.
    error = f"Error while trying to run prodigal. See {prodigal_logfile_err}."
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)
    # Log files are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr,
                            stdout=prodigalf, logger=logger)
    # run_cmd may hand back a plain int instead of a process object
    # (back_translate and mafft_align in this file guard against it):
    # normalise before comparing, instead of failing on '.returncode'.
    retcode = ret if isinstance(ret, int) else ret.returncode
    if retcode != 0:
        return False
    logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
    return True
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, threads, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * threads: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (prokka ran without error and its results,
        or the already existing ones, passed `check_prokka`). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    # (a handle on the null device used to be opened here: it was never used
    # nor closed, so it has been dropped)
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")

    if os.path.isdir(prok_dir):
        # Result dir already exists, and user did not ask to force:
        # check if we can use it for next step or not
        if not force:
            logger.warning(f"Prokka results folder {prok_dir} already exists.")
            ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
            # If everything ok in the result dir, do not rerun prokka,
            # use those results for next step (formatting)
            if ok:
                logger.log(utils.detail_lvl(), "Prokka did not run again, "
                           "formatting step used already generated results of "
                           f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                           "remove this result folder, or use '-F' or '--force' "
                           "option if you want to rerun prokka for all genomes.")
                logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
            # If missing files, or other problems in result dir, error message,
            # ask user to force or remove this folder.
            else:
                logger.warning("Problems in the files contained in your already existing "
                               f"output dir ({prok_dir}). Please check it, or remove it to "
                               "re-annotate.")
            # If everything was ok -> everything is ready for next step -> return True
            # If something is wrong -> cannot use those results, genome won't be annotated
            # -> return False
            return ok
        # Result dir exists but user wants to force: remove this result dir
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")

    # From here, the result dir does not exist (never existed, or just removed)
    # -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = f"Error while trying to run prokka on {name} from {gpath}"
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    # The log file (stderr of prokka) is closed even if run_cmd raises
    with open(prok_logfile, "w") as prokf:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    # run_cmd may hand back a plain int instead of a process object
    # (back_translate and mafft_align in this file guard against it):
    # normalise before comparing, instead of failing on '.returncode'.
    retcode = ret if isinstance(ret, int) else ret.returncode
    if retcode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok