def test_logger_info(capsys):
    """
    Check the outputs of a logger initialized with the "INFO" level.

    Expected by this test:

    - "info info" is in stdout, "info error" and "info critical" in stderr
    - the ``.log`` file starts with the INFO, WARNING, ERROR and CRITICAL lines
    - the ``.log.err`` file starts with the WARNING, ERROR and CRITICAL lines
    - neither a ``.log.details`` nor a ``.log.debug`` file is created
    """
    log_base = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(log_base, logging.INFO, "info")
    log = logging.getLogger("info")
    # Send one message per level
    log.debug("info debug")
    log.details("info details")
    log.info("info info")
    log.warning("info warning")
    log.error("info error")
    log.critical("info critical")

    stdout, stderr = capsys.readouterr()
    assert "info info" in stdout
    assert "info error" in stderr
    assert "info critical" in stderr

    # Lines expected in both .log and .log.err files
    warning_and_above = (" :: WARNING :: info warning\n",
                         " :: ERROR :: info error\n",
                         " :: CRITICAL :: info critical\n")
    with open(log_base + ".log", "r") as handle:
        for suffix in (" :: INFO :: info info\n",) + warning_and_above:
            assert handle.readline().endswith(suffix)
    assert not os.path.isfile(log_base + ".log.details")
    assert not os.path.isfile(log_base + ".log.debug")
    with open(log_base + ".log.err", "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
def setup_teardown_module():
    """
    Prepare what the tests need, and clean everything afterwards.

    Before yielding:

    - initialize the 'test_post_mmseq' logger (DEBUG level, verbose=1)
    - create the directory where generated files are put

    After yielding:

    - remove the directory containing generated results
    - remove all log files which exist
    """
    utils.init_logger(LOGFILE_BASE, logging.DEBUG, 'test_post_mmseq', verbose=1)
    os.mkdir(GENEPATH)
    print("setup")

    yield

    shutil.rmtree(GENEPATH)
    # Some log files may not have been created: only remove the existing ones
    for log_path in filter(os.path.exists, LOGFILES):
        os.remove(log_path)
    print("teardown")
def test_run_prokka_out_problem_running():
    """
    Check that when a problem occurs while trying to run prokka, run_prokka
    returns a false value, and logs an error message.

    Messages expected in the queue, in this order:

    - start of annotation
    - prokka command line
    - error while trying to run prokka on the given genome
    """
    logger = my_logger("test_run_prokka_out_problem_running")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prokka_out_problem_running')
    gpath = os.path.join(GEN_PATH, "H299_H561bis.fasta")
    cores_prokka = 2
    name = "test_runprokka_H299-error"
    force = False
    nbcont = 3
    trn_file = "nofile.trn"
    arguments = (gpath, GENEPATH, cores_prokka, name, force, nbcont, trn_file, logger[0])
    assert not afunc.run_prokka(arguments)

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message == (
        "Prokka command: prokka "
        "--outdir test/data/annotate/generated_by_unit-tests/"
        "H299_H561bis.fasta-prokkaRes --cpus 2 "
        "--prefix test_runprokka_H299-error "
        "--centre prokka test/data/annotate/genomes/H299_H561bis.fasta")
    assert q.get().message == (
        "Error while trying to run prokka on test_runprokka_H299-error "
        "from test/data/annotate/genomes/H299_H561bis.fasta")
def test_logger_critical(capsys):
    """
    Check the outputs of a logger initialized with the "CRITICAL" level.

    Expected by this test:

    - "info info" is in stdout, "info error" and "info critical" in stderr
    - GENEPATH contains exactly 2 entries (ignoring those with "fuse" in
      their name)
    - the ``.log`` file starts with the INFO, WARNING, ERROR and CRITICAL lines
    - the ``.log.err`` file starts with the WARNING, ERROR and CRITICAL lines
    - no ``.log.details`` file is created
    """
    log_base = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(log_base, logging.CRITICAL, "crit")
    log = logging.getLogger("crit")
    # Send one message per level
    log.debug("info debug")
    log.details("info details")
    log.info("info info")
    log.warning("info warning")
    log.error("info error")
    log.critical("info critical")

    stdout, stderr = capsys.readouterr()
    assert "info info" in stdout
    assert "info error" in stderr
    assert "info critical" in stderr

    # Only 2 files must have been generated
    generated = [entry for entry in os.listdir(GENEPATH) if "fuse" not in entry]
    assert len(generated) == 2

    # Lines expected in both .log and .log.err files
    warning_and_above = (" :: WARNING :: info warning\n",
                         " :: ERROR :: info error\n",
                         " :: CRITICAL :: info critical\n")
    with open(log_base + ".log", "r") as handle:
        for suffix in (" :: INFO :: info info\n",) + warning_and_above:
            assert handle.readline().endswith(suffix)
    assert not os.path.isfile(log_base + ".log.details")
    with open(log_base + ".log.err", "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
def test_run_prokka_out_exists_ok():
    """
    Check that when the prokka result folder already exists, and the files
    inside are OK, run_prokka returns a true value, and logs that the folder
    already exists and that prokka did not run again.
    """
    logger = my_logger("test_run_prokka_out_exists_ok")
    utils.init_logger(LOGFILE_BASE, 0, 'prokka_out_exists_ok')
    msg_queue = logger[0]
    # Arguments: genome path, output dir, cores, name, force, nb contigs,
    # training file, queue receiving the logs
    arguments = ("path/to/nogenome/original_name.fna", TEST_DIR, 1,
                 "prokka_out_for_test", False, 6, "nofile.trn", msg_queue)
    assert afunc.run_prokka(arguments)

    assert msg_queue.qsize() == 4
    # start annotating:
    start_msg = msg_queue.get().message
    assert start_msg.startswith("Start annotating")
    # warning, prokka results folder exists:
    exists_msg = msg_queue.get().message
    assert exists_msg.startswith(
        "Prokka results folder test/data/annotate/"
        "test_files/"
        "original_name.fna-prokkaRes already exists.")
    # Results in result folder are ok
    no_rerun_msg = msg_queue.get().message
    assert no_rerun_msg.startswith(
        "Prokka did not run again, formatting step used already "
        "generated results of Prokka in "
        "test/data/annotate/test_files/original_name.fna-prokkaRes.")
    # End annotation:
    end_msg = msg_queue.get().message
    assert end_msg.startswith("End annotating")
def test_run_prodigal_out_exists_ok():
    """
    Check that when the prodigal result folder already exists, and the files
    inside are OK, run_prodigal returns a true value, and logs that the
    folder already exists and that prodigal did not run again.
    """
    logger = my_logger("test_run_prodigal_out_exists_ok")
    utils.init_logger(LOGFILE_BASE, 0, 'prodigal_out_exists_ok')
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    msg_queue = logger[0]
    # Arguments: genome path, output dir, cores, name, force, nb contigs,
    # training file, queue receiving the logs
    arguments = ("path/to/nogenome/original_name.fna", TEST_DIR, 1,
                 "prodigal.outtest.ok", False, 7, trn_file, msg_queue)
    assert afunc.run_prodigal(arguments)

    assert msg_queue.qsize() == 4
    # start annotating:
    start_msg = msg_queue.get().message
    assert start_msg.startswith(
        "Start annotating prodigal.outtest.ok (from "
        "path/to/nogenome/original_name.fna sequence) with Prodigal")
    # warning, prodigal results folder exists:
    exists_msg = msg_queue.get().message
    assert exists_msg.startswith(
        "Prodigal results folder test/data/annotate/test_files/"
        "original_name.fna-prodigalRes already exists.")
    # Results in result folder are ok
    no_rerun_msg = msg_queue.get().message
    assert no_rerun_msg.startswith(
        "Prodigal did not run again. Formatting step will use "
        "already generated results of Prodigal in "
        "test/data/annotate/test_files/"
        "original_name.fna-prodigalRes.")
    # End annotation:
    end_msg = msg_queue.get().message
    assert end_msg.startswith("End annotating")
def test_logger_warning_verbose2(capsys):
    """
    Check the outputs of a logger initialized with the "WARNING" level and
    verbose=2.

    Expected by this test:

    - "info info" and "info details" are in stdout
    - "info error", "info warning" and "info critical" are in stderr
    - GENEPATH contains exactly 2 entries (ignoring those with "fuse" in
      their name)
    - the ``.log`` file starts with the INFO, WARNING, ERROR and CRITICAL lines
    - the ``.log.err`` file starts with the WARNING, ERROR and CRITICAL lines
    - no ``.log.details`` file is created
    """
    log_base = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(log_base, logging.WARNING, "warn", verbose=2)
    log = logging.getLogger("warn")
    # Send one message per level
    log.debug("info debug")
    log.details("info details")
    log.info("info info")
    log.warning("info warning")
    log.error("info error")
    log.critical("info critical")

    stdout, stderr = capsys.readouterr()
    assert "info info" in stdout
    assert "info details" in stdout
    assert "info error" in stderr
    assert "info warning" in stderr
    assert "info critical" in stderr

    # Only 2 files must have been generated
    generated = [entry for entry in os.listdir(GENEPATH) if "fuse" not in entry]
    assert len(generated) == 2

    # Lines expected in both .log and .log.err files
    warning_and_above = (" :: WARNING :: info warning\n",
                         " :: ERROR :: info error\n",
                         " :: CRITICAL :: info critical\n")
    with open(log_base + ".log", "r") as handle:
        for suffix in (" :: INFO :: info info\n",) + warning_and_above:
            assert handle.readline().endswith(suffix)
    assert not os.path.isfile(log_base + ".log.details")
    with open(log_base + ".log.err", "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
def test_run_prokka_out_doesnt_exist_ok():
    """
    Test that when the output directory does not exist, run_prokka creates it,
    and runs prokka with the expected tbl, faa and ffn output files.

    Messages expected in the queue, in this order: start of annotation,
    prokka command line, end of annotation.
    """
    logger = my_logger("test_run_prokka_out_doesnt_exist")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prokka_out_doesnt_exist')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prokkaRes")
    cores_prokka = 2
    name = "test_runprokka_H299"
    force = False
    nbcont = 3
    trn_file = "nofile.trn"
    arguments = (gpath, GENEPATH, cores_prokka, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prokka(arguments)

    # Check content of tbl, ffn and faa files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-short-contig.fna-prokkaRes",
                           "test_runprokka_H299")
    out_tbl = os.path.join(out_dir, name + ".tbl")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    assert os.path.isfile(out_tbl)
    # For tbl file, check that, at least, the 3 contigs were considered,
    # and that the number of CDS is as expected.
    # Before, we checked that the output was exactly as expected. But it changes
    # with the different versions of prokka, so we cannot compare the whole file.
    with open(out_tbl, "r") as outt:
        lines = [line.strip() for line in outt]
    # Check that there are 3 contigs (1 'Feature' line per contig)
    nb_features = sum('Feature' in line for line in lines)
    assert nb_features == 3
    # Check that there are 16 CDS
    nb_cds = sum("CDS" in line for line in lines)
    assert nb_cds == 16
    # Check that faa and ffn files are as expected
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message == (
        "Prokka command: prokka "
        "--outdir test/data/annotate/generated_by_unit-tests/"
        "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runprokka_H299 "
        "--centre prokka test/data/annotate/genomes/H299_H561.fasta")
    assert q.get().message.startswith("End annotating")
def make_logger(name="test_post_mmseq"):
    """
    Initialize a logger called `name` (DEBUG level, verbose=0, not quiet),
    and return the base name given for its log files: "log_" + `name`.
    """
    logfile_base = "log_" + name
    utils.init_logger(logfile_base, logging.DEBUG, name, verbose=0, quiet=False)
    return logfile_base
def test_run_prodigal_out_exists_force():
    """
    Test that when the output directory already exists with wrong (empty)
    files, but force is on, prodigal is rerun and outputs the right files.

    Messages expected in the queue, in this order: existing results folder
    removed because of force, start of annotation, prodigal command line,
    end of annotation.
    """
    logger = my_logger("test_run_prodigal_out_exists_force")
    utils.init_logger(LOGFILE_BASE, 0, 'force')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_prokdir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    name = "test_runprodigal_H299"
    out_gff = os.path.join(out_prokdir, name + ".gff")
    out_faa = os.path.join(out_prokdir, name + ".faa")
    out_ffn = os.path.join(out_prokdir, name + ".ffn")
    # Put empty gff, faa, ffn files in prodigal output dir, to check that
    # they are overridden
    os.makedirs(out_prokdir)
    for empty_file in (out_gff, out_faa, out_ffn):
        with open(empty_file, "w"):
            pass
    cores_prodigal = 2
    force = True
    nbcont = 3
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # As we used 'force', gff, faa and ffn files, which were empty, must have
    # been replaced by the prodigal output
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-prodigalRes",
                           "ESCO.1015.00001")
    # The gff file was created empty: it must exist and have been rewritten
    assert os.path.isfile(out_gff)
    assert os.path.getsize(out_gff) > 0
    # Check that faa and ffn files are as expected
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)

    # Check logs
    q = logger[0]
    assert q.qsize() == 4
    assert q.get().message.startswith(
        "Prodigal results folder already exists, but is "
        "removed because --force option was used")
    assert q.get().message.startswith(
        "Start annotating test_runprodigal_H299 (from test/data/"
        "annotate/genomes/H299_H561.fasta sequence) "
        "with Prodigal")
    assert q.get().message.startswith(
        "Prodigal command: prodigal -i test/data/annotate/genomes/"
        "H299_H561.fasta -d test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.ffn -a test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.faa -f gff -o test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.gff -t "
        "test/data/annotate/test_files/A_H738-and-B2_A3_5.fna.trn "
        "-q")
    assert q.get().message.startswith(
        "End annotating test_runprodigal_H299 "
        "(from test/data/annotate/genomes/H299_H561.fasta)")
def test_log_listen(capsys):
    """
    Check that when messages are logged to a queue (through a QueueHandler),
    and the queue is then handled by utils.logger_thread, the messages appear
    in stdout/stderr and in the log files.
    """
    import multiprocessing
    import threading

    # Create Queue, QueueHandler, and log messages to it
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()
    queue_handler = logging.handlers.QueueHandler(log_queue)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(queue_handler)
    process_log = logging.getLogger('process')
    process_log.debug("debug message")
    process_log.log(utils.detail_lvl(), "detail message")
    process_log.info("info message")
    process_log.warning("warning message")
    process_log.error("error message")
    process_log.critical("critical message")
    # NOTE(review): None presumably tells logger_thread to stop listening -
    # confirm in utils.logger_thread
    log_queue.put(None)

    # Initialize real logger
    logfile = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(logfile, 0, '')

    # Listen to QueueHandler and handle messages to stdout/stderr/files
    listener = threading.Thread(target=utils.logger_thread, args=(log_queue, ))
    listener.start()
    listener.join()

    stdout, stderr = capsys.readouterr()
    assert "info message" in stdout
    assert "error message" in stderr
    assert "critical message" in stderr

    # Lines expected in each log file, in the order in which they are written
    warning_and_above = (" :: WARNING :: warning message\n",
                         " :: ERROR :: error message\n",
                         " :: CRITICAL :: critical message\n")
    info_and_above = (" :: INFO :: info message\n",) + warning_and_above
    with open(logfile + ".log", "r") as handle:
        for suffix in info_and_above:
            assert handle.readline().endswith(suffix)
    with open(logfile + ".log.details") as handle:
        for suffix in (" :: DETAIL :: detail message\n",) + info_and_above:
            assert handle.readline().endswith(suffix)
    with open(logfile + ".log.err", "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
def test_logger_exists(capsys):
    """
    Check that when the log files already exist, init_logger does not write
    to them: they stay empty, and the messages go to new ``.log`` and
    ``.log.err`` files, whose names match ``<logfile>*.log`` and
    ``<logfile>*.log.err``.
    """
    import glob

    logfile = os.path.join(GENEPATH, "logfile_test.txt")
    # Create empty log files before initializing the logger
    for ext in (".log", ".log.details", ".log.debug", ".log.err"):
        open(logfile + ext, "w").close()
    utils.init_logger(logfile, logging.DEBUG, "already_exist", verbose=1)
    log = logging.getLogger("already_exist")
    # Send one message per level
    log.debug("info debug")
    log.details("info details")
    log.info("info info")
    log.warning("info warning")
    log.error("info error")
    log.critical("info critical")

    stdout, stderr = capsys.readouterr()
    assert "info info" in stdout
    assert "info warning" in stderr
    assert "info error" in stderr
    assert "info critical" in stderr

    # Check that initial log files are still empty
    for ext in (".log", ".log.debug", ".log.err", ".log.details"):
        with open(logfile + ext, "r") as handle:
            assert handle.readlines() == []

    warning_and_above = (" :: WARNING :: info warning\n",
                         " :: ERROR :: info error\n",
                         " :: CRITICAL :: info critical\n")
    # Check for new .log file: ignore the initial one, which is empty
    new_logs = glob.glob(logfile + "*" + ".log")
    assert len(new_logs) == 2
    new_logs.remove(logfile + ".log")
    with open(new_logs[0], "r") as handle:
        for suffix in (" :: INFO :: info info\n",) + warning_and_above:
            assert handle.readline().endswith(suffix)
    # Same thing for .log.err file
    new_errs = glob.glob(logfile + "*" + ".log.err")
    assert len(new_errs) == 2
    new_errs.remove(logfile + ".log.err")
    with open(new_errs[0], "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
def test_logger_verbose2(capsys):
    """
    Check the outputs of a logger initialized with the "DEBUG" level and
    verbose=2.

    Expected by this test:

    - debug, details and info messages are in stdout
    - warning, error and critical messages are in stderr
    - ``.log``, ``.log.details``, ``.log.err`` and ``.log.debug`` files
      contain the lines of their respective levels (in the debug file, the
      name of the logger is added to each line)
    """
    log_base = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(log_base, logging.DEBUG, "toto", verbose=2)
    log = logging.getLogger("toto")
    # Send one message per level
    log.debug("info debug")
    log.details("info details")
    log.info("info info")
    log.warning("info warning")
    log.error("info error")
    log.critical("info critical")

    stdout, stderr = capsys.readouterr()
    assert "info debug" in stdout
    assert "info details" in stdout
    assert "info info" in stdout
    assert "info warning" in stderr
    assert "info error" in stderr
    assert "info critical" in stderr

    # Lines expected in each log file, in the order in which they are written
    warning_and_above = (" :: WARNING :: info warning\n",
                         " :: ERROR :: info error\n",
                         " :: CRITICAL :: info critical\n")
    info_and_above = (" :: INFO :: info info\n",) + warning_and_above
    with open(log_base + ".log", "r") as handle:
        for suffix in info_and_above:
            assert handle.readline().endswith(suffix)
    with open(log_base + ".log.details") as handle:
        for suffix in (" :: DETAIL :: info details\n",) + info_and_above:
            assert handle.readline().endswith(suffix)
    with open(log_base + ".log.err", "r") as handle:
        for suffix in warning_and_above:
            assert handle.readline().endswith(suffix)
    # In debug file, each line also gives the name of the logger
    debug_lines = (" :: DEBUG (from toto logger) :: info debug\n",
                   " :: DETAIL (from toto logger) :: info details\n",
                   " :: INFO (from toto logger) :: info info\n",
                   " :: WARNING (from toto logger) :: info warning\n",
                   " :: ERROR (from toto logger) :: info error\n",
                   " :: CRITICAL (from toto logger) :: info critical\n")
    with open(log_base + ".log.debug") as handle:
        for suffix in debug_lines:
            assert handle.readline().endswith(suffix)
def test_run_prodigal_small():
    """
    Test that when the output directory does not exist, run_prodigal creates
    it, and runs prodigal with all expected outfiles (faa, ffn and gff).
    Here, "small option" is given instead of a training file, and the
    expected prodigal command ends with '-p meta -q' instead of '-t <trn file>'.

    Messages expected in the queue, in this order: start of annotation,
    prodigal command line, end of annotation.
    """
    logger = my_logger("test_run_prodigal_small")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_small')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    cores_prodigal = 2
    name = "test_runprodigal_small_H299"
    force = False
    trn_file = "small option"
    nbcont = 3
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # Check content of faa, ffn and gff files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta_small-prodigalRes",
                           "test_runprodigal_small_H299")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    out_gff = os.path.join(out_dir, name + ".gff")
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)
    # The existence check must be on the gff file, which is compared just after
    assert os.path.isfile(out_gff)
    assert tutil.compare_order_content(exp_dir + ".gff", out_gff)

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    prodigal_cmd = q.get().message
    assert ("Prodigal command: prodigal -i test/data/annotate/genomes/"
            "H299_H561.fasta -d test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.ffn -a test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.faa -f gff -o test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.gff -p meta -q") in prodigal_cmd
    assert q.get().message.startswith("End annotating")
def test_run_prodigal_out_doesnt_exist():
    """
    Test that when the output directory does not exist, run_prodigal creates
    it, and runs prodigal with all expected outfiles (faa, ffn and gff).

    Messages expected in the queue, in this order: start of annotation,
    prodigal command line, end of annotation.
    """
    logger = my_logger("test_run_prodigal_out_doesnt_exist")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_out_doesnt_exist')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    cores_prodigal = 2
    name = "test_runprodigal_H299"
    force = False
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    nbcont = 3
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # Check content of faa, ffn and gff files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-prodigalRes",
                           "ESCO.1015.00001")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    out_gff = os.path.join(out_dir, name + ".gff")
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)
    # The existence check must be on the gff file, which is compared just after
    assert os.path.isfile(out_gff)
    assert tutil.compare_order_content(exp_dir + ".gff", out_gff)

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message == (
        "Prodigal command: prodigal -i test/data/annotate/genomes/"
        "H299_H561.fasta -d test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.ffn -a test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.faa -f gff -o test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.gff -t "
        "test/data/annotate/test_files/A_H738-and-B2_A3_5.fna.trn "
        "-q")
    assert q.get().message.startswith("End annotating")
def test_run_prokka_out_exists_error():
    """
    Check that when the prokka result folder already exists, but its tbl file
    is missing (only fna, ffn, faa and gff files are put in it), run_prokka
    returns a false value, and logs: the start of annotation, the warning on
    the already existing folder, the error on the missing tbl file, and the
    message on the problems found in the existing output dir.
    """
    logger = my_logger("test_run_prokka_out_exists_error")
    utils.init_logger(LOGFILE_BASE, 0, 'prokka_out_error')
    name = "prokka_out_for_test-wrongCDS"
    # Create the result folder, with all prokka files except the tbl one
    source_dir = os.path.join(TEST_DIR, "original_name.fna-prokkaRes")
    source_name = "prokka_out_for_test"
    result_dir = os.path.join(GENEPATH, "original_name-error-prokkaRes")
    os.makedirs(result_dir)
    for ext in (".fna", ".ffn", ".faa", ".gff"):
        shutil.copyfile(os.path.join(source_dir, source_name + ext),
                        os.path.join(result_dir, name + ext))
    msg_queue = logger[0]
    # Arguments: genome path, output dir, cores, name, force, nb contigs,
    # training file, queue receiving the logs
    arguments = ("path/to/nogenome/original_name-error", GENEPATH, 1, name,
                 False, 6, "nofile.trn", msg_queue)
    assert not afunc.run_prokka(arguments)

    assert msg_queue.qsize() == 4
    # start annotating:
    start_msg = msg_queue.get().message
    assert start_msg.startswith("Start annotating")
    # warning, prokka results folder exists:
    exists_msg = msg_queue.get().message
    assert exists_msg == (
        "Prokka results folder test/data/annotate/generated_by_unit-tests/"
        "original_name-error-prokkaRes already exists.")
    # error, no tbl file
    no_tbl_msg = msg_queue.get().message
    assert no_tbl_msg == "prokka_out_for_test-wrongCDS original_name-error: no .tbl file"
    # warning, files in outdir are not as expected
    problem_msg = msg_queue.get().message
    assert problem_msg.startswith(
        "Problems in the files contained in your already existing "
        "output dir (test/data/annotate/generated_by_unit-tests/"
        "original_name-error-prokkaRes)")
def test_run_prodigal_out_exists_error():
    """
    Check that when the prodigal result folder already exists, but its gff
    file is empty, run_prodigal returns a false value, and logs: the start of
    annotation, the warning on the already existing folder, the error on the
    empty result file, and the message on the problems found in the existing
    output dir.
    """
    logger = my_logger("test_run_prodigal_out_exists_error")
    utils.init_logger(LOGFILE_BASE, 0, 'prodigal_out_error')
    name = "prodigal_out_for_test-wrongCDS"
    # Create the result folder, with correct ffn and faa files...
    source_dir = os.path.join(TEST_DIR, "original_name.fna-prodigalRes")
    source_name = "prodigal.outtest.ok"
    result_dir = os.path.join(GENEPATH, "original_name-error-prodigalRes")
    os.makedirs(result_dir)
    for ext in (".ffn", ".faa"):
        shutil.copyfile(os.path.join(source_dir, source_name + ext),
                        os.path.join(result_dir, name + ext))
    # ... and an empty gff file
    open(os.path.join(result_dir, name + ".gff"), "w").close()
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    msg_queue = logger[0]
    # Arguments: genome path, output dir, cores, name, force, nb contigs,
    # training file, queue receiving the logs
    arguments = ("path/to/nogenome/original_name-error", GENEPATH, 1, name,
                 False, 7, trn_file, msg_queue)
    assert not afunc.run_prodigal(arguments)

    assert msg_queue.qsize() == 4
    # start annotating:
    start_msg = msg_queue.get().message
    assert start_msg.startswith("Start annotating")
    # warning, prodigal results folder exists:
    exists_msg = msg_queue.get().message
    assert exists_msg == (
        "Prodigal results folder test/data/annotate/"
        "generated_by_unit-tests/"
        "original_name-error-prodigalRes already exists.")
    # error, empty gff
    expected_empty = ("Genome prodigal_out_for_test-wrongCDS (from original_name-error): "
                      "At least one of your Prodigal result file is empty.")
    empty_msg = msg_queue.get().message
    assert empty_msg == expected_empty
    # warning, files in outdir are not as expected
    problem_msg = msg_queue.get().message
    assert problem_msg.startswith(
        "Problems in the files contained in your already existing "
        "output dir (test/data/annotate/generated_by_unit-tests/"
        "original_name-error-prodigalRes")
def test_run_prodigal_noout_notrain():
    """
    Case where no prodigal result folder is created before the call, and
    where the training file given ("ghost_trn_file") is not created either:
    run_prodigal must return a false value, without logging anything in
    the queue.
    """
    logger = my_logger("test_run_prodigal_out_exists_error")
    utils.init_logger(LOGFILE_BASE, 0, 'prodigal_out_error')
    msg_queue = logger[0]
    # Arguments: genome path, output dir, cores, name, force, nb contigs,
    # training file, queue receiving the logs
    arguments = ("path/to/nogenome/original_name-error", GENEPATH, 1,
                 "prodigal_out_for_test-wrongCDS", False, 7, "ghost_trn_file",
                 msg_queue)
    assert not afunc.run_prodigal(arguments)
    # Nothing must have been logged
    assert msg_queue.qsize() == 0
def test_run_all_prodigal_error_train():
    """
    Check that when prodigal training fails on the genome chosen for it,
    run_annotation_all returns a false value for all genomes.
    According to the original description of this test, training fails here
    because the genome to train on is too small.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')
    train_genome = "H299_H561.fasta"
    other_genome = "A_H738.fasta"
    train_path = os.path.join(GEN_PATH, train_genome)
    other_path = os.path.join(GEN_PATH, other_genome)
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {
        train_genome: ["test_runall_1by1_1", train_path, train_path, 12656, 3, 0],
        other_genome: ["test_runall_1by1_2", other_path, other_path, 456464645, 1, 465],
    }
    threads = 8
    force = False
    final = afunc.run_annotation_all(genomes, threads, force, GENEPATH, train_genome,
                                     prodigal_only=True, quiet=True)
    assert not final[train_genome]
    assert not final[other_genome]

    # Check logs
    msg_queue = logger[0]
    assert msg_queue.qsize() == 4
    first_msg = msg_queue.get().message
    assert first_msg == "Annotating all genomes with prodigal"
    train_msg = msg_queue.get().message
    assert train_msg == ("Prodigal will train using "
                         "test/data/annotate/genomes/H299_H561.fasta")
    cmd_msg = msg_queue.get().message
    assert cmd_msg == (
        "prodigal command: prodigal -i "
        "test/data/annotate/genomes/H299_H561.fasta -t "
        "test/data/annotate/generated_by_unit-tests/H299_H561.fasta.trn")
    error_msg = msg_queue.get().message
    assert error_msg == (
        "Error while trying to train prodigal on H299_H561.fasta. See "
        "test/data/annotate/generated_by_unit-tests/"
        "H299_H561.fasta.trn-prodigal-train.log.err.")
def test_run_prodigal_out_problem_running():
    """
    Check that when a problem occurs while trying to run prodigal,
    run_prodigal returns a false value, leaves the result folder empty, and
    logs the error message indicating in which log file to look.

    Messages expected in the queue, in this order: start of annotation,
    prodigal command line, error while trying to run prodigal.
    """
    logger = my_logger("test_run_prodigal_out_problem_running")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_out_problem_running')
    gpath = os.path.join(GEN_PATH, "H299_H561bis.fasta")
    cores_prodigal = 2
    name = "test_runprodigal_H299-error"
    force = False
    nbcont = 3
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert not afunc.run_prodigal(arguments)

    # Check that output directory is empty
    outdir = os.path.join(GENEPATH, "H299_H561bis.fasta-prodigalRes")
    assert os.listdir(outdir) == []

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message.startswith(
        "Prodigal command: prodigal -i test/data/annotate/genomes/"
        "H299_H561bis.fasta -d test/data/annotate/"
        "generated_by_unit-tests/H299_H561bis.fasta-prodigalRes/"
        "test_runprodigal_H299-error.ffn -a test/data/annotate/"
        "generated_by_unit-tests/H299_H561bis.fasta-prodigalRes/"
        "test_runprodigal_H299-error.faa -f gff -o test/data/annotate/"
        "generated_by_unit-tests/H299_H561bis.fasta-prodigalRes/"
        "test_runprodigal_H299-error.gff -t "
        "test/data/annotate/test_files/A_H738-and-B2_A3_5.fna.trn "
        "-q")
    assert q.get().message.startswith(
        "Error while trying to run prodigal. See test/data/"
        "annotate/generated_by_unit-tests/"
        "H299_H561bis.fasta-prodigal.log.err.")
def test_log_no_listen(capsys):
    """
    Check that when messages are logged to a queue (through a QueueHandler),
    but nobody ever listens to this queue, the messages stay in the queue:
    nothing is written in stdout, stderr, or the log files.
    """
    import multiprocessing

    # Create Queue, QueueHandler, and log messages to it
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()
    queue_handler = logging.handlers.QueueHandler(log_queue)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(queue_handler)
    process_log = logging.getLogger('process')
    process_log.debug("debug message")
    process_log.log(utils.detail_lvl(), "detail message")
    process_log.info("info message")
    process_log.warning("warning message")
    process_log.error("error message")
    process_log.critical("critical message")
    log_queue.put(None)

    # Initialize real logger
    logfile = os.path.join(GENEPATH, "test_log_listen")
    utils.init_logger(logfile, 0, '')

    # The 6 messages + None are still waiting in the queue
    assert log_queue.qsize() == 7
    stdout, stderr = capsys.readouterr()
    assert stdout == ""
    assert stderr == ""
    # Log files exist, but are empty
    with open(logfile + ".log", "r") as handle:
        assert handle.readlines() == []
    with open(logfile + ".log.details") as handle:
        assert handle.readlines() == []
    with open(logfile + ".log.err", "r") as handle:
        assert handle.readlines() == []
def test_run_all_prodigal_train_exists_ok():
    """
    Run prodigal on all genomes while a training file for the training genome
    (toto.fasta) already exists in the output folder.

    The existing training file must be reported as reused for all genomes.
    Annotation of toto.fasta is expected to fail (False), while annotation of
    A_H738.fasta is expected to succeed (True).
    """
    logger = my_logger("test_run_prodigal_train_exist_error")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_train_exist_error')

    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1, genome2 = "toto.fasta", "A_H738.fasta"
    gpath1 = os.path.join(GEN_PATH, genome1)
    gpath2 = os.path.join(GEN_PATH, genome2)
    genomes = {
        genome1: ["test_runall_1by1_1", gpath1, gpath1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", gpath2, gpath2, 456464645, 1, 465],
    }
    threads = 8
    force = False
    trn_gname = genome1

    # Copy trn file to outdir, so that panacota detects that it already exists
    shutil.copyfile(os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn"),
                    os.path.join(GENEPATH, "toto.fasta.trn"))

    # Run annotation all
    final = afunc.run_annotation_all(genomes, threads, force, GENEPATH, trn_gname,
                                     prodigal_only=True, quiet=False)
    assert not final[genome1]
    assert final[genome2]

    q = logger[0]
    assert q.qsize() == 9

    # The first 3 messages come in a fixed order
    ordered_messages = (
        "Annotating all genomes with prodigal",
        ("Prodigal will train using "
         "test/data/annotate/genomes/toto.fasta"),
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will be used "
         "to annotate all genomes."),
    )
    for expected in ordered_messages:
        assert q.get().message == expected

    # Check that all other messages exist. We cannot know in which order,
    # as 'genomes' is a dict, hence unordered, and as computation is done in parallel
    messages = [q.get().message for _ in range(6)]
    unordered_messages = (
        # Start annotation messages
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/toto.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal cmd
        ("Prodigal command: prodigal -i test/data/annotate/genomes/toto.fasta "
         "-d test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        # End of annotation: error for toto.fasta, success for A_H738.fasta
        ("Error while trying to run prodigal. See "
         "test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigal.log.err."),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in unordered_messages:
        assert expected in messages
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels,
         ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
         l90, nbcont, cutn, min_dist, max_dist, verbose, quiet):
    """
    Main method, constructing the draft dataset for the given species

    Depending on the options, it downloads the genomes (default), or reuses already
    downloaded sequences (norefseq), runs the quality control, and then filters the
    genomes with mash. With only_mash, only the mash filter is run, from an
    existing info file.

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download (strain names, or a file: in that case, the
        file basename without extension is used to name outputs)
    levels : str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        NCBI section used: it names the download folder (outdir/ncbi_section/bacteria)
        and is passed to the download functions
    outdir : str
        path to output directory (where created database will be saved).
        If empty, a name deduced from species name/taxid/strains is used
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N').
        If empty, outdir/tmp_files is used
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user user already has the database and quality of each genome
        (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before (columns
        to_annotate, gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each when there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : int or float
        lower limit of distance between 2 genomes to keep them
    max_dist : int or float
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity:
        - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO
          and more, .log.err contains warning and more
        - 1: same as 0 + WARNING in stderr
        - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
        - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
          from info to debug
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    info_file
        value returned by fg.write_outputfiles (presumably the file listing
        the genomes kept after the mash step -- confirm in filter_genomes)

    Raises
    ------
    SystemExit
        (exit code 1) if the sequences folder or the info file needed by the
        requested steps cannot be found
    """
    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))
    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxID given, use taxID (if given)
    # to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"

    # Default outdir is species name if given, or species taxID if not
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for 2 <= verbose < 15, ignore only debug
    elif verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    else:
        level = logging.DEBUG
    # Only the file name is formatted: formatting the whole joined path would
    # break (or be altered) if outdir itself contains '{' or '}'.
    logfile_base = os.path.join(outdir, f"PanACoTA_prepare_{species_linked}")
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare',
                                             log_details=True, verbose=verbose,
                                             quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += "cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.back'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on
        # given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/refeq/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from refseq/bacteria folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section,
                                                    ncbi_species_name, ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains, levels,
                                                    outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to
        # remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        # info-file missing (not given, or given but not existing) -> error and exit.
        # 'not info_file' avoids a TypeError from os.path.exists(None).
        if not info_file or not os.path.exists(info_file):
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the "
                "right name/path, or remove the '--mash-only option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info("You want to run only mash steps. Getting information "
                    f"from {info_file}")
        genomes = utils.read_genomes_info(info_file, species_linked)

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate,
    #                          size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
                                min_dist, max_dist, threads, quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file
def test_run_prokka_out_exists_force():
    """
    The prokka output folder already exists and contains (empty, hence wrong)
    tbl, faa and ffn files. As force is on, prokka must be rerun, and those
    files replaced by the real prokka output.
    """
    logger = my_logger("test_run_prokka_out_exists_force")
    utils.init_logger(LOGFILE_BASE, 0, 'force')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_prokdir = os.path.join(GENEPATH, "H299_H561.fasta-prokkaRes")
    name = "test_runprokka_H299"
    out_tbl = os.path.join(out_prokdir, name + ".tbl")
    out_faa = os.path.join(out_prokdir, name + ".faa")
    out_ffn = os.path.join(out_prokdir, name + ".ffn")

    # Put empty tbl, faa, ffn files in prokka output dir, to check that they are overridden
    os.makedirs(out_prokdir)
    for empty_file in (out_tbl, out_faa, out_ffn):
        with open(empty_file, "w"):
            pass

    cores_prokka = 2
    force = True
    nbcont = 3
    trn_file = "nofile.trn"
    arguments = (gpath, GENEPATH, cores_prokka, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prokka(arguments)

    # As we used 'force', tbl, faa and ffn files, which were empty, must have been replaced
    # by the prokka output
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-short-contig.fna-prokkaRes",
                           "test_runprokka_H299")
    assert os.path.isfile(out_tbl)
    # For tbl file, check that, at least, the 3 contigs were considered,
    # and that the number of CDS is as expected.
    # Before, we checked that the output was exactly as expected. But it changes with the
    # different versions of prokka, so we cannot compare the whole file.
    with open(out_tbl, "r") as outt:
        tbl_lines = [line.strip() for line in outt.readlines()]
    # Check that there are 3 contigs
    nb_features = sum(1 for line in tbl_lines if 'Feature' in line)
    assert nb_features == 3
    # Check that there are 16 CDS
    nb_cds = sum(1 for line in tbl_lines if "CDS" in line)
    assert nb_cds == 16

    # Check that faa and ffn files are as expected
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)

    # Check logs: start, folder removed because of force, prokka command, end
    q = logger[0]
    assert q.qsize() == 4
    assert q.get().message.startswith(
        "Start annotating test_runprokka_H299 from test/data/"
        "annotate/genomes/H299_H561.fasta with Prokka")
    assert q.get().message == (
        "Prokka results folder already exists, but removed because "
        "--force option used")
    assert q.get().message == (
        "Prokka command: prokka "
        "--outdir test/data/annotate/generated_by_unit-tests/"
        "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runprokka_H299 "
        "--centre prokka test/data/annotate/genomes/H299_H561.fasta")
    assert q.get().message.startswith(
        "End annotating test_runprokka_H299 "
        "from test/data/annotate/genomes/H299_H561.fasta")
def test_run_all_prodigal_outexists_error():
    """
    The training file and the prodigal result folders of both genomes already exist,
    and force is off. As the result folders are empty, checking the existing
    prodigal results fails: error messages are logged and annotation is
    reported as failed for both genomes.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')

    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1, genome2 = "toto.fasta", "A_H738.fasta"
    genomes = {
        genome1: ["test_runall_1by1_1", genome1, genome1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", genome2, genome2, 456464645, 1, 465],
    }

    # Create (empty) prodigal result directories
    for resdir in ("A_H738.fasta-prodigalRes", "toto.fasta-prodigalRes"):
        os.makedirs(os.path.join(GENEPATH, resdir))

    # Other parameters
    threads = 1
    force = False
    trn_gname = genome1

    # Add existing training file
    shutil.copyfile(os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn"),
                    os.path.join(GENEPATH, "toto.fasta.trn"))

    final = afunc.run_annotation_all(genomes, threads, force, GENEPATH, trn_gname,
                                     prodigal_only=True, quiet=False)
    assert not final[genome1]
    assert not final[genome2]

    q = logger[0]
    assert q.qsize() == 15

    # The first 3 messages come in a fixed order
    ordered_messages = (
        "Annotating all genomes with prodigal",
        "Prodigal will train using toto.fasta",
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will "
         "be used to annotate all genomes."),
    )
    for expected in ordered_messages:
        assert q.get().message == expected

    # Check that all other messages exist. We cannot know in which order,
    # as 'genomes' is a dict, hence unordered, and as computation is done in parallel
    messages = [q.get().message for _ in range(12)]
    unordered_messages = (
        # Genome 1 (toto.fasta)
        ("Start annotating test_runall_1by1_1 "
         "(from toto.fasta sequence) with Prodigal"),
        ("Prodigal results folder test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes already exists."),
        "test_runall_1by1_1 toto.fasta: no or several .faa file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .ffn file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .gff file(s)",
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
        # Genome 2 (A_H738.fasta)
        ("Start annotating test_runall_1by1_2 "
         "(from A_H738.fasta sequence) with Prodigal"),
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
    )
    for expected in unordered_messages:
        assert expected in messages
def test_run_all_1by1_prokka():
    """
    Check that when running with 3 threads (not parallel), prokka runs as expected,
    and returns True for each genome
    -> Runs 1 by 1, with prokka using 3 cpus

    Start and end must be ordered: (start1, end1, start2, end2) or
    (start2, end2, start1, end1)
    """
    logger = my_logger("test_runall_1by1_1")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_1by1')
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1 = "H299_H561.fasta"
    gpath1 = os.path.join(GEN_PATH, genome1)
    genome2 = "A_H738.fasta"
    gpath2 = os.path.join(GEN_PATH, genome2)
    genomes = {
        genome1: ["test_runall_1by1_1", gpath1, gpath1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", gpath2, gpath2, 456464645, 1, 465]
    }
    threads = 3
    force = False
    trn_file = "nofile.trn"
    annot_folder = os.path.join(GENEPATH, "annot-folder")
    os.makedirs(annot_folder)
    final = afunc.run_annotation_all(genomes, threads, force, annot_folder, trn_file)
    assert final[genome1]
    assert final[genome2]

    q = logger[0]
    # 1 global message + (start, prokka command, end) for each of the 2 genomes
    assert q.qsize() == 7
    assert q.get().message == 'Annotating all genomes with prokka'

    # Messages for start and end annotation of the different genomes.
    # Start messages use the same wording as checked (by exact equality) in the
    # other prokka tests of this file.
    message_start_annot1 = (
        "Start annotating test_runall_1by1_1 from test/data/annotate/genomes/"
        "H299_H561.fasta with Prokka")
    message_cmd1 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/H299_H561.fasta-prokkaRes --cpus 3")
    message_end_annot1 = (
        "End annotating test_runall_1by1_1 from test/data/annotate/genomes/"
        "H299_H561.fasta.")
    message_start_annot2 = (
        "Start annotating test_runall_1by1_2 from test/data/annotate/genomes/"
        "A_H738.fasta with Prokka")
    message_cmd2 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/A_H738.fasta-prokkaRes --cpus 3")
    message_end_annot2 = (
        "End annotating test_runall_1by1_2 from test/data/annotate/genomes/"
        "A_H738.fasta.")

    # Check logs. We cannot know which genome is annotated first. But, as genomes are
    # annotated 1 by 1, the start message of a genome must be directly followed by the
    # prokka command and the end message of this same genome.
    # Note: 'assert x == a or b' is always true (non-empty string 'b' is truthy), so
    # a real membership test is used here. Popping the entry also ensures that
    # each genome is seen exactly once.
    expected_by_start = {
        message_start_annot1: (message_cmd1, message_end_annot1),
        message_start_annot2: (message_cmd2, message_end_annot2),
    }
    for _ in range(2):
        message_start = q.get().message
        assert message_start in expected_by_start
        message_cmd, message_end = expected_by_start.pop(message_start)
        assert q.get().message.startswith(message_cmd)
        assert q.get().message == message_end
def test_run_all_prokka_parallel_less_threads():
    """
    Run prokka on 5 genomes with only 4 threads, i.e. less threads than genomes
    (each genome uses 2 threads).

    Genomes H299 and A_H738 should run well, but genomes genome* have problems,
    so that False must be returned for them.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')

    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    gnames = ["H299_H561.fasta", "A_H738.fasta", "genome1.fasta", "genome2.fasta",
              "genome3.fasta"]
    gpaths = [os.path.join(GEN_PATH, gname) for gname in gnames]
    # (name, size, nbcont) of each genome, in the same order as gnames
    descriptions = [
        ("test_runall_1by1_1", 12656, 3),
        ("test_runall_1by1_2", 456464645, 1),
        ("test_runall_1by1_3", 456464645, 4),
        ("test_runall_1by1_4", 456464645, 3),
        ("test_runall_1by1_5", 456464645, 1),
    ]
    genomes = {
        gname: [name, gpath, gpath, size, nbcont, 1]
        for gname, gpath, (name, size, nbcont) in zip(gnames, gpaths, descriptions)
    }
    threads = 4
    force = False
    trn_file = "nofile.trn"
    final = afunc.run_annotation_all(genomes, threads, force, GENEPATH, trn_file)

    # 2 first genomes are ok, the 3 others are not
    for gname in gnames[:2]:
        assert final[gname]
    for gname in gnames[2:]:
        assert not final[gname]

    q = logger[0]
    # Check size of logs
    # -> starting log -> 1 log
    # -> for each genome ok (2 first ones): start annotate, prokka cmd, end annotate -> 6 logs
    # -> for each genome not ok (3 others):
    #       start annotate, prokka cmd, problem, end annotate -> 12 logs
    assert q.qsize() == 19
    assert q.get().message == "Annotating all genomes with prokka"

    # Check that all other messages exist. We cannot know in which order,
    # as 'genomes' is a dict, hence unordered, and as computation is done in parallel
    messages = [q.get().message for _ in range(18)]
    expected_messages = (
        # messages start annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/A_H738.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_4 "
         "from test/data/annotate/genomes/genome2.fasta "
         "with Prokka"),
        # messages Prokka cmd
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/A_H738.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "genome1.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_3 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # Messages end annotation cmd
        ("End annotating test_runall_1by1_1 from "
         "test/data/annotate/genomes/H299_H561.fasta."),
        ("End annotating test_runall_1by1_3 from "
         "test/data/annotate/genomes/genome1.fasta."),
        ("End annotating test_runall_1by1_5 from "
         "test/data/annotate/genomes/genome3.fasta."),
        # Messages error annotation cmd
        "test_runall_1by1_3 genome1.fasta: several .faa files",
        "test_runall_1by1_4 genome2.fasta: several .faa files",
        "test_runall_1by1_5 genome3.fasta: several .faa files",
    )
    for expected in expected_messages:
        assert expected in messages
def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor, verbose, quiet):
    """
    Read pangenome and deduce Persistent genome according to the user criteria

    Parameters
    ----------
    cmd : str
        command line used to launch this program (written in the logs)
    pangenome : str
        file containing pangenome
    tol : float
        min % of genomes present in a family to consider it as persistent
        (between 0 and 1)
    multi : bool
        True if multigenic families are allowed, False otherwise
    mixed : bool
        True if mixed families are allowed, False otherwise
    outputdir : str or None
        Specific directory for the generated persistent genome. If not given,
        pangenome directory is used.
    lstinfo_file : str
        list of genomes to include in the core/persistent genome. If not given,
        include all genomes of pan
    floor : bool
        Require at least floor(nb_genomes*tol) genomes if True,
        ceil(nb_genomes*tol) if False
    verbose : int
        verbosity:
        - defaut 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        path to the file in which the persistent genome is written
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA import utils_pangenome as utilsp
    import PanACoTA.corepers_module.persistent_functions as pers
    from PanACoTA import __version__ as version

    # get pangenome directory and name info
    path_pan, base_pan = os.path.split(pangenome)
    if lstinfo_file:
        _, base_lst = os.path.split(lstinfo_file)
    else:
        base_lst = "all"

    # Define output filename
    output_name = f"PersGenome_{base_pan}-{base_lst}_"
    if floor:
        output_name += "F"
    output_name += str(tol)
    if multi:
        output_name += "-multi.lst"
    elif mixed:
        output_name += "-mixed.lst"
    else:
        output_name += ".lst"

    # Define output directory and filename path
    # If no output directory is given, use the directory of the pangenome file, as
    # documented ('.' when the pangenome is given without any directory part).
    if not outputdir:
        outputdir = path_pan or "."
    os.makedirs(outputdir, exist_ok=True)
    outputfile = os.path.join(outputdir, output_name)
    logfile_base = os.path.join(outputdir, "PanACoTA-corepers")

    # level is the minimum level that will be considered.
    # for verbose = 0 or 1, ignore details and debug, start from info
    if verbose <= 1:
        level = logging.INFO
    # for 2 <= verbose < 15, ignore only debug
    elif verbose < 15:
        level = 15  # int corresponding to detail level
    # for verbose >= 15, write everything
    else:
        level = logging.DEBUG
    utils.init_logger(logfile_base, level, 'corepers', verbose=verbose, quiet=quiet)
    logger = logging.getLogger("corepers")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    logger.info(get_info(tol, multi, mixed, floor))

    # Read pangenome
    fams_by_strain, families, all_strains = utilsp.read_pangenome(pangenome, logger)
    # If list of genomes given, get subset of previous dicts, including only
    # the genomes asked
    if lstinfo_file:
        fams_by_strain, families, all_strains = pers.get_subset_genomes(
            fams_by_strain, families, lstinfo_file)
    # Generate persistent genome
    fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol, multi,
                         mixed, floor)
    # Write persistent genome to file
    pers.write_persistent(fams, outputfile)
    logger.info("Persistent genome step done.")
    return outputfile
def test_run_all_prodigal():
    """
    Run prodigal on all genomes (training on A_H738.fasta), and check that
    everything goes well for both genomes.

    Start and end are not necessarily in the same order
    (ex: start1, start2, end2, end1)
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')

    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1, genome2 = "H299_H561.fasta", "A_H738.fasta"
    gpath1 = os.path.join(GEN_PATH, genome1)
    gpath2 = os.path.join(GEN_PATH, genome2)
    genomes = {
        genome1: ["test_runall_1by1_1", gpath1, gpath1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", gpath2, gpath2, 456464645, 1, 465],
    }
    threads = 8
    force = False
    trn_gname = genome2
    final = afunc.run_annotation_all(genomes, threads, force, GENEPATH, trn_gname,
                                     prodigal_only=True, quiet=True)
    assert final[genome1]
    assert final[genome2]

    q = logger[0]
    assert q.qsize() == 10

    # The first 4 messages (training step) come in a fixed order
    training_messages = (
        "Annotating all genomes with prodigal",
        "Prodigal will train using test/data/annotate/genomes/A_H738.fasta",
        ("prodigal command: prodigal -i "
         "test/data/annotate/genomes/A_H738.fasta -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn"),
        "End training on test/data/annotate/genomes/A_H738.fasta",
    )
    for expected in training_messages:
        assert q.get().message == expected

    # Check that all other messages exist. We cannot know in which order,
    # as 'genomes' is a dict, hence unordered, and as computation is done in parallel
    messages = [q.get().message for _ in range(6)]
    annotation_messages = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/H299_H561.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal cmd
        ("Prodigal command: prodigal -i test/data/annotate/genomes/H299_H561.fasta "
         "-d test/data/annotate/generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        # End annotation
        ("End annotating test_runall_1by1_1 (from test/data/annotate/genomes/"
         "H299_H561.fasta)"),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in annotation_messages:
        assert expected in messages
def test_run_all_parallel_prokka_more_threads():
    """
    Annotate with prokka, giving more threads than genomes
    (6 threads for 2 genomes: each prokka command is expected with '--cpus 3').

    Genome H299 should be annotated without problem, while genome1.fasta
    should end with an error.
    """
    log_queue = my_logger("test_run_all_parallel_more_threads")[0]
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')

    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    # (here, gpath is also given as annot_path)
    genome_ok = "H299_H561.fasta"
    genome_error = "genome1.fasta"
    path_ok = os.path.join(GEN_PATH, genome_ok)
    path_error = os.path.join(GEN_PATH, genome_error)
    genomes = {
        genome_ok: ["test_runall_1by1_1", path_ok, path_ok, 12656, 3, 1],
        genome_error: ["test_runall_1by1_2", path_error, path_error, 456464645, 4, 1],
    }
    nb_threads = 6
    overwrite = False
    training_file = "nofile.trn"

    results = afunc.run_annotation_all(genomes, nb_threads, overwrite, GENEPATH, training_file)
    assert results[genome_ok]
    assert not results[genome_error]

    # Expected number of logs:
    # -> starting log -> 1 log
    # -> genome ok: start annotate, prokka cmd, end annotate -> 3 logs
    # -> genome not ok: start annotate, prokka cmd, problem, end annotate -> 4 logs
    assert log_queue.qsize() == 8
    assert log_queue.get().message == "Annotating all genomes with prokka"

    # All other messages: we cannot know in which order they arrive, as 'genomes'
    # is a dict, hence unordered, and as computation is done in parallel
    remaining = [log_queue.get().message for _ in range(7)]
    expected_messages = (
        # Start of annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/genome1.fasta with Prokka"),
        # Prokka command lines
        ("Prokka command: prokka "
         "--outdir test/data/annotate/generated_by_unit-tests/H299_H561.fasta-prokkaRes "
         "--cpus 3 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka "
         "--outdir test/data/annotate/generated_by_unit-tests/genome1.fasta-prokkaRes "
         "--cpus 3 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # End of annotation
        "End annotating test_runall_1by1_1 from test/data/annotate/genomes/H299_H561.fasta.",
        "End annotating test_runall_1by1_2 from test/data/annotate/genomes/genome1.fasta.",
        # Error raised for the genome which cannot be annotated
        "test_runall_1by1_2 genome1.fasta: several .faa files",
    )
    for expected in expected_messages:
        assert expected in remaining