def test_main_norefseq_wrongdbpath(capsys):
    """
    Run prepare with option norefseq and a db_dir ("dbdir") which does not exist.
    -> prepare exits, with an error message about the missing database folder
    """
    # No species name, taxid, strain nor assembly level given
    species_name = species_taxid = taxid = strains = levels = ""
    section = "refseq"
    # Output and temporary folders
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, "temporary_directory")
    # Skip downloads, and give a database folder which is not there
    norefseq, db_dir = True, "dbdir"
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 15, False

    with pytest.raises(SystemExit):
        prepare.main("cmd", species_name, species_taxid, taxid, strains, levels, section,
                     outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
                     l90, nbcont, cutn, min_dist, max_dist, verbose, quiet)

    # Both messages must have been written to stderr
    _, err = capsys.readouterr()
    missing_db_msg = ("Database folder dbdir supposed to contain fasta sequences "
                      "does not exist. Please give a valid folder, or leave the "
                      "default directory (no '-d' option)")
    assert "You asked to skip refseq downloads" in err
    assert missing_db_msg in err

    # No assembly summary and no NCBI_genome_download output directory
    assert not os.path.isfile(os.path.join(GENEPATH, "assembly_summary-123.txt"))
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 4 log files expected (.log.debug included, as we put verbose = 15)
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 4
    # tmp files folder exists, but is empty
    assert os.listdir(tmp_dir) == []
    # Database_init folder must not exist
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def test_main_wrong_taxid(capsys):
    """
    Run prepare on genbank section, giving NCBI taxid "123".
    -> prepare exits with the 'No strain correspond to your request' error message,
    and neither summary file, download folder nor Database_init folder exist.
    """
    # Only a NCBI taxid is given
    species_name = species_taxid = strains = levels = ""
    taxid = "123"
    section = "genbank"
    # Output and temporary folders
    outdir = GENEPATH
    tmp_dir = os.path.join(GENEPATH, "123", "temporary_directory")
    norefseq, db_dir = False, ""
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    with pytest.raises(SystemExit):
        prepare.main("cmd", species_name, species_taxid, taxid, strains, levels, section,
                     outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
                     l90, nbcont, cutn, min_dist, max_dist, verbose, quiet)

    # Error message expected on stderr
    _, err = capsys.readouterr()
    no_strain_msg = ("No strain correspond to your request. If you are sure there should have "
                     "some, check that you gave valid NCBI taxid and/or "
                     "NCBI species name and/or NCBI strain name. If you gave several, check that "
                     "given taxIDs and names are compatible.")
    assert no_strain_msg in err

    # No assembly summary and no NCBI_genome_download output directory
    assert not os.path.isfile(os.path.join(outdir, "assembly_summary-123.txt"))
    assert not os.path.isdir(os.path.join(outdir, "genbank", "bacteria"))
    # 3 log files expected
    assert len(glob.glob(os.path.join(outdir, "*log*"))) == 3
    # tmp files folder exists, but is empty
    assert os.listdir(tmp_dir) == []
    # Database_init folder must not exist
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))
def test_only_mash_empty_lstinfo(capsys):
    """
    Run only the mash step, giving an empty lstinfo file.
    -> prepare exits: no genome found in the lstinfo file
    """
    species_name = species_taxid = taxid = strains = levels = ""
    section = "refseq"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq, db_dir = False, ""
    only_mash = True
    # The lstinfo file given to prepare exists, but has no content
    info_file = os.path.join(GENEPATH, "LSTINFO-empty.lst")
    with open(info_file, "w"):
        pass
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 1, False

    with pytest.raises(SystemExit):
        prepare.main("cmd", species_name, species_taxid, taxid, strains, levels, section,
                     outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
                     l90, nbcont, cutn, min_dist, max_dist, verbose, quiet)

    # Check messages written to stdout and stderr
    out, err = capsys.readouterr()
    reading_msg = ("You want to run only mash steps. Getting information from "
                   "test/data/prepare/generated_by_func-tests/LSTINFO-empty.lst")
    no_genome_msg = ("No genome listed in test/data/prepare/generated_by_func-tests/"
                     "LSTINFO-empty.lst was found.")
    assert "You asked to run only mash steps" in err
    assert reading_msg in out
    assert no_genome_msg in err

    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(outdir, "tmp_files")) == []
    # 3 log files expected
    assert len(glob.glob(os.path.join(outdir, "*log*"))) == 3
    # lstinfo file is still here, and still empty
    assert os.stat(info_file).st_size == 0
def test_only_mash_no_lstinfo(capsys):
    """
    Run only the mash step, giving an info file which does not exist.
    -> prepare exits: error on the missing info file
    """
    species_name = species_taxid = taxid = strains = levels = ""
    section = "refseq"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq, db_dir = False, ""
    only_mash = True
    # Name of an info file which is never created by this test
    info_file = "info_file.lst"
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 1, False

    with pytest.raises(SystemExit):
        prepare.main("cmd", species_name, species_taxid, taxid, strains, levels, section,
                     outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
                     l90, nbcont, cutn, min_dist, max_dist, verbose, quiet)

    # Both messages must have been written to stderr
    _, err = capsys.readouterr()
    missing_info_msg = ("Your info file info_file.lst does not exist. Please provide the "
                        "right name/path, or remove the '--mash-only option to rerun "
                        "quality control.")
    assert "You asked to run only mash steps" in err
    assert missing_info_msg in err

    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(outdir, "tmp_files")) == []
    # 3 log files expected
    assert len(glob.glob(os.path.join(outdir, "*log*"))) == 3
    # outdir contains only 4 elements: 3 logs + tmp_files folder
    # (entries with "fuse" in their name are not counted)
    counted = [entry for entry in os.listdir(outdir) if "fuse" not in entry]
    assert len(counted) == 4
def test_main_not_only_mash_infoexists():
    """
    Run prepare without option only_mash, while still giving an (existing) lstinfo file.
    -> the given file is expected to be kept under the name '<info_file>.back',
    and prepare returns the path of the newly created filtered LSTINFO file.
    """
    # Only a species taxid is given
    species_name = taxid = strains = levels = ""
    species_taxid = "104099"
    section = "refseq"
    # Output and temporary folders
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, "temporary_directory")
    norefseq, db_dir = False, ""
    only_mash = False
    # Create an empty info file, to check that it is renamed
    info_file = os.path.join(outdir, "LSTINFO-existing.lst")
    with open(info_file, "w"):
        pass
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    expected_info = os.path.join(outdir, "LSTINFO-104099-filtered-0.0001_0.06.txt")
    returned_info = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                                 section, outdir, tmp_dir, threads, norefseq, db_dir,
                                 only_mash, info_file, l90, nbcont, cutn, min_dist,
                                 max_dist, verbose, quiet)
    assert returned_info == expected_info

    # Assembly summary file was created
    assert os.path.isfile(os.path.join(GENEPATH, "assembly_summary-104099.txt"))
    # NCBI_genome_download output directory exists, and contains at least 4 entries
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) >= 4
    # 3 log files expected
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3
    # At least 4 split files ('*.fna_prepare-split5N.fna') in the tmp folder
    split_pattern = os.path.join(tmp_dir, "*.fna_prepare-split5N.fna")
    assert len(glob.glob(split_pattern)) >= 4
    # Database_init folder contains at least 4 ".fna" genomes
    fna_pattern = os.path.join(GENEPATH, "Database_init", "*.fna")
    assert len(glob.glob(fna_pattern)) >= 4
    # Existing LSTINFO file was renamed to .back, and is still empty
    backup = info_file + ".back"
    assert os.path.isfile(backup)
    assert os.stat(backup).st_size == 0
def test_main_norefseq_defaultdbdir(capsys):
    """
    Run prepare with option norefseq and no db_dir given, while
    '<outdir>/Database_init' already contains genomes (copied by this test).
    -> prepare runs on those 5 genomes and returns the filtered LSTINFO file path
    """
    species_name = species_taxid = taxid = strains = levels = ""
    section = "refseq"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq = True
    # Put the test genomes in the Database_init folder of the output directory
    shutil.copytree(os.path.join(GEN_PATH, "genomes_comparison"),
                    os.path.join(GENEPATH, "Database_init"))
    db_dir = ""
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 0
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    expected_info = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    returned_info = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                                 section, outdir, tmp_dir, threads, norefseq, db_dir,
                                 only_mash, info_file, l90, nbcont, cutn, min_dist,
                                 max_dist, verbose, quiet)
    assert returned_info == expected_info

    # Check messages written to stdout and stderr
    out, err = capsys.readouterr()
    assert "You asked to skip refseq downloads" in err
    assert "Total number of genomes for NA: 5" in out
    assert "Computing pairwise distances between all genomes" in out
    assert "Final number of genomes in dataset: 1" in out

    # NCBI_genome_download output directory must not exist
    assert not os.path.isdir(os.path.join(GENEPATH, "refseq", "bacteria"))
    # 3 log files expected
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3
    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(GENEPATH, "tmp_files")) == []
    # Database_init folder contains the 5 ".fna" genomes
    fna_pattern = os.path.join(GENEPATH, "Database_init", "*.fna")
    assert len(glob.glob(fna_pattern)) == 5
def test_main_only_strainname():
    """
    Only give strain names (no species taxid etc.). Check that they are downloaded,
    and that the summary file has the expected name.
    """
    # Only strain names are given
    species_name = species_taxid = taxid = levels = ""
    strains = "AS001254,KPPR1,LMG 1583"
    section = "refseq"
    # Output and temporary folders
    outdir = GENEPATH
    tmp_dir = os.path.join(outdir, 'tmp')
    norefseq, db_dir = False, ""
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    expected_info = os.path.join(
        outdir, "LSTINFO-AS001254_and_KPPR1_and_LMG_1583-filtered-0.0001_0.06.txt")
    returned_info = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                                 section, outdir, tmp_dir, threads, norefseq, db_dir,
                                 only_mash, info_file, l90, nbcont, cutn, min_dist,
                                 max_dist, verbose, quiet)
    assert returned_info == expected_info

    # Assembly summary file is named after the 3 strains
    summary = os.path.join(GENEPATH, "assembly_summary-AS001254_and_KPPR1_and_LMG_1583.txt")
    assert os.path.isfile(summary)
    # NCBI_genome_download output directory exists, and contains 3 entries
    ngd_outdir = os.path.join(GENEPATH, "refseq", "bacteria")
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) == 3
    # 3 log files expected
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3
    # tmp folder contains the 3 split files
    split_pattern = os.path.join(tmp_dir, "*.fna_prepare-split5N.fna")
    assert len(glob.glob(split_pattern)) == 3
    # Database_init folder contains the 3 ".fna" genomes
    fna_pattern = os.path.join(GENEPATH, "Database_init", "*.fna")
    assert len(glob.glob(fna_pattern)) == 3
def test_only_mash(capsys):
    """
    Run only the mash step (giving genomes and the corresponding LSTINFO file).
    -> prepare returns the filtered LSTINFO file, whose content is compared
    to the expected one.
    """
    species_name = species_taxid = taxid = strains = levels = ""
    section = "refseq"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq, db_dir = False, ""
    only_mash = True
    info_file = os.path.join(TEST_DIR, "test_lstinfo_onlymash.lst")
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 1, False

    expected_info = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    returned_info = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                                 section, outdir, tmp_dir, threads, norefseq, db_dir,
                                 only_mash, info_file, l90, nbcont, cutn, min_dist,
                                 max_dist, verbose, quiet)
    assert returned_info == expected_info

    # Check messages written to stdout and stderr
    out, err = capsys.readouterr()
    reading_msg = ("You want to run only mash steps. Getting information from "
                   "test/data/prepare/test_files/test_lstinfo_onlymash.lst")
    assert "You asked to run only mash steps" in err
    assert reading_msg in out
    assert "Found 5 genomes in total" in out
    assert "Computing pairwise distances between all genomes" in out
    assert "Sorting all 5 genomes by quality" in out
    assert "Final number of genomes in dataset: 1" in out

    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(outdir, "tmp_files")) == []
    # 3 log files expected
    assert len(glob.glob(os.path.join(outdir, "*log*"))) == 3
    # Check content of output lstinfo file against the expected file
    out_lst = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    exp_lst = os.path.join(DBDIR, "exp_files", "exp_lstinfo_run_only-mash.lst")
    assert tutil.compare_order_content(out_lst, exp_lst)
def test_main_norefseq_nodefault_dbdir_but_refseq(capsys):
    """
    Run prepare with option norefseq on genbank section, no db_dir given and no
    Database_init folder, but with a 'genbank' download folder present in outdir
    (copied by this test).
    -> the sequences of the download folder are uncompressed and used; prepare
    returns the filtered LSTINFO file path.
    """
    # Only a species taxid is given
    species_name = taxid = strains = levels = ""
    species_taxid = "123"
    section = "genbank"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq = True
    # Copy the stored 'refseq' folder and its content into outdir, as 'genbank'
    shutil.copytree(os.path.join(GEN_PATH, "refseq"),
                    os.path.join(GENEPATH, "genbank"))
    db_dir = ""
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 0
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    expected_info = os.path.join(outdir, "LSTINFO-123-filtered-0.0001_0.06.txt")
    returned_info = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                                 section, outdir, tmp_dir, threads, norefseq, db_dir,
                                 only_mash, info_file, l90, nbcont, cutn, min_dist,
                                 max_dist, verbose, quiet)
    assert returned_info == expected_info

    # Check messages written to stdout and stderr
    out, err = capsys.readouterr()
    no_db_msg = ("Database folder test/data/prepare/generated_by_func-tests/"
                 "Database_init supposed "
                 "to contain fasta sequences does not exist. "
                 "We will check if the download folder "
                 "(with compressed sequences) exists.")
    assert "You asked to skip genbank downloads" in err
    assert no_db_msg in err
    assert "Uncompressing genome files" in out
    assert "Total number of genomes for 123: 3" in out
    assert "Computing pairwise distances between all genomes" in out
    assert "Final number of genomes in dataset: 1" in out

    # Download directory exists, and contains 3 entries
    ngd_outdir = os.path.join(GENEPATH, "genbank", "bacteria")
    assert os.path.isdir(ngd_outdir)
    assert len(os.listdir(ngd_outdir)) == 3
    # 3 log files expected
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3
    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(GENEPATH, "tmp_files")) == []
    # Database_init folder created, with the 3 ".fna" genomes
    fna_pattern = os.path.join(GENEPATH, "Database_init", "*.fna")
    assert len(glob.glob(fna_pattern)) == 3
def test_main_norefseq_nodefault_dbdir_nor_refseq(capsys):
    """
    Run prepare with option norefseq on genbank section, no db_dir given, while
    neither the Database_init folder nor the genbank download folder exist.
    -> prepare exits, with an error message listing possible reasons
    """
    species_name = species_taxid = taxid = strains = levels = ""
    section = "genbank"
    outdir = GENEPATH
    tmp_dir = ""
    norefseq, db_dir = True, ""
    only_mash, info_file = False, ""
    # Remaining values are forwarded as they are to prepare.main
    l90, nbcont, cutn = 100, 999, 5
    min_dist, max_dist = 1e-4, 0.06
    threads, verbose, quiet = 1, 2, False

    with pytest.raises(SystemExit):
        prepare.main("cmd", species_name, species_taxid, taxid, strains, levels, section,
                     outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
                     l90, nbcont, cutn, min_dist, max_dist, verbose, quiet)

    # All messages must have been written to stderr
    _, err = capsys.readouterr()
    no_db_msg = ("Database folder test/data/prepare/generated_by_func-tests/Database_init "
                 "supposed to contain fasta sequences does not exist. "
                 "We will check if the download folder "
                 "(with compressed sequences) exists.")
    no_download_msg = ("Folder test/data/prepare/generated_by_func-tests/genbank/bacteria "
                       "does not exist. You do not have any genome to analyse. "
                       "Possible reasons:\n")
    same_folder_msg = ("- if you want to rerun analysis in the same folder as "
                       "sequences were downloaded (my_outdir/Database_init or "
                       "my_outdir/genbank), make sure you have '-o my_outdir' option\n")
    new_folder_msg = ("- if you want to rerun analysis and save them in a new "
                      "output folder called 'new_outdir', make sure you have "
                      "'-o new_outdir' option, "
                      "and you specified where the uncompressed sequences to use are "
                      "('-d sequence_database_path'")
    assert "You asked to skip genbank downloads" in err
    assert no_db_msg in err
    assert no_download_msg in err
    assert same_folder_msg in err
    assert new_folder_msg in err

    # No assembly summary and no download directory
    assert not os.path.isfile(os.path.join(GENEPATH, "assembly_summary-123.txt"))
    assert not os.path.isdir(os.path.join(GENEPATH, "genbank", "bacteria"))
    # 3 log files expected
    assert len(glob.glob(os.path.join(GENEPATH, "*log*"))) == 3
    # tmp_files folder exists, but is empty
    assert os.listdir(os.path.join(GENEPATH, "tmp_files")) == []
    # Database_init folder must not exist
    assert not os.path.isdir(os.path.join(GENEPATH, "Database_init"))
def main(cmd, args_all, args_prepare, args_annot, args_pan, args_corepers, args_align, args_tree):
    """
    Call all modules, one by one, using output of one as input for the next one

    Parameters
    ----------
    cmd : str
        command line used to launch the program
    args_all : tuple
        arguments common to all modules:
        output directory (str), threads (int), verbose (int), quiet (bool)
    args_prepare : tuple
        arguments for prepare module (see subcommands.prepare.py):
        NCBI_species_name (str), NCBI_species_taxid (int), NCBI_taxid (int),
        NCBI_strains (str), levels (str), NCBI_section (str), tmp_dir (str),
        norefseq (bool), db_dir (str), only_mash (bool), info_file (str), l90 (int),
        nbcont (int), cutn (int), min_dist (float), max_dist (float)
    args_annot : tuple
        arguments for annotate module (see subcommands/annotate.py):
        name (str), qc_only (bool), date (str), prodigal_only (bool), small (bool)
    args_pan : tuple
        arguments for pangenome module (see subcommands/pangenome.py):
        min_id (float), clust_mode (int), spe_dir (str), outfile (str)
    args_corepers : tuple
        arguments for corepers module (see subcommands.corepers.py):
        tol (float), mixed (bool), multi (bool), floor (bool)
    args_align : tuple
        arguments for align module (see subcommands.align.py):
        prot_ali (bool). A bare (non-tuple) value is also accepted, and is
        forwarded unchanged.
    args_tree : tuple
        arguments for tree module (see subcommands.tree.py):
        soft (str), model (str), boot (bool), write_boot (bool), memory (str), fast (bool)

    Returns
    -------
    str or int
        "QC_only done" if qc_only is set (the run stops after the annotate step),
        0 once all modules have been run.
    """
    outdir, threads, verbose, quiet = args_all
    os.makedirs(outdir, exist_ok=True)

    # Initialize logger
    import logging
    # set level of logger: level is the minimum level that will be considered.
    # if/elif/else (instead of 3 independent tests) guarantees that 'level' is always bound.
    if verbose <= 1:
        level = logging.INFO
    elif verbose < 15:
        # for verbose from 2 to 14, ignore only debug
        level = utils.detail_lvl()  # int corresponding to detail level
    else:
        # for verbose >= 15, write everything
        level = logging.DEBUG
    logfile_base = os.path.join(outdir, "PanACoTA-all_modules")
    # Return value of init_logger is not needed here
    utils.init_logger(logfile_base, level, name='all_modules', verbose=verbose, quiet=quiet)
    logger = logging.getLogger('all_modules')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Run prepare module
    outdir_prepare = os.path.join(outdir, "1-prepare_module")
    (NCBI_species_name, NCBI_species_taxid, NCBI_taxid, NCBI_strains, levels, NCBI_section,
     tmp_dir, norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
     min_dist, max_dist) = args_prepare
    logger.info("prepare step")
    # prepare returns the info file, which is given as input to the annotate step
    info_file = prepare.main("PanACoTA prepare", NCBI_species_name, NCBI_species_taxid,
                             NCBI_taxid, NCBI_strains, levels, NCBI_section,
                             outdir_prepare, tmp_dir, threads, norefseq, db_dir, only_mash,
                             info_file, l90, nbcont, cutn, min_dist, max_dist,
                             verbose, quiet)

    # Run annotate module
    list_file = ""
    db_path = ""
    tmp_dir = ""
    force = False
    outdir_annotate = os.path.join(outdir, "2-annotate_module")
    (name, qc_only, date, prodigal_only, small) = args_annot
    res_annot_dir = None
    logger.info("annotate step")
    lstinfo, nbgenomes = annotate.main("PanACoTA annotate", list_file, db_path,
                                       outdir_annotate, name, date, l90, nbcont, cutn,
                                       threads, force, qc_only, info_file, tmp_dir,
                                       res_annot_dir, verbose, quiet,
                                       prodigal_only=prodigal_only, small=small)
    # With qc_only, stop here: the following modules are not run
    if qc_only:
        return "QC_only done"

    # Pangenome step
    name_pan = f"{name}_{nbgenomes}"
    outdir_pan = os.path.join(outdir, "3-pangenome_module")
    dbpath = os.path.join(outdir_annotate, "Proteins")
    (min_id, clust_mode, spe_dir, outfile) = args_pan
    logger.info("pangenome step")
    panfile = pangenome.main("PanACoTA pangenome", lstinfo, name_pan, dbpath, min_id,
                             outdir_pan, clust_mode, spe_dir, threads, outfile,
                             verbose=verbose, quiet=quiet)

    # Coregenome step
    outdir_corpers = os.path.join(outdir, "4-corepers_module")
    logger.info("corepers step")
    (tol, mixed, multi, floor) = args_corepers
    lstinfo_file = ""  # include all genomes in core
    corepers_file = corepers.main("PanACoTA corepers", panfile, tol, multi, mixed,
                                  outdir_corpers, lstinfo_file, floor, verbose, quiet)

    # Align step
    outdir_align = os.path.join(outdir, "5-align_module")
    force = False
    logger.info("align step")
    # '(prot_ali) = args_align' is a plain assignment, not a tuple unpacking: with the
    # documented 1-element tuple, the tuple itself (always truthy, even for (False,))
    # would be forwarded to align. Unwrap it; any other value is forwarded unchanged.
    if isinstance(args_align, (tuple, list)) and len(args_align) == 1:
        prot_ali = args_align[0]
    else:
        prot_ali = args_align
    align_file = align.main("PanACoTA align", corepers_file, lstinfo, name_pan,
                            outdir_annotate, outdir_align, prot_ali, threads, force,
                            verbose=verbose, quiet=quiet)

    # Tree step
    (soft, model, boot, write_boot, memory, fast) = args_tree
    outdir_tree = os.path.join(outdir, "6-tree_module")
    logger.info("tree step")
    tree.main("PanACoTA tree", align_file, outdir_tree, soft, model, threads, boot,
              write_boot, memory, fast, verbose=verbose, quiet=quiet)
    logger.info("All modules of PanACOTA are finished.")
    return 0