def test_create_gen_lst(caplog):
    """
    Run prodigalfunc.create_gene_lst on a prodigal ffn file, and check that the
    generated gen and lst files are identical to the expected ones.

    In the test file, all genomes have names different from gembase name.
    According to the original notes of this test, the input file contains:
    - gene in D strand (start < end)
    - gene in C strand (start > end)
    - CDS features
    - contigs with more than 2 genes
    - contig with only 2 genes (both 'b' loc)
    - contig with 1 gene ('b' loc)
    - contig without gene (should be skipped)
    """
    caplog.set_level(logging.DEBUG)
    genome_name = "test.0417.00002"
    original_path = "original_genome_name"
    # Original contig headers, in order. Each one is associated with
    # '<genome_name>.<4-digit rank>', rank starting at 1.
    headers = ["JGIKIPgffgIJ", "toto", "other_header", "my_contig", "bis", "ter",
               "contname"]
    contig_names = {
        header: "{}.{:04d}".format(genome_name, rank)
        for rank, header in enumerate(headers, start=1)
    }
    ffn_input = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes",
                             "prodigal.outtest.ok.ffn")
    gen_out = os.path.join(GENEPATH, "prodigal_res.gen")
    lst_out = os.path.join(GENEPATH, "prodigal_res.lst")
    created = prodigalfunc.create_gene_lst(contig_names, ffn_input, gen_out, lst_out,
                                           original_path, genome_name)
    assert created
    # The lst file is compared first, then the gen file
    assert tutil.compare_order_content(
        os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst"), lst_out)
    assert tutil.compare_order_content(
        os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.gen"), gen_out)
def test_run_prokka_out_doesnt_exist_ok():
    """
    Test that when the output directory does not exist, it creates it, and runs prokka
    with all expected outfiles.

    Checks done after the run:
    - the tbl file exists, and contains 3 lines with 'Feature' and 16 lines with 'CDS'
    - the faa and ffn files exist and have the expected content
    - 3 messages were put in the logger queue (start, prokka command, end)
    """
    logger = my_logger("test_run_prokka_out_doesnt_exist")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prokka_out_doesnt_exist')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prokkaRes")
    cores_prokka = 2
    name = "test_runprokka_H299"
    force = False
    nbcont = 3
    trn_file = "nofile.trn"
    arguments = (gpath, GENEPATH, cores_prokka, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prokka(arguments)

    # Check content of tbl, ffn and faa files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-short-contig.fna-prokkaRes",
                           "test_runprokka_H299")
    out_tbl = os.path.join(out_dir, name + ".tbl")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    assert os.path.isfile(out_tbl)
    # For tbl file, check that, at least, the 3 contigs were considered,
    # and that the number of CDS is as expected.
    # Before, we checked that the output was exactly as expected. But it changes with
    # the different versions of prokka, so we cannot compare the whole file.
    with open(out_tbl, "r") as outt:
        lines = [line.strip() for line in outt]
    # Check that there are 3 contigs (1 'Feature' line per contig)
    assert sum(1 for line in lines if 'Feature' in line) == 3
    # Check that there are 16 CDS
    assert sum(1 for line in lines if "CDS" in line) == 16

    # Check that faa and ffn files are as expected
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)

    # Check the messages put in the logger queue
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message == (
        "Prokka command: prokka "
        "--outdir test/data/annotate/generated_by_unit-tests/"
        "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runprokka_H299 "
        "--centre prokka test/data/annotate/genomes/H299_H561.fasta")
    assert q.get().message.startswith("End annotating")
def test_run_prodigal_out_exists_force():
    """
    Test that when the output directory already exists with wrong files, but force is on,
    prodigal is rerun and outputs the right files.

    Empty gff, faa and ffn files are created in the prodigal output directory before the
    run. After the run, the faa and ffn files are compared with the expected prodigal
    output, and the 4 messages put in the logger queue are checked.
    """
    logger = my_logger("test_run_prodigal_out_exists_force")
    utils.init_logger(LOGFILE_BASE, 0, 'force')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_prokdir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    name = "test_runprodigal_H299"
    # Put empty gff, faa, ffn files in prodigal output dir, to check that they are
    # overridden
    os.makedirs(out_prokdir)
    for ext in (".gff", ".faa", ".ffn"):
        open(os.path.join(out_prokdir, name + ext), "w").close()
    cores_prodigal = 2
    force = True
    nbcont = 3
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # As we used 'force', faa and ffn files, which were empty, must have been replaced
    # by the prodigal output
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-prodigalRes", "ESCO.1015.00001")
    out_faa = os.path.join(out_prokdir, name + ".faa")
    out_ffn = os.path.join(out_prokdir, name + ".ffn")
    # Check that faa and ffn files are as expected
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)

    # Check the messages put in the logger queue
    q = logger[0]
    assert q.qsize() == 4
    assert q.get().message.startswith(
        "Prodigal results folder already exists, but is "
        "removed because --force option was used")
    assert q.get().message.startswith(
        "Start annotating test_runprodigal_H299 (from test/data/"
        "annotate/genomes/H299_H561.fasta sequence) "
        "with Prodigal")
    assert q.get().message.startswith(
        "Prodigal command: prodigal -i test/data/annotate/genomes/"
        "H299_H561.fasta -d test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.ffn -a test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.faa -f gff -o test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.gff -t "
        "test/data/annotate/test_files/A_H738-and-B2_A3_5.fna.trn "
        "-q")
    assert q.get().message.startswith(
        "End annotating test_runprodigal_H299 "
        "(from test/data/annotate/genomes/H299_H561.fasta)")
def test_convert_phylip(caplog):
    """
    Test that when giving a valid fasta alignment file, it converts it to PHYLIP format,
    as expected: the output file exists, has the expected content, and the conversion
    message is logged.
    """
    caplog.set_level(logging.DEBUG)
    outfile = os.path.join(GENEPATH, "test_2phylip")
    fme.convert2phylip(ALIGNMENT, outfile)
    exp_phylip = os.path.join(EXPPATH, "exp_align_phylip.ph")
    assert os.path.isfile(outfile)
    # The comparison result must be asserted: without 'assert', a wrong output file
    # would go unnoticed.
    assert tutil.compare_order_content(outfile, exp_phylip)
    assert "Converting fasta alignment to PHYLIP-relaxed format" in caplog.text
def test_handle_genome_formatok_prodigal(caplog):
    """
    Format a genome which was annotated by prodigal without any problem:
    ffunc.handle_genome must return True, associated with the original genome name,
    and the Replicons, Proteins, Genes, LSTINFO and gff files must be as expected.
    """
    caplog.set_level(logging.DEBUG)
    orig_name = "prodigal.outtest.ok"
    gembase_name = "test.0417.00002"
    # Folder containing the prodigal results
    test_files = os.path.join(ANNOTEDIR, "test_files")
    # path to original genome, given to prodigal for annotation
    genome_path = os.path.join(test_files, "original_name.fna")

    # Create result directories (indexed by the extension of the files they will contain)
    out_dirs = {}
    for ext, folder in [("prt", "Proteins"), ("lst", "LSTINFO"), ("fna", "Replicons"),
                        ("gen", "Genes"), ("gff", "gff")]:
        out_dirs[ext] = os.path.join(GENEPATH, folder)
        os.makedirs(out_dirs[ext])

    # Get args for function
    args = (orig_name, gembase_name, genome_path, test_files,
            out_dirs["lst"], out_dirs["prt"], out_dirs["gen"], out_dirs["fna"],
            out_dirs["gff"], True, my_logger()[0])
    ok_format, genome = ffunc.handle_genome(args)
    assert ok_format == True
    assert genome == orig_name

    # Check generated files: Replicons, Proteins, Genes, LSTINFO, and gff
    expected_files = [
        ("res_created_rep-prokka.fna", "fna"),
        ("res_create_prt_prodigal.faa", "prt"),
        ("res_create_gene_lst_prodigal.gen", "gen"),
        ("res_create_gene_lst_prodigal.lst", "lst"),
        ("res_create_gff_prodigal.gff", "gff"),
    ]
    for exp_name, ext in expected_files:
        exp_file = os.path.join(EXP_ANNOTE, exp_name)
        res_file = os.path.join(out_dirs[ext], gembase_name + "." + ext)
        assert tutil.compare_order_content(exp_file, res_file)
def test_main_onexistingprodigaldir_train_exists(capsys):
    """
    Run the annotation pipeline (prodigal only) with a given results directory, which
    already contains prodigal result folders and a training file.
    Expected (from the original notes): no training, no re-annotation, only formatting.

    2 genomes in list file: B2_A3_5.fasta-changeName.fna and H299_H561.fasta
    """
    # Result folder, filled with copies of the existing prodigal folders
    results_dir = os.path.join(GENEPATH, "results-prodigal")
    os.makedirs(results_dir)
    for prodigal_folder in ("B2_A3_5.fasta-changeName.fna-prodigalRes",
                            "H299_H561.fasta-prodigalRes"):
        shutil.copytree(os.path.join(EXP_DIR, prodigal_folder),
                        os.path.join(results_dir, prodigal_folder))
    # Add an (empty) training file in result folder
    training = os.path.join(results_dir, "H299_H561.fasta.trn")
    open(training, "w").close()

    # Run the pipeline
    genomes_list = os.path.join(TEST_DIR, "list_genomes-func-test-exist_dir.txt")
    expected_lst = os.path.join(EXP_DIR, "exp_LSTINFO-func-annot_exists-prokkadir.lst")
    created_lst = os.path.join(GENEPATH, "LSTINFO-list_genomes-func-test-exist_dir.lst")
    returned = annot.main("cmd", genomes_list, GEN_PATH, GENEPATH, "ESCO", "0417", cutn=0,
                          res_annot_dir=results_dir, verbose=3, prodigal_only=True)
    assert returned == (created_lst, 2)
    capsys.readouterr()

    # tmp files folder must be empty (annotation results are in the given folder)
    assert len(os.listdir(os.path.join(GENEPATH, "tmp_files"))) == 0
    # Test that result files are in result dir
    assert os.path.isfile(created_lst)
    assert tutil.compare_order_content(created_lst, expected_lst)

    # Check content of the detailed log file
    details_log = os.path.join(
        GENEPATH, "PanACoTA-annotate_list_genomes-func-test-exist_dir.log.details")
    with open(details_log, "r") as lc:
        log_text = " ".join(lc.readlines())
    expected_messages = [
        ("A training file already exists (test/data/annotate/generated_by_func-tests/"
         "results-prodigal/H299_H561.fasta.trn). It will be used to annotate "
         "all genomes."),
        ("Prodigal results folder test/data/annotate/generated_by_func-tests/"
         "results-prodigal/H299_H561.fasta-prodigalRes "
         "already exists"),
        ("Prodigal results folder test/data/annotate/generated_by_func-tests/"
         "results-prodigal/B2_A3_5.fasta-changeName.fna-prodigalRes "
         "already exists"),
        ("Prodigal did not run again. Formatting step will use already generated results of "
         "Prodigal in test/data/annotate/generated_by_func-tests/results-prodigal/"
         "H299_H561.fasta-prodigalRes. "
         "If you want to re-run Prodigal, first remove this result folder, or use '-F' or "
         "'--force' option."),
        "Formatting all genomes",
        "Annotation step done",
    ]
    for message in expected_messages:
        assert message in log_text
def test_analyse1genome_cut_prodigal():
    '''
    Analyse 'genome2.fasta' with gfunc.analyse_genome, asking to cut it at each stretch
    of at least 5 'N', for an annotation with prodigal.

    Checks that the split genome file is created in GENEPATH with the expected content,
    and that the entry of this genome in the genomes dict is completed with the initial
    path, the path of the split file, and 3 numbers (genome size, nb contigs and L90
    according to the original notes). Other genomes must be left untouched.
    '''
    genome_files = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
    gembase_names = ["SAEN.1113", "SAEN.1114", "ESCO.0416"]
    genomes = {gfile: [gname] for gfile, gname in zip(genome_files, gembase_names)}
    target = genome_files[1]
    assert gfunc.analyse_genome(target, GEN_PATH, GENEPATH, True, "NNNNN+", genomes,
                                "prodigal", logger)

    # Initial genome, and genome generated by cutting it
    source_path = os.path.join(GEN_PATH, "genome2.fasta")
    split_path = os.path.join(GENEPATH, "genome2.fasta_prodigal-split5N.fna")
    assert os.path.isfile(split_path)
    assert tutil.compare_order_content(split_path,
                                       os.path.join(EXP_DIR, "genome2-split5N.fna"))
    # Only the analysed genome gets new information
    assert genomes == {
        "genome1.fasta": ["SAEN.1113"],
        "genome2.fasta": ["SAEN.1114", source_path, split_path, 55, 5, 4],
        "genome3.fasta": ["ESCO.0416"],
    }
def test_main_default(capsys):
    """
    Run corepers.main with default parameters, and check that it creates the expected
    core genome file, the pangenome binary file and the log files, and that the expected
    messages are written to stdout.
    """
    expected_out = os.path.join(GENEPATH, "PersGenome_pangenome.lst-all_1.lst")
    params = {"tol": 1, "multi": False, "mixed": False, "lstinfo": "", "floor": False,
              "verbose": 0, "quiet": False}
    returned = corepers.main("cmd", UPAN, params["tol"], params["multi"], params["mixed"],
                             GENEPATH, params["lstinfo"], params["floor"],
                             params["verbose"], params["quiet"])
    assert returned == expected_out

    # Binary file of the pangenome must have been created
    assert os.path.isfile(UPAN + ".bin")
    # Persistent genome file must exist, with the expected content
    assert os.path.isfile(expected_out)
    assert tutil.compare_order_content(expected_out,
                                       os.path.join(EXP_PATH, "exp_coregenome.txt"))
    # Log files must exist
    log_base = os.path.join(GENEPATH, "PanACoTA-corepers.log")
    for log_path in (log_base, log_base + ".err"):
        assert os.path.isfile(log_path)

    # Check log messages written to stdout
    stdout_text, _ = capsys.readouterr()
    for message in (
            "Will generate a CoreGenome.",
            "Saving all information to a binary file for later use",
            "Generating Persistent genome of a dataset containing 4 genomes",
            ("The core genome contains 2 families, each one having exactly "
             "4 members, from the 4 different genomes.")):
        assert message in stdout_text
def test_main_frominfo(capsys):
    """
    Run annot.main (prodigal only) from an info file, instead of a list file + database
    path. According to the original notes, L90 and nbcont are not re-calculated.

    Checks the returned value, a message in stdout, the number of files in the
    Proteins, gff3 and LSTINFO folders, and the content of the generated LSTINFO file.
    """
    info_in = os.path.join(TEST_DIR, "lstinfo.lst")
    info_out = os.path.join(GENEPATH, "LSTINFO-lstinfo.lst")
    # No list file and no database path are given
    returned = annot.main("cmd", None, None, GENEPATH, "TOTO", "1205",
                          from_info=info_in, prodigal_only=True)
    assert returned == (info_out, 3)

    # Check logs
    stdout_text, _ = capsys.readouterr()
    assert ("Generating distribution of L90 and #contigs graphs.") in stdout_text

    # Each output folder must contain 3 files
    for folder in ("Proteins", "gff3", "LSTINFO"):
        assert len(os.listdir(os.path.join(GENEPATH, folder))) == 3

    # Check genomes are renamed as expected, and with expected L90/nbcont values
    assert tutil.compare_order_content(
        os.path.join(EXP_DIR, "exp_LSTINFO-test-main-frominfo.lst"), info_out)
def test_create_gff(caplog):
    """
    Check generated gff file. Must have all sequences in header (even replicons without
    gene), and then 1 line per gene.

    prodigalfunc.create_gff is called with the prodigal gff file, the expected lst file,
    the contig names and the contig sizes; its output is compared with the expected gff.
    """
    caplog.set_level(logging.DEBUG)
    gfffile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes",
                           "prodigal.outtest.ok.gff")
    # Original contig header -> new contig name
    contigs = {
        "JGIKIPgffgIJ": "test.0417.00002.0001",
        "toto": "test.0417.00002.0002",
        "other_header": "test.0417.00002.0003",
        "my_contig": "test.0417.00002.0004",
        "bis": "test.0417.00002.0005",
        "ter": "test.0417.00002.0006",
        "contname": "test.0417.00002.0007"
    }
    # New contig name -> size
    sizes = {
        "test.0417.00002.0001": 84,
        "test.0417.00002.0002": 103,
        "test.0417.00002.0003": 122,
        "test.0417.00002.0004": 35,
        "test.0417.00002.0005": 198,
        "test.0417.00002.0006": 128,
        "test.0417.00002.0007": 85,
    }
    res_gff_file = os.path.join(GENEPATH, "prodigal_res.gff")
    exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
    gpath = "original_genome_name"
    assert prodigalfunc.create_gff(gpath, gfffile, res_gff_file, exp_lst, contigs, sizes)
    exp_gff = os.path.join(EXP_ANNOTE, "res_create_gff_prodigal.gff")
    assert tutil.compare_order_content(exp_gff, res_gff_file)
def test_format_contig_cut():
    """
    For a given contig, if we want to annotate it, and cut at each stretch of 5 'N',
    check that it writes this contig, split, in the expected file.

    gfunc.format_contig is called with num = 2 and must return 4. The sizes of the
    2 written contigs (26 and 25) must be saved in contig_sizes.
    """
    cut = True
    pat = 'NNNNN+'
    cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
    cur_contig_name = ">my_contig_name for_my_sequence"
    contig_sizes = {}
    resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna")
    num = 2
    # 'with' guarantees that the file is closed (and flushed) before comparing its
    # content, even if the call or the assertion fails.
    with open(resfile, "w") as gresf:
        assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome",
                                   contig_sizes, gresf, num, logger=None) == 4

    exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna")
    assert os.path.exists(resfile)
    assert tutil.compare_order_content(resfile, exp_file)
    assert contig_sizes == {
        ">2_my_contig_name for_my_sequence\n": 26,
        ">3_my_contig_name for_my_sequence\n": 25
    }
def test_concat_quiet(caplog):
    """
    Copy the alignment files of families 1, 8 and 11 to a new alignment directory, and
    concatenate them with pal.concat_alignments (quiet mode, 'nucl' alignments).
    It must return the expected output filename together with "Done", create the
    expected concatenated file, and log the concatenation message.
    """
    caplog.set_level(logging.DEBUG)
    dataset = "TESTconcat"
    align_dir = os.path.join(GENEPATH, "test_concat_aldir")
    os.makedirs(align_dir)
    # Prepare aldir with all needed alignment files: (source folder, family number)
    for source_dir, fam in [("exp_aldir", 1), ("exp_aldir-pers", 8), ("exp_aldir-pers", 11)]:
        origin = os.path.join(EXPPATH, source_dir, "mafft-prt2nuc.{}.aln".format(fam))
        target = os.path.join(align_dir, "{}-mafft-prt2nuc.{}.aln".format(dataset, fam))
        shutil.copyfile(origin, target)

    # Run concatenation
    output, mess = pal.concat_alignments([1, 8, 11], os.path.join(align_dir, dataset),
                                         "nucl", True)
    assert output == os.path.join(align_dir, dataset + "-complete.nucl.cat.aln")
    assert tutil.compare_order_content(
        output, os.path.join(EXPPATH, "exp_concat_4genomes-fam1-8-11.aln"))
    assert mess == "Done"
    assert "Concatenating all nucl alignment files" in caplog.text
def test_tbl_to_lst_changed_names(caplog):
    """
    Check that generated lstinfo file is as expected, when the genome name is not the
    same as it already was in the genome given to prokka.

    The test tblfile contains the following aspects:
    - gene in D strand (start < end)
    - gene in C strand (start > end)
    - CDS features (some with all info = ECnumber, gene name, product etc. ;
      some with missing info)
    - tRNA type
    - repeat_region type (*2)
    - contigs with more than 2 genes
    - contig with only 2 genes (both 'b' loc)
    - contig with 1 gene ('b' loc)
    - contig without gene (should be skipped)
    """
    caplog.set_level(logging.DEBUG)
    tblfile = os.path.join(TEST_ANNOTE, "prokka_out_tbl_changed-contnames.tbl")
    lstfile = os.path.join(GENEPATH, "res_test_tbl2lst.lst")
    # Contig name given in this test -> new contig name
    contigs = {"toto_1": "test.0417.00002.0001",
               "toto_2": "test.0417.00002.0002",
               "toto_3": "test.0417.00002.0003",
               "toto_4": "test.0417.00002.0004",
               "toto_5": "test.0417.00002.0005",
               "toto_6": "test.0417.00002.0006",
               "toto_7": "test.0417.00002.0007",
               }
    name = "test.0417.00002"
    gpath = "path_to_genome"
    assert prokkafunc.tbl2lst(tblfile, lstfile, contigs, name, gpath)
    exp_lst = os.path.join(EXP_ANNOTE, "res_create_lst-prokka.lst")
    assert tutil.compare_order_content(exp_lst, lstfile)
def test_write_gene():
    """
    Test that lstinfo line is written as expected when writing info for a gene (CDS).

    Checks both the line returned by ffunc.write_gene and the content of the file it
    wrote to. In the expected line, the '|' characters given in 'inf2' and 'db_xref'
    are replaced by '_'.
    """
    gtype = "CDS"
    locus_num = "5621221"
    gene_name = "abc"
    product = "new product"
    cont_loc = "i"
    genome = "ESCO.0216.00005"
    cont_num = 15
    ecnum = "454.12.5"
    inf2 = "more information... dfd | with | pipe|characters..."
    db_xref = "mydb|pipe"
    strand = "C"
    start = str(154)
    end = str(656)
    lstfile = os.path.join(GENEPATH, "toto.lst")
    # 'with' guarantees that the file is closed (and flushed) before comparing its
    # content, even if write_gene raises.
    with open(lstfile, "w") as lstopenfile:
        lst_line = ffunc.write_gene(gtype, locus_num, gene_name, product,
                                    cont_loc, genome, cont_num, ecnum, inf2,
                                    db_xref, strand, start, end, lstopenfile)
    assert lst_line == (
        "154\t656\tC\tCDS\tESCO.0216.00005.0015i_5621221\tabc\t| new product "
        "| 454.12.5 | more information... dfd _ with _ pipe_characters... | "
        "mydb_pipe")
    exp_file = os.path.join(EXP_ANNOTE, "res_test_write_geneCDS.lst")
    assert tutil.compare_order_content(exp_file, lstfile)
def test_build_bank(caplog):
    """
    Build the protein bank of the 'EXEM' dataset from a list of genomes, using a copy of
    the example Proteins database. The bank must be created in this copied database
    folder, with the expected name and content, and a message must be logged.
    """
    caplog.set_level(logging.DEBUG)
    dataset = "EXEM"
    genomes_list = os.path.join(PATH_TEST_FILES, "list_to_pan.txt")
    # Work on a copy of the example database
    proteins_copy = os.path.join(GENEPATH, "Proteins")
    shutil.copytree(os.path.join(PATH_TEST_FILES, "example_db", "Proteins"), proteins_copy)

    # No specific output directory (spedir=None), quiet mode
    bank = psf.build_prt_bank(genomes_list, proteins_copy, dataset, None, True)

    # Check prt bank filename
    expected_bank = os.path.join(proteins_copy, dataset + ".All.prt")
    assert bank == expected_bank
    # Check content of bank created
    reference = os.path.join(PATH_EXP_FILES, "exp_EXEM.All.prt")
    assert tutil.compare_order_content(reference, expected_bank)
    # Check logs
    assert ("Building bank with all proteins to test/data/pangenome/"
            "generated_by_unit-tests/Proteins/EXEM.All.prt") in caplog.text
def test_run_all_pangenome_panexists_ok(caplog):
    """
    Check that, given a prt bank, and a pangenome file, it says that pangenome file
    already exists, and just reads families from it.

    The pangenome file is copied to the output directory, with the name expected by
    mmseqs.run_all_pangenome, before calling it. As nothing has to be computed, no tmp
    directory must be created.
    """
    caplog.set_level(15)
    min_id = 0.8
    clust_mode = 1
    outdir = os.path.join(GENEPATH, "test_run_allpangenome")
    os.makedirs(outdir)
    prt_path = os.path.join(PATH_EXP_FILES, "exp_EXEM.All.prt")
    threads = 2
    panfile = None
    quiet = False
    # Create pangenome file
    exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst")
    panfile_out = os.path.join(
        outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1-th2.lst")
    shutil.copyfile(exp_pan, panfile_out)
    fams, outfile = mmseqs.run_all_pangenome(min_id, clust_mode, outdir, prt_path,
                                             threads, panfile=panfile, quiet=quiet)

    # check that tmp dir was NOT created (pangenome is read from the existing file)
    tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1-th2")
    assert not os.path.isdir(tmp_dir)
    # check that pangenome file is present with expected name
    assert panfile_out == outfile
    assert os.path.isfile(panfile_out)
    # Check content of output pangenome file
    assert tutil.compare_order_content(exp_pan, outfile)

    # Check families returned in fams dict: family numbers are '1' to '16', and each
    # family must be equal to one of the expected families.
    exp_nums = [str(i) for i in range(1, 17)]
    for num, fam in fams.items():
        assert num in exp_nums
        assert any(fam == expfam for expfam in FAMILIES4G)

    # Check logs
    assert ("Will run MMseqs2 with:\n\t- minimum sequence identity = 80.0%\n"
            "\t- cluster mode 1") in caplog.text
    assert (
        "Pangenome file "
        "test/data/pangenome/generated_by_unit-tests/test_run_allpangenome/PanGenome-exp_EXEM.All.prt-clust-0.8-mode1-th2.lst "
        "already exists. PanACoTA will read it to get families."
    ) in caplog.text
    assert "Reading and getting information from pangenome file" in caplog.text
    assert caplog.records[0].levelname == "INFO"
    assert caplog.records[1].levelname == "WARNING"
    assert caplog.records[2].levelname == "INFO"
def test_main_given_tmp_verbose3(capsys):
    """
    Run annot.main with a tmp folder given by the user, cutn=3 and verbose=3, and check
    that tmp files and the prokka folder are saved in this tmp folder, and that the
    warning about the missing genome file is found in the captured stderr.

    Scenario, as described in the original notes of this test (4 genomes in list file):
    - for 1 genome, toto.fst does not exist, and will not be in the concatenated file
    - 2 concatenated files
    - 4 files to annotate
    - 1 genome with problems: no CDS found
    - 3 genomes in result dirs
    """
    genomes_list = os.path.join(TEST_DIR, "list_genomes-func-test-default.txt")
    user_tmp = os.path.join(GENEPATH, "tmp_funcGivenTmp")
    l90_limit = 10
    expected_info = os.path.join(GENEPATH, "LSTINFO-list_genomes-func-test-default.lst")
    returned = annot.main("cmd", genomes_list, GEN_PATH, GENEPATH, "ESCO", "0417",
                          l90_limit, cutn=3, tmp_dir=user_tmp, verbose=3)
    assert returned == (expected_info, 3)

    # Check the warning found in stderr
    _, stderr_text = capsys.readouterr()
    assert "WARNING" in stderr_text
    assert ("toto.fst genome file does not exist. Its file will be ignored when "
            "concatenating ['A_H738.fasta', 'genome1.fasta', 'toto.fst']") in stderr_text

    # Check that tmp files exist in the right folder: the 2 concatenated files,
    # 6 fna files in total, 4 of them being 'split3N' files
    for concatenated in ("A_H738.fasta-all.fna", "H299_H561.fasta-all.fna"):
        assert os.path.isfile(os.path.join(user_tmp, concatenated))
    assert len(glob.glob(os.path.join(user_tmp, '*.fna'))) == 6
    assert len(glob.glob(os.path.join(user_tmp, '*split3N.fna'))) == 4

    # Check the headers of split contigs (first file), and of the contig of the
    # complete genome (second file), against the expected files
    for split_name in ("A_H738.fasta-all.fna_prokka-split3N.fna",
                       "complete_genome.fna_prokka-split3N.fna"):
        expected = os.path.join(EXP_DIR, "exp_" + split_name)
        generated = os.path.join(user_tmp, split_name)
        assert tutil.compare_order_content(expected, generated)

    # Test that prokka folder is in the right directory: there is one for the A_H738
    # split file, and none for the H299_H561 concatenated file
    assert os.path.isdir(os.path.join(
        user_tmp, "A_H738.fasta-all.fna_prokka-split3N.fna-prokkaRes"))
    assert not os.path.isdir(os.path.join(user_tmp, "H299_H561.fasta-all.fna-prokkaRes"))
def test_format_1pb_prodigal(caplog):
    """
    Format 2 genomes annotated by prodigal: an empty one ('wrong.fasta', with empty
    prodigal result files), and a correct one ('H299_H561.fasta').
    ffunc.format_genomes must return the empty genome in skipped_format, generate all
    expected files for the correct genome, and log the problem.
    """
    caplog.set_level(logging.DEBUG)
    # GENOME 1: empty original genome file, with empty prodigal result files
    bad_genome = "wrong.fasta"
    bad_path = os.path.join(GENEPATH, "wrong.fasta")
    open(bad_path, "w").close()
    bad_prodigal_dir = bad_path + "-prodigalRes"
    os.makedirs(bad_prodigal_dir)
    for res_name in ("toto.gff", "toto.ffn", "toto.faa"):
        open(os.path.join(bad_prodigal_dir, res_name), "w").close()
    # Create output directory for .fna file
    os.makedirs(os.path.join(GENEPATH, "Replicons"))

    # GENOME 2: existing genome. Copy its prodigal results to output dir (GENEPATH)
    ok_genome = "H299_H561.fasta"
    ok_path = os.path.join(ANNOTEDIR, "genomes", ok_genome)
    shutil.copytree(os.path.join(EXP_ANNOTE, ok_genome + '-prodigalRes'),
                    os.path.join(GENEPATH, ok_genome + "-prodigalRes"))

    # genomes = {genome: [name, gpath, to_annot, size, nbcont, l90]}
    genomes = {
        bad_genome: ["test_genome1", bad_path, bad_path, 12656, 3, 1],
        ok_genome: ["test_runprokka_H299", ok_path, ok_path, 456464645, 5, 1],
    }
    # Format both genomes: results and annotation folders are both GENEPATH
    skipped_format = ffunc.format_genomes(genomes, GENEPATH, GENEPATH, True, threads=2)
    assert skipped_format == ["wrong.fasta"]

    # Check that output files are created, and contain what is expected
    reference_dir = os.path.join(EXP_ANNOTE, "res_formatAll", "prodigal")
    outputs = [("LSTINFO", ".lst"), ("Proteins", ".prt"), ("Genes", ".gen"),
               ("Replicons", ".fna"), ("gff3", ".gff")]
    for folder, extension in outputs:
        filename = "test_runprokka_H299" + extension
        expected = os.path.join(reference_dir, folder, filename)
        generated = os.path.join(GENEPATH, folder, filename)
        assert os.path.isfile(generated)
        assert tutil.compare_order_content(generated, expected)

    # Check log
    assert "Formatting all genomes" in caplog.text
    assert (
        "Your genome test/data/annotate/generated_by_unit-tests/wrong.fasta does not "
        "contain any sequence, or is not in fasta format.") in caplog.text
    assert "Problems while generating Replicon file for test_genome1" in caplog.text
def test_format_1genome(caplog):
    """
    Format 1 genome with prokkafunc.format_one_genome, from correct prokka results:
    the Replicons, Proteins, Genes, LSTINFO and gff files must all be generated with
    the expected content.
    """
    caplog.set_level(logging.DEBUG)
    gembase_name = "test.0417.00002"
    # path to original genome, given for annotation
    genome_path = os.path.join(TEST_ANNOTE, "original_name.fna")

    # Create result directories (indexed by the extension of the files they will contain)
    out_dirs = {}
    for ext, folder in [("prt", "Proteins"), ("lst", "LSTINFO"), ("fna", "Replicons"),
                        ("gen", "Genes"), ("gff", "gff")]:
        out_dirs[ext] = os.path.join(GENEPATH, folder)
        os.makedirs(out_dirs[ext])

    formatted = prokkafunc.format_one_genome(genome_path, gembase_name, TEST_ANNOTE,
                                             out_dirs["lst"], out_dirs["prt"],
                                             out_dirs["gen"], out_dirs["fna"],
                                             out_dirs["gff"])
    assert formatted

    # Check output files content: Replicons, Proteins, Genes, LSTINFO, and gff
    expected_files = [
        ("res_created_rep-prokka.fna", "fna"),
        ("res_create_prt_prokka.faa", "prt"),
        ("res_create_gene_prokka.gen", "gen"),
        ("res_create_lst-prokka.lst", "lst"),
        ("res_create_gff-prokka.gff", "gff"),
    ]
    for exp_name, ext in expected_files:
        exp_file = os.path.join(EXP_ANNOTE, exp_name)
        res_file = os.path.join(out_dirs[ext], gembase_name + "." + ext)
        assert tutil.compare_order_content(exp_file, res_file)
def test_run_prodigal_out_doesnt_exist():
    """
    Test that when the output directory does not exist, it creates it, and runs prodigal
    with all expected outfiles.

    After the run, the faa, ffn and gff files must exist with the expected content, and
    3 messages must have been put in the logger queue (start, prodigal command, end).
    """
    logger = my_logger("test_run_prodigal_out_doesnt_exist")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_out_doesnt_exist')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    cores_prodigal = 2
    name = "test_runprodigal_H299"
    force = False
    trn_file = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    nbcont = 3
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # Check content of faa, ffn and gff files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta-prodigalRes", "ESCO.1015.00001")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    out_gff = os.path.join(out_dir, name + ".gff")
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)
    # Check the gff file itself (the ffn file has already been checked above)
    assert os.path.isfile(out_gff)
    assert tutil.compare_order_content(exp_dir + ".gff", out_gff)

    # Check the messages put in the logger queue
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    assert q.get().message == (
        "Prodigal command: prodigal -i test/data/annotate/genomes/"
        "H299_H561.fasta -d test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.ffn -a test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.faa -f gff -o test/data/annotate/"
        "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
        "test_runprodigal_H299.gff -t "
        "test/data/annotate/test_files/A_H738-and-B2_A3_5.fna.trn "
        "-q")
    assert q.get().message.startswith("End annotating")
def test_run_prodigal_small():
    """
    Test that when the output directory does not exist, it creates it, and runs prodigal
    with all expected outfiles.
    Here, we run prodigal with --small option (on a small genome): "small option" is
    given instead of a training file, and the logged prodigal command must contain
    '-p meta'.
    """
    logger = my_logger("test_run_prodigal_small")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_small')
    gpath = os.path.join(GEN_PATH, "H299_H561.fasta")
    out_dir = os.path.join(GENEPATH, "H299_H561.fasta-prodigalRes")
    cores_prodigal = 2
    name = "test_runprodigal_small_H299"
    force = False
    trn_file = "small option"
    nbcont = 3
    arguments = (gpath, GENEPATH, cores_prodigal, name, force, nbcont, trn_file, logger[0])
    assert afunc.run_prodigal(arguments)

    # Check content of faa, ffn and gff files
    exp_dir = os.path.join(EXP_DIR, "H299_H561.fasta_small-prodigalRes",
                           "test_runprodigal_small_H299")
    out_faa = os.path.join(out_dir, name + ".faa")
    out_ffn = os.path.join(out_dir, name + ".ffn")
    out_gff = os.path.join(out_dir, name + ".gff")
    assert os.path.isfile(out_faa)
    assert tutil.compare_order_content(exp_dir + ".faa", out_faa)
    assert os.path.isfile(out_ffn)
    assert tutil.compare_order_content(exp_dir + ".ffn", out_ffn)
    # Check the gff file itself (the ffn file has already been checked above)
    assert os.path.isfile(out_gff)
    assert tutil.compare_order_content(exp_dir + ".gff", out_gff)

    # Check logs
    q = logger[0]
    assert q.qsize() == 3
    assert q.get().message.startswith("Start annotating")
    prodigal_cmd = q.get().message
    assert ("Prodigal command: prodigal -i test/data/annotate/genomes/"
            "H299_H561.fasta -d test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.ffn -a test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.faa -f gff -o test/data/annotate/"
            "generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
            "test_runprodigal_small_H299.gff -p meta -q") in prodigal_cmd
    assert q.get().message.startswith("End annotating")
def test_create_prt(caplog):
    """
    Generate a prt file from a prodigal faa file and an existing lst file, and
    check that its content matches the expected prt file.
    """
    caplog.set_level(logging.DEBUG)

    # Inputs: protein file from prodigal + lst file describing the genes
    faa_in = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes",
                          "prodigal.outtest.ok.faa")
    lst_in = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")

    # Output generated by the function under test
    prt_out = os.path.join(GENEPATH, "prodigal_res.prt")
    assert prodigalfunc.create_prt(faa_in, prt_out, lst_in)

    # Compare generated file with reference
    prt_ref = os.path.join(EXP_ANNOTE, "res_create_prt_prodigal.faa")
    assert tutil.compare_order_content(prt_ref, prt_out)
def test_create_gen_missingSeq(caplog):
    """
    Check create gen file. A gene in lst does not have a sequence in ffn.
    Just skip it, and go to next sequence for gen file.
    """
    caplog.set_level(logging.DEBUG)
    # Input ffn file lacks the sequence of one gene listed in the lst file
    ffnfile = os.path.join(TEST_ANNOTE, "prokka_out_for_test-noSeqFor1gene.ffn")
    lstfile = os.path.join(EXP_ANNOTE, "res_create_lst-prokka.lst")
    res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
    assert prokkafunc.create_gen(ffnfile, lstfile, res_gen_file)
    # Generated gen file must match the reference where the gene is missing
    exp_gen = os.path.join(EXP_ANNOTE, "res_create_gene_prokka-missGene.gen")
    assert tutil.compare_order_content(exp_gen, res_gen_file)
def test_create_gen(caplog):
    """
    Check create gen file: from a prokka ffn file and a lst file, the generated
    gen file must have the expected content.
    """
    caplog.set_level(logging.DEBUG)
    ffnfile = os.path.join(TEST_ANNOTE, "original_name.fna-prokkaRes",
                           "prokka_out_for_test.ffn")
    lstfile = os.path.join(EXP_ANNOTE, "res_create_lst-prokka.lst")
    res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
    assert prokkafunc.create_gen(ffnfile, lstfile, res_gen_file)
    # Compare generated gen file with the reference one
    exp_gen = os.path.join(EXP_ANNOTE, "res_create_gene_prokka.gen")
    assert tutil.compare_order_content(exp_gen, res_gen_file)
def test_all_post():
    """
    Run the main post-treatment method, and check that the 3 expected output
    files (quali matrix, quanti matrix, summary) exist with the expected content.
    Also check that the binary pangenome file has been written.
    """
    pan_prefix = os.path.join(GENEPATH, "test_all_post")
    post.post_treat(FAMILIES, pan_prefix)

    # (suffix of generated file, reference file) for each of the 3 text outputs
    expected_outputs = (
        (".quali.txt", EXP_QUALIF),
        (".quanti.txt", EXP_QUANTIF),
        (".summary.txt", EXP_SUMF),
    )
    for suffix, reference in expected_outputs:
        generated = pan_prefix + suffix
        # Check presence, then content
        assert os.path.isfile(generated)
        assert tutil.compare_order_content(generated, reference)

    # Check that bin pangenome file was created (as it did not exist before)
    assert os.path.isfile(pan_prefix + ".bin")
def test_write_pers():
    """
    Write a set of persistent families to a file, and check that the file
    content is the expected one.
    """
    # Family number -> members. Family numbers and members are given unsorted on purpose.
    families = {
        9: ["member_3", "member_12", "other_member_2"],
        3: ["member_10", "member_100", "member_1"],
        10: ["member_1", "member_2", "member_3"],
        1: ["my_protein_3", "my_protein_12", "my_protein_2"],
        5: [
            "ESCO.1216.00003.i001_01001",
            "SAEN.0215.00003.i009_00001",
            "ESCO.1017.00003.b001_00001",
            "ESCO.0812.00002.i002_02000",
            "ESCO.0812.00003.i002_02000",
        ],
    }
    written = os.path.join(GENEPATH, "test-persistent_families.txt")
    persf.write_persistent(families, written)

    reference = os.path.join(EXP_PATH, "exp_persgenome1.txt")
    assert tutils.compare_order_content(written, reference)
def test_open_out():
    """
    Check that given some families and a pangenome file, it creates 3 output files,
    with the expected content (quanti, quali, summary), and returns the expected
    quali, quanti and summary values.
    """
    pan_file = os.path.join(GENEPATH, "test_open_out_pangenome.txt")
    qualis, quantis, sums = post.open_outputs_to_write(FAMS_BY_STRAIN, FAMILIES,
                                                       ALL_STRAINS, pan_file)

    # Check function output
    assert qualis == EXP_QUALIS
    assert quantis == EXP_QUANTIS
    assert sums == EXP_SUMS

    # Check presence and content of each output file:
    # quali matrix, quanti matrix, and summary
    for suffix, reference in ((".quali.txt", EXP_QUALIF),
                              (".quanti.txt", EXP_QUANTIF),
                              (".summary.txt", EXP_SUMF)):
        generated = pan_file + suffix
        assert os.path.isfile(generated)
        assert tutil.compare_order_content(generated, reference)
def test_only_mash(capsys):
    """
    Running only mash step (giving genomes and corresponding LSTINFO file).

    Checks the returned path of the filtered LSTINFO file, the messages written to
    stdout/stderr, that the tmp_files directory is empty, the number of log files,
    and the content of the output LSTINFO file.
    """
    # NCBI-related parameters: all empty, as nothing is downloaded in this test
    species_name = ""
    species_taxid = ""
    taxid = ""
    strains = ""
    levels = ""
    section = "refseq"
    # General parameters
    outdir = GENEPATH
    tmp_dir = ""
    threads = 1
    norefseq = False
    db_dir = ""
    only_mash = True
    info_file = os.path.join(TEST_DIR, "test_lstinfo_onlymash.lst")
    # Quality control and mash filtering thresholds
    l90 = 100
    nbcont = 999
    cutn = 5
    min_dist = 1e-4
    max_dist = 0.06
    verbose = 1
    quiet = False

    out_info_file = os.path.join(outdir, "LSTINFO-NA-filtered-0.0001_0.06.txt")
    returned = prepare.main("cmd", species_name, species_taxid, taxid, strains, levels,
                            section, outdir, tmp_dir, threads, norefseq, db_dir,
                            only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist,
                            verbose, quiet)
    assert returned == out_info_file

    # Check messages on stderr and stdout
    out, err = capsys.readouterr()
    assert "You asked to run only mash steps" in err
    assert ("You want to run only mash steps. Getting information from "
            "test/data/prepare/test_files/test_lstinfo_onlymash.lst") in out
    assert "Found 5 genomes in total" in out
    assert "Computing pairwise distances between all genomes" in out
    assert "Sorting all 5 genomes by quality" in out
    assert "Final number of genomes in dataset: 1" in out

    # Check output files: tmp_files directory must be empty
    tmp_content = os.listdir(os.path.join(outdir, "tmp_files"))
    assert len(tmp_content) == 0

    # Check logfiles are here
    log_files = glob.glob(os.path.join(outdir, "*log*"))
    assert len(log_files) == 3

    # Check content of output lstinfo file (same path as the one returned by main)
    exp_lst = os.path.join(DBDIR, "exp_files", "exp_lstinfo_run_only-mash.lst")
    assert tutil.compare_order_content(out_info_file, exp_lst)
def test_group_by_genome(caplog):
    """
    Given a file with all proteins aligned, a list of genomes, and an output
    filename, check that the alignment grouped by genome is written to the output
    file, that the function returns a true value, and that the expected messages
    are logged.
    """
    caplog.set_level(logging.DEBUG)
    aln_in = os.path.join(TESTPATH, "complete.cat.fictive4genomes.aln")
    genomes = ["GEN2.1017.00001", "GEN4.1111.00001",
               "GENO.1017.00001", "GENO.1216.00002"]
    grp_out = os.path.join(GENEPATH, "test_group_by_genome")

    # The function takes all its parameters packed in a single tuple
    assert pal.group_by_genome((genomes, aln_in, grp_out))

    # Check content of generated file
    grp_ref = os.path.join(EXPPATH, "exp_fictive.grp.aln")
    assert tutil.compare_order_content(grp_out, grp_ref)

    # Check logs
    logs = caplog.text
    assert "3 sequences found per genome" in logs
    assert "Writing alignments per genome" in logs
def test_build_bank_spedir_quiet(caplog):
    """
    Build a protein bank from a list of genomes, in a given output directory.
    Check the returned filename, the content of the bank, and the log message.
    """
    caplog.set_level(logging.DEBUG)
    list_file = os.path.join(PATH_TEST_FILES, "list_to_pan.txt")
    prot_dir = os.path.join(PATH_TEST_FILES, "example_db", "Proteins")
    dataset = "EXEM"
    out_dir = os.path.join(GENEPATH, "test_build_prt", "toto")
    # NOTE(review): test name says 'quiet' but quiet is False here - confirm intent
    quiet = False
    bank = psf.build_prt_bank(list_file, prot_dir, dataset, out_dir, quiet)

    # Bank must be created in the given directory, named after the dataset
    expected_path = os.path.join(out_dir, dataset + ".All.prt")
    assert bank == expected_path
    reference = os.path.join(PATH_EXP_FILES, "exp_EXEM.All.prt")
    assert tutil.compare_order_content(reference, expected_path)

    # Check logs
    assert ("Building bank with all proteins to test/data/pangenome/"
            "generated_by_unit-tests/test_build_prt/toto/EXEM.All.prt") in caplog.text