def test_taxids(self):
    """
    Test ganon report restricted to the taxids given in params["taxids"]
    """
    taxid = "1224"
    params = self.default_params.copy()
    params["output_prefix"] = self.results_dir + "test_taxids"
    params["taxids"] = [taxid]

    # Configure and execute the report
    cfg = Config("report", **params)
    report_ok = ganon.main(cfg=cfg)
    self.assertTrue(report_ok, "ganon report exited with an error")

    # Parse the outputs and make sure they are consistent
    res = report_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon report has inconsistent results")

    # Every row not flagged by res["idx_base"] must contain the requested
    # taxid somewhere in its lineage string
    reported = res["tre_pd"][~res["idx_base"]]
    in_lineage = reported["lineage"].str.contains(params["taxids"][0])
    self.assertTrue(in_lineage.all(),
                    "ganon report did not filter by taxids")
def test_min_frequency_perc(self):
    """
    Test ganon table with a fractional --min-frequency (value below 1)
    """
    out_file = self.results_dir + "test_min_frequency_perc.tsv"
    params = self.default_params.copy()
    params["output_file"] = out_file
    params["min_frequency"] = 0.9
    # Fusobacteria is left out from report_reads3.tre
    params["rank"] = "phylum"

    # Configure and execute the table command
    cfg = Config("table", **params)
    table_ok = ganon.main(cfg=cfg)
    self.assertTrue(table_ok, "ganon table exited with an error")

    # Parse the outputs and make sure they are consistent
    res = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon table has inconsistent results")

    # No zero entries may remain (3 samples/3 min frequency)
    values = res["out_pd"].values
    self.assertTrue((values > 0).all(),
                    "ganon table min frequency filter failed")
def test_max_count_perc(self):
    """
    Test ganon table with --max-count below 1

    Runs the table command with output_value "percentage" and checks that
    every value in the resulting table is either zero or not above
    params["max_count"].
    """
    params = self.default_params.copy()
    params["output_file"] = self.results_dir + "test_max_count_perc.tsv"
    params["output_value"] = "percentage"
    params["max_count"] = 0.02

    # Build config from params
    cfg = Config("table", **params)
    # Run
    self.assertTrue(ganon.main(cfg=cfg), "ganon table exited with an error")

    # General sanity check of results
    res = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon table has inconsistent results")

    # should output just values up to max_count (or zeros)
    out_pd = res["out_pd"]
    within_limit = (out_pd == 0) | (out_pd <= params["max_count"])
    # The message names the filter actually under test (max count); it
    # previously reported a "min count" failure.
    self.assertTrue(within_limit.all(axis=None),
                    "ganon table max count filter failed")
def test_bin_fragment_overlap_length(self):
    """
    Test changing bin, fragment and overlap length

    After the build, checks that no fragment in the bins is longer than
    fragment_length + overlap_length and that the summed fragment length of
    each bin does not exceed bin_length.
    """
    params = self.default_params.copy()
    params["db_prefix"] = self.results_dir + "test_bin_fragment_overlap_length"
    params["bin_length"] = 5692
    params["fragment_length"] = 667
    params["overlap_length"] = 349

    # Build config from params
    cfg = Config("build", **params)
    # Run
    self.assertTrue(ganon.main(cfg=cfg), "ganon build exited with an error")

    # General sanity check of results
    res = build_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon build has inconsistent results")

    # Specific test
    bins = res["bins_pd"]

    # Check max size of fragments on bins
    max_fragment = params["fragment_length"] + params["overlap_length"]
    self.assertTrue(bins["length"].max() <= max_fragment,
                    "Fragment greater than max.")

    # Check max size of bins. Select "length" before aggregating so that
    # only that column is summed (the other columns, e.g. the string
    # "seqid", are not needed and may not be summable).
    bin_totals = bins.groupby("binid")["length"].sum()
    self.assertTrue(bin_totals.max() <= params["bin_length"],
                    "Bin length greater than max.")
def test_multiple_rep_files(self):
    """
    Test ganon report fed with more than one .rep file
    """
    rep_files = [
        data_dir + "report/results.rep",
        data_dir + "report/results2.rep"
    ]
    params = self.default_params.copy()
    params["rep_files"] = rep_files
    params["output_prefix"] = self.results_dir + "test_multiple_rep_files_"

    # Configure and execute the report
    cfg = Config("report", **params)
    report_ok = ganon.main(cfg=cfg)
    self.assertTrue(report_ok, "ganon report exited with an error")

    # Parse the outputs and make sure they are consistent
    res = report_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon report has inconsistent results")

    # One parsed result is expected per input .rep file
    expected_outputs = len(params["rep_files"])
    self.assertEqual(len(res), expected_outputs,
                     "ganon report did not generate multiple report files")
def test_specialization_file(self):
    """
    ganon build --specialization file (online: eutils)
    """
    failure_msg = "failed to use file name as specialization"
    params = self.default_params.copy()
    params["db_prefix"] = self.results_dir + "test_specialization_file"
    params["specialization"] = "file"

    # Configure and execute the build
    cfg = Config("build", **params)
    build_ok = ganon.main(cfg=cfg)
    self.assertTrue(build_ok, "ganon build exited with an error")

    # Parse the outputs and make sure they are consistent
    res = build_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon build has inconsistent results")

    # The taxonomy must contain exactly 4 entries with rank "file"
    file_rank_entries = sum(res["tax_pd"]["rank"] == "file")
    self.assertEqual(file_rank_entries, 4, failure_msg)

    # Every target in the map must end with ".fasta.gz"
    targets = res["map_pd"]["target"]
    self.assertTrue(all(t.endswith(".fasta.gz") for t in targets),
                    failure_msg)
def test_duplicated_input_files(self):
    """
    ganon build with duplicated input files.

    The input file list is repeated 4 times; ganon-build will process all
    input files, but bins should be correct (no repeated entries).
    """
    params = self.default_params.copy()
    params["db_prefix"] = self.results_dir + "test_duplicated_input_files"
    params["input_files"] = params["input_files"] * 4

    # Configure and execute the build
    cfg = Config("build", **params)
    build_ok = ganon.main(cfg=cfg)
    self.assertTrue(build_ok, "ganon build exited with an error")

    # Parse the outputs and make sure they are consistent
    res = build_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon build has inconsistent results")

    # Each (seqid, seqstart, seqend) combination may appear only once
    bin_keys = res["bins_pd"][["seqid", "seqstart", "seqend"]]
    has_repeats = bin_keys.duplicated().any()
    self.assertTrue(not has_repeats,
                    "Duplicated entries of repeated sequences on bins")
def test_specialization_file_on_custom(self):
    """
    ganon update --specialization file on top of an index previously
    generated with --specialization custom (online: eutils)
    """
    params = self.default_params.copy()
    params["db_prefix"] = data_dir + "bacteria_custom"
    params["output_db_prefix"] = (
        self.results_dir + "test_specialization_file_on_custom")
    params["specialization"] = "file"

    # Configure and execute the update
    cfg = Config("update", **params)
    update_ok = ganon.main(cfg=cfg)
    self.assertTrue(update_ok, "ganon update exited with an error")

    # Parse the outputs and make sure they are consistent
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon update has inconsistent results")

    # Count specialization entries on tax: 3 with rank "custom" (bac)
    # plus 4 with rank "file" (vir)
    tax_ranks = res["tax_pd"]["rank"]
    self.assertEqual(sum(tax_ranks == "custom"), 3, "error updating")
    self.assertEqual(sum(tax_ranks == "file"), 4, "error updating")
def test_duplicated_seqinfo(self):
    """
    ganon build with a --seq-info-file that contains duplicated entries
    """
    params = self.default_params.copy()
    params["db_prefix"] = self.results_dir + "test_duplicated_seqinfo"
    params["seq_info_file"] = (
        data_dir + "build/bacteria_seqinfo_duplicated.txt")

    # Configure and execute the build
    cfg = Config("build", **params)
    build_ok = ganon.main(cfg=cfg)
    self.assertTrue(build_ok, "ganon build exited with an error")

    # Parse the outputs and make sure they are consistent
    res = build_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon build has inconsistent results")

    # ganon should drop the duplicates: removing repeated
    # (seqid, seqstart, seqend) rows must not shrink the bins table
    bin_keys = res["bins_pd"][["seqid", "seqstart", "seqend"]]
    unique_keys = bin_keys.drop_duplicates()
    self.assertTrue(len(unique_keys) == len(bin_keys),
                    "Duplicated entries on bins")
def test_overlap_length(self):
    """
    Test ganon build with a non-default overlap length
    """
    params = self.default_params.copy()
    params["db_prefix"] = self.results_dir + "test_overlap_length"
    params["bin_length"] = 10000
    params["overlap_length"] = 999

    # Configure and execute the build
    cfg = Config("build", **params)
    build_ok = ganon.main(cfg=cfg)
    self.assertTrue(build_ok, "ganon build exited with an error")

    # Parse the outputs and make sure they are consistent
    res = build_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon build has inconsistent results")

    # Specific test
    # The longest fragment on the bins may not exceed bin + overlap length
    longest_fragment = max(res["bins_pd"]["length"])
    upper_bound = params["bin_length"] + params["overlap_length"]
    self.assertTrue(longest_fragment <= upper_bound,
                    "Fragment bigger than max. set")
def test_matches(self):
    """
    Test ganon table with report type "matches" from ganon report

    Uses three report_matches*.tre inputs with an unclassified label set and
    checks that the resulting "unclassified" column sums to zero.
    """
    params = self.default_params.copy()
    params["output_file"] = self.results_dir + "test_matches.tsv"
    params["tre_files"] = [
        data_dir + "table/report_matches1.tre",
        data_dir + "table/report_matches2.tre",
        data_dir + "table/report_matches3.tre"
    ]
    params["unclassified_label"] = "unclassified"

    # Build config from params
    cfg = Config("table", **params)
    # Run
    self.assertTrue(ganon.main(cfg=cfg), "ganon table exited with an error")

    # General sanity check of results
    res = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon table has inconsistent results")

    # should have unclassified summing to 0 (no unclassified line reported).
    # The message describes this check; it previously carried a copy-pasted
    # "min frequency filter" text.
    self.assertEqual(
        res["out_pd"]["unclassified"].sum(), 0,
        "ganon table reported unclassified values for matches reports")
def test_na_ranks(self):
    """
    Test ganon report including the "na" rank (reporting missing taxa)
    """
    params = self.default_params.copy()
    params["output_prefix"] = self.results_dir + "test_na_ranks"
    params["db_prefix"] = ""
    params["ranks"] = ["genus", "species", "na"]
    params["taxdump_file"] = [
        data_dir + "mini_nodes.dmp",
        data_dir + "mini_names.dmp"
    ]

    # Configure and execute the report
    cfg = Config("report", **params)
    report_ok = ganon.main(cfg=cfg)
    self.assertTrue(report_ok, "ganon report exited with an error")

    # Parse the outputs and make sure they are consistent
    res = report_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon report has inconsistent results")

    # Rows not flagged by res["idx_base"] may only use the selected ranks
    reported = res["tre_pd"][~res["idx_base"]]
    rank_selected = reported["rank"].isin(params["ranks"])
    self.assertTrue(rank_selected.all(),
                    "ganon report did not report the correct ranks")
def test_extra_cols(self):
    """
    Test ganon table with --unclassified-label and --filtered-label

    Four consecutive runs on the same params: both labels set, only the
    unclassified label, only the filtered label, and both labels sharing
    the same name (reported together in one column).
    """
    run_failed = "ganon table exited with an error"
    inconsistent = "ganon table has inconsistent results"
    cols_failed = "ganon table extra cols failed"

    # Run 1: both labels
    params = self.default_params.copy()
    params["output_file"] = self.results_dir + "test_extra_cols1.tsv"
    params["min_count"] = 0.02
    params["unclassified_label"] = "UNC"
    params["filtered_label"] = "FIL"
    params["rank"] = "genus"
    cfg = Config("table", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res1 = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res1, inconsistent)
    # last 2 cols should be the fixed unclassified and filtered
    last_two = res1["out_pd"].columns.values[-2:]
    self.assertTrue(set(last_two).issubset({"UNC", "FIL"}), cols_failed)

    # Run 2: unclassified label only
    params["output_file"] = self.results_dir + "test_extra_cols2.tsv"
    params["unclassified_label"] = "UNC"
    params["filtered_label"] = None
    cfg = Config("table", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res2 = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res2, inconsistent)
    cols2 = res2["out_pd"].columns.values
    # last col should be the fixed unclassified
    self.assertEqual(cols2[-1], "UNC", cols_failed)
    # should not have the filtered
    self.assertFalse("FIL" in cols2, cols_failed)

    # Run 3: filtered label only
    params["output_file"] = self.results_dir + "test_extra_cols3.tsv"
    params["unclassified_label"] = None
    params["filtered_label"] = "FIL"
    cfg = Config("table", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res3 = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res3, inconsistent)
    cols3 = res3["out_pd"].columns.values
    # last col should be the fixed filtered
    self.assertEqual(cols3[-1], "FIL", cols_failed)
    # should not have the unclassified
    self.assertFalse("UNC" in cols3, cols_failed)

    # Run 4: SAME LABEL, report together
    params["output_file"] = self.results_dir + "test_extra_cols4.tsv"
    params["unclassified_label"] = "UNASSIGNED"
    params["filtered_label"] = "UNASSIGNED"
    cfg = Config("table", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res4 = table_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res4, inconsistent)
    # last col should be the fixed label
    self.assertEqual(res4["out_pd"].columns.values[-1], "UNASSIGNED",
                     cols_failed)
    # when reporting together, the shared column should match the row-wise
    # sum of the separate columns from the first run
    separate_sum = res1["out_pd"][["UNC", "FIL"]].sum(axis=1)
    together = res4["out_pd"]["UNASSIGNED"]
    self.assertTrue(all(separate_sum == together), cols_failed)
def test_update_multiple(self):
    """
    Test multiple update runs, each one starting from the previous output:
    1) only remove
    2) only add (reusing bins)
    3) remove and add
    A final classify run checks that no bacteria remain on the index.
    """
    run_failed = "ganon update exited with an error"
    inconsistent = "ganon update has inconsistent results"

    # 1) Remove only (2 bacteria entries)
    params = self.default_params.copy()
    params["output_db_prefix"] = self.results_dir + "test_update_multiple_1"
    params["update_complete"] = True
    params["seq_info_file"] = data_dir + "update/bacteria_half_seqinfo.txt"
    params["input_files"] = [
        data_dir + "build/bacteria_NC_010333.1.fasta.gz",
        data_dir + "build/bacteria_NC_017164.1.fasta.gz"
    ]
    cfg = Config("update", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, inconsistent)

    # Specific - keep only two entries
    remaining_seqids = res["bins_pd"]["seqid"].drop_duplicates()
    self.assertEqual(remaining_seqids.shape[0], 2,
                     "sequences not removed from bins")
    remaining_targets = res["map_pd"]["target"].drop_duplicates()
    self.assertEqual(remaining_targets.shape[0], 2,
                     "sequences not removed from .map")

    # 2) Add only (2 viruses entries, not update complete)
    params["db_prefix"] = params["output_db_prefix"]
    params["update_complete"] = False
    params["output_db_prefix"] = self.results_dir + "test_update_multiple_2"
    params["seq_info_file"] = data_dir + "update/virus_part1_seqinfo.txt"
    params["input_files"] = [
        data_dir + "update/virus_NC_003676.1.fasta.gz",
        data_dir + "update/virus_NC_011646.1.fasta.gz"
    ]
    # Copy seqinfo to be parsed later
    seqinfo_copy = params["db_prefix"] + ".seqinfo.txt"
    shutil.copy(params["seq_info_file"], seqinfo_copy)
    cfg = Config("update", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, inconsistent)

    # 3) Add viruses and remove all bacteria
    params["db_prefix"] = params["output_db_prefix"]
    params["update_complete"] = True
    params["output_db_prefix"] = self.results_dir + "test_update_multiple_3"
    params["seq_info_file"] = data_dir + "update/virus_seqinfo.txt"
    params["input_files"] = [
        data_dir + "update/virus_NC_003676.1.fasta.gz",
        data_dir + "update/virus_NC_011646.1.fasta.gz",
        data_dir + "update/virus_NC_032412.1.fasta.gz",
        data_dir + "update/virus_NC_035470.1.fasta.gz"
    ]
    cfg = Config("update", **params)
    self.assertTrue(ganon.main(cfg=cfg), run_failed)
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, inconsistent)

    # At the end, should have only viruses on the index
    params_classify = {
        "db_prefix": params["output_db_prefix"],
        "single_reads": [data_dir + "vir.sim.1.fq",
                         data_dir + "bac.sim.1.fq"],
        "max_error": 0,
        "output_all": True,
        "quiet": True,
        "output_prefix": self.results_dir + "test_update_multiple_classify"
    }
    cfg_classify = Config("classify", **params_classify)
    self.assertTrue(ganon.main(cfg=cfg_classify),
                    "ganon classify exited with an error")
    res = classify_sanity_check_and_parse(vars(cfg_classify))
    self.assertIsNotNone(res, "ganon classify has inconsistent results")

    # should not contain any bacteria among the superkingdom rows
    tre = res["tre_pd"]
    superkingdom_names = tre[tre["rank"] == "superkingdom"]["name"]
    self.assertFalse(
        superkingdom_names.isin(["Bacteria"]).any(),
        "index was not properly updated, bacteria sequences remain")
def test_add_existing_bins(self):
    """
    Test update without creating new bins

    Steps:
    1) build an index from 2 virus sequences at superkingdom level with
       large bins (bin_length 200000);
    2) update it with 2 more virus and 4 bacteria sequences, checking that
       the virus taxid (10239) stays on bin 0 and only two taxids exist;
    3) classify the same reads against the index from 1) and from 2) and
       compare the reported superkingdoms and the number of matches.
    """
    # Build database with 2 sequences at superkingdom level with large bins
    params_build = {
        "db_prefix": self.results_dir + "test_add_existing_bins_part1",
        "taxdump_file": [data_dir + "mini_nodes.dmp",
                         data_dir + "mini_names.dmp"],
        "input_files": [
            data_dir + "update/virus_NC_003676.1.fasta.gz",
            data_dir + "update/virus_NC_011646.1.fasta.gz"
        ],
        "seq_info_file": data_dir + "update/virus_part1_seqinfo.txt",
        "write_seq_info_file": True,
        "rank": "superkingdom",
        "bin_length": 200000,
        "quiet": True
    }
    # Build config from params
    cfg_build = Config("build", **params_build)
    # Run (this step is a build, so the message says so)
    self.assertTrue(ganon.main(cfg=cfg_build),
                    "ganon build exited with an error")
    # General sanity check of results; the parsed build result is verified
    # here so an inconsistent build fails at this point and not later
    res_build = build_sanity_check_and_parse(vars(cfg_build))
    self.assertIsNotNone(res_build, "ganon build has inconsistent results")
    # Copy seqinfo to be parsed later
    shutil.copy(params_build["seq_info_file"],
                params_build["db_prefix"] + ".seqinfo.txt")

    # Update with part2 - add virus to same bin and bacteria to new bins
    params = self.default_params.copy()
    params["db_prefix"] = params_build["db_prefix"]
    params["output_db_prefix"] = (
        self.results_dir + "test_add_existing_bins_part2")
    params["input_files"] = [
        data_dir + "update/virus_NC_032412.1.fasta.gz",
        data_dir + "update/virus_NC_035470.1.fasta.gz",
        data_dir + "build/bacteria_NC_010333.1.fasta.gz",
        data_dir + "build/bacteria_NC_017164.1.fasta.gz",
        data_dir + "build/bacteria_NC_017163.1.fasta.gz",
        data_dir + "build/bacteria_NC_017543.1.fasta.gz"
    ]
    params["seq_info_file"] = (
        data_dir + "update/bacteria_virus_part2_seqinfo.txt")

    # Build config from params
    cfg = Config("update", **params)
    # Run
    self.assertTrue(ganon.main(cfg=cfg), "ganon update exited with an error")
    # General sanity check of results
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon update has inconsistent results")

    # Check if new virus (10239) were added to the same bin 0
    bins = res["bins_pd"]
    virus_bins = bins[bins["taxid"] == "10239"].binid.drop_duplicates()
    self.assertEqual(virus_bins.max(), 0, "virus added to new bins")
    # Check if entries are unique (virus 10239, bacteria 2)
    self.assertEqual(bins.taxid.drop_duplicates().size, 2, "multiple taxids")

    # Classify to first part
    params_classify = {
        "db_prefix": params_build["db_prefix"],
        "single_reads": [data_dir + "vir.sim.1.fq",
                         data_dir + "bac.sim.1.fq"],
        "max_error": 0,
        "output_all": True,
        "quiet": True,
        "output_prefix":
            self.results_dir + "test_add_existing_bins_classify_part1"
    }
    # Build config from params
    cfg_classify = Config("classify", **params_classify)
    # Run
    self.assertTrue(ganon.main(cfg=cfg_classify),
                    "ganon classify exited with an error")
    # General sanity check of results. The classify result itself is
    # checked (the update result "res" was asserted here before, which
    # could never fail at this point).
    res_classify1 = classify_sanity_check_and_parse(vars(cfg_classify))
    self.assertIsNotNone(res_classify1,
                         "ganon classify has inconsistent results")
    # Specific test - should contain only one superkingdom (virus 10239)
    tre1 = res_classify1["tre_pd"]
    targets1 = tre1[tre1["rank"] == "superkingdom"]["target"]
    self.assertEqual(targets1.shape[0], 1,
                     "more than one superkingdom as target")
    self.assertEqual(targets1.values[0], '10239', "wrong target")

    # Classify to second part
    params_classify["db_prefix"] = params["output_db_prefix"]
    params_classify["output_prefix"] = (
        self.results_dir + "test_add_existing_bins_classify_part2")
    # Build config from params
    cfg_classify = Config("classify", **params_classify)
    # Run
    self.assertTrue(ganon.main(cfg=cfg_classify),
                    "ganon classify exited with an error")
    # General sanity check of results (again on the classify result)
    res_classify2 = classify_sanity_check_and_parse(vars(cfg_classify))
    self.assertIsNotNone(res_classify2,
                         "ganon classify has inconsistent results")

    # Specific
    # Classification to the second updated index has to have more matches
    # than the first
    self.assertTrue(
        res_classify2["all_pd"].shape[0] > res_classify1["all_pd"].shape[0],
        "updated index did not improve matches")
    # should contain only two superkingdoms (virus 10239, bacteria 2)
    tre2 = res_classify2["tre_pd"]
    targets2 = tre2[tre2["rank"] == "superkingdom"]["target"]
    self.assertEqual(targets2.shape[0], 2,
                     "more than two superkingdom as target")
    self.assertTrue(targets2.isin(['10239', '2']).all(), "wrong target")
def test_minimizers(self):
    """
    ganon update with minimizers

    Builds an index from 4 bacteria sequences with window_size 27, updates
    it using the default update params, checks that bins were added and
    then classifies simulated virus and bacteria reads against the updated
    index.
    """
    params_build = {
        "taxdump_file": [data_dir + "mini_nodes.dmp",
                         data_dir + "mini_names.dmp"],
        "input_files": [
            data_dir + "build/bacteria_NC_010333.1.fasta.gz",
            data_dir + "build/bacteria_NC_017164.1.fasta.gz",
            data_dir + "build/bacteria_NC_017163.1.fasta.gz",
            data_dir + "build/bacteria_NC_017543.1.fasta.gz"
        ],
        "seq_info_file": data_dir + "build/bacteria_seqinfo.txt",
        "write_seq_info_file": True,
        "rank": "species",
        "window_size": 27,
        "quiet": True
    }
    params_build["db_prefix"] = self.results_dir + "test_minimizers_build"
    # Build config from params
    cfg_build = Config("build", **params_build)
    # Run
    self.assertTrue(ganon.main(cfg=cfg_build),
                    "ganon build exited with an error")
    # General sanity check of results
    res_build = build_sanity_check_and_parse(vars(cfg_build))
    self.assertIsNotNone(res_build, "ganon build has inconsistent results")
    # Copy seqinfo next to the built index
    # NOTE(review): presumably needed by the update sanity check - confirm
    shutil.copy(params_build["seq_info_file"],
                params_build["db_prefix"] + ".seqinfo.txt")

    params = self.default_params.copy()
    params["db_prefix"] = params_build["db_prefix"]
    params["output_db_prefix"] = self.results_dir + "test_minimizers_update"

    # Build config from params
    cfg = Config("update", **params)
    # Run
    self.assertTrue(ganon.main(cfg=cfg), "ganon update exited with an error")
    # General sanity check of results
    res = update_sanity_check_and_parse(vars(cfg))
    self.assertIsNotNone(res, "ganon update has inconsistent results")
    # Specific - check if number of bins increased
    # NOTE(review): 41 is presumably the highest binid of the initial
    # build - confirm against the build data
    self.assertTrue(res["map_pd"].binid.max() > 41, "no bins were added")

    # Classify simulated virus and bacteria reads against updated index
    params_classify = {
        "db_prefix": params["output_db_prefix"],
        "single_reads": [data_dir + "vir.sim.1.fq",
                         data_dir + "bac.sim.1.fq"],
        "rel_cutoff": 0,
        "rel_filter": 1,
        "output_lca": True,
        "output_all": True,
        "quiet": True,
        # Test-specific prefix: the copy-pasted "test_default" prefix could
        # overwrite the classify outputs of another test
        "output_prefix": self.results_dir + "test_minimizers_classify"
    }
    # Build config from params
    cfg_classify = Config("classify", **params_classify)
    # Run
    self.assertTrue(ganon.main(cfg=cfg_classify),
                    "ganon classify exited with an error")
    # General sanity check of results
    res = classify_sanity_check_and_parse(vars(cfg_classify))
    self.assertIsNotNone(res, "ganon classify has inconsistent results")
    # Specific test - superkingdom rows on the updated index may only be
    # Bacteria or Viruses
    tre = res["tre_pd"]
    superkingdom_names = tre[tre["rank"] == "superkingdom"]["name"]
    self.assertTrue(
        superkingdom_names.isin(["Bacteria", "Viruses"]).all(),
        "classification on updated index failed")