def test_GembaseHMMReport_extract(self):
    """Check that extract() on the gspD search file yields the six expected hits.

    The search result file is copied from the ``hmm`` data directory into
    the working directory, parsed through ``GembaseHMMReport.extract()``,
    and ``report.hits`` is compared, in order, with hand-built ``Hit`` objects.
    """
    gene_name = "gspD"
    system = System(self.cfg, "T2SS", 10)
    gene = Gene(self.cfg, gene_name, system, self.profile_registry)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix
    shutil.copy(os.path.join(self._data_dir, "hmm", search_file),
                self.cfg.working_dir)
    report_path = os.path.join(self.cfg.working_dir, search_file)

    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()
    self.assertEqual(len(report.hits), 6)

    # Row layout: hit_id, hit_seq_length, replicon_name, pos_hit, i_eval,
    #             score, profile_coverage, begin_match, end_match
    rows = [
        ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    # sequence_coverage is derived as (end - begin + 1) / hit_seq_length;
    # the float() casts keep the division a true division on any interpreter.
    expected_hits = [
        Hit(gene, system, hit_id, seq_len, replicon, position, i_eval, score,
            profile_cov, (float(end) - float(begin) + 1) / seq_len, begin, end)
        for (hit_id, seq_len, replicon, position, i_eval, score,
             profile_cov, begin, end) in rows
    ]
    self.assertListEqual(expected_hits, report.hits)
def test_save_extract(self):
    """Check that save_extract() writes the expected extract file for gspD.

    The search result file is copied into the working directory and parsed
    with ``extract()``, then ``save_extract()`` is called. The test asserts
    that the extract file exists under ``<working_dir>/<hmmer_dir>/`` and
    that its content equals a reference file built here from the expected
    header and the string form of each expected hit.
    """
    gene_name = "gspD"
    gene = CoreGene(self.model_location, gene_name, self.profile_factory)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), search_file)

    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()
    report.save_extract()

    extract_filename = gene_name + self.cfg.res_extract_suffix()
    extract_path = os.path.join(self.cfg.working_dir(),
                                self.cfg.hmmer_dir(),
                                extract_filename)
    self.assertTrue(os.path.exists(extract_path))
    self.assertTrue(os.path.isfile(extract_path))

    # Row layout: hit_id, hit_seq_length, replicon_name, pos_hit, i_eval,
    #             score, profile_coverage, begin_match, end_match
    rows = [
        ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    # sequence_coverage is derived as (end - begin + 1) / hit_seq_length.
    hits = [
        CoreHit(gene, hit_id, seq_len, replicon, position, i_eval, score,
                profile_cov, (float(end) - float(begin) + 1) / seq_len, begin, end)
        for (hit_id, seq_len, replicon, position, i_eval, score,
             profile_cov, begin, end) in rows
    ]

    # Each header line is terminated explicitly with "\n" so the layout
    # cannot be lost to whitespace handling of a multi-line literal.
    # NOTE(review): presumably save_extract() writes the same five-line
    # header that str(report) produces (the layout asserted by test_str
    # in this file) - confirm against GembaseHMMReport.save_extract.
    header = (
        "# gene: {name} extract from {path} hmm output\n"
        "# profile length= {len_profile:d}\n"
        "# i_evalue threshold= {i_evalue:.3f}\n"
        "# coverage threshold= {cov:.3f}\n"
        "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system "
        "i_eval score profile_coverage sequence_coverage begin end\n"
    ).format(name=gene.name,
             path=report_path,
             len_profile=len(gene.profile),
             i_evalue=self.cfg.i_evalue_sel(),
             cov=self.cfg.coverage_profile())

    expected_extract_path = os.path.join(self.cfg.working_dir(), 'expected_extract')
    with open(expected_extract_path, 'w') as expected_extract:
        expected_extract.write(header)
        # str(hit) is written as-is, one write per hit, with no separator added.
        expected_extract.writelines(str(h) for h in hits)

    self.assertFileEqual(extract_path, expected_extract_path)
def test_GembaseHMMReport_extract(self):
    """Check that extract() on the gspD search file yields the five expected hits.

    ``report.hits`` must contain exactly five entries and compare equal, in
    order, to the ``Hit`` objects built below.
    """
    gene_name = "gspD"
    system = System(self.cfg, "T2SS", 10)
    gene = Gene(self.cfg, gene_name, system, self.profile_registry)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix
    source_path = os.path.join(self._data_dir, "hmm", search_file)
    shutil.copy(source_path, self.cfg.working_dir)
    report_path = os.path.join(self.cfg.working_dir, search_file)

    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()
    self.assertEqual(len(report.hits), 5)

    def expected_hit(hit_id, seq_len, replicon, position, i_eval, score,
                     profile_cov, begin, end):
        """Build a Hit whose sequence coverage is (end - begin + 1) / seq_len."""
        seq_cov = (float(end) - float(begin) + 1) / seq_len
        return Hit(gene, system, hit_id, seq_len, replicon, position,
                   i_eval, score, profile_cov, seq_cov, begin, end)

    expected_hits = [
        expected_hit("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        expected_hit("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        expected_hit("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        expected_hit("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        expected_hit("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    self.assertListEqual(expected_hits, report.hits)
def test_best_hit(self):
    """Check best_hit() before and after extract() on the gspD report.

    Before ``extract()`` is called ``best_hit()`` must return ``None``;
    afterwards it must equal the expected ``NC_xxxxx_xx_056141`` hit.
    """
    gene_name = 'gspD'
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), search_file)

    report = GembaseHMMReport(c_gene, report_path, self.cfg)
    # Nothing has been extracted yet, so there is no best hit.
    self.assertIsNone(report.best_hit())

    report.extract()
    observed = report.best_hit()

    seq_len, begin, end = 803, 104, 741
    expected = CoreHit(
        c_gene,
        "NC_xxxxx_xx_056141",
        seq_len,
        "NC_xxxxx_xx",
        141,
        2e-236,
        779.2,
        1.0,
        (float(end) - float(begin) + 1) / seq_len,  # sequence coverage
        begin,
        end,
    )
    self.assertEqual(expected, observed)
def test_str(self):
    """Check that str(report) is the header followed by each expected hit.

    After ``extract()``, ``str(report)`` must equal five header lines (gene
    and source file, profile length, i-evalue threshold, coverage threshold,
    column names) followed by ``str(hit)`` for each of the six expected hits.
    """
    gene_name = "gspD"
    system = System(self.cfg, "T2SS", 10)
    gene = Gene(self.cfg, gene_name, system, self.profile_registry)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix
    shutil.copy(os.path.join(self._data_dir, "hmm", search_file),
                self.cfg.working_dir)
    report_path = os.path.join(self.cfg.working_dir, search_file)

    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()

    # Row layout: hit_id, hit_seq_length, replicon_name, pos_hit, i_eval,
    #             score, profile_coverage, begin_match, end_match
    rows = [
        ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    # sequence_coverage is derived as (end - begin + 1) / hit_seq_length;
    # the float() casts keep the division a true division on any interpreter.
    hits = [
        Hit(gene, system, hit_id, seq_len, replicon, position, i_eval, score,
            profile_cov, (float(end) - float(begin) + 1) / seq_len, begin, end)
        for (hit_id, seq_len, replicon, position, i_eval, score,
             profile_cov, begin, end) in rows
    ]

    # Assemble the expected text in one pass; this drops the dead initial
    # assignment and the repeated += concatenation.
    header_lines = [
        "# gene: {0} extract from {1} hmm output\n".format(gene.name, report_path),
        "# profile length= {0:d}\n".format(len(gene.profile)),
        "# i_evalue threshold= {0:.3f}\n".format(self.cfg.i_evalue_sel),
        "# coverage threshold= {0:.3f}\n".format(self.cfg.coverage_profile),
        "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score profile_coverage sequence_coverage begin end\n",
    ]
    expected = "".join(header_lines) + "".join(str(h) for h in hits)
    self.assertEqual(str(report), expected)
def test_str(self):
    """Check that str(report) is the header followed by each expected hit.

    After ``extract()``, ``str(report)`` must equal five header lines (gene
    and source file, profile length, i-evalue threshold, coverage threshold,
    column names) followed by ``str(hit)`` for each of the six expected hits.
    """
    gene_name = 'gspD'
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), search_file)

    report = GembaseHMMReport(c_gene, report_path, self.cfg)
    report.extract()

    def expected_hit(hit_id, seq_len, replicon, position, i_eval, score,
                     profile_cov, begin, end):
        """Build a CoreHit whose sequence coverage is (end - begin + 1) / seq_len."""
        seq_cov = (float(end) - float(begin) + 1) / seq_len
        return CoreHit(c_gene, hit_id, seq_len, replicon, position,
                       i_eval, score, profile_cov, seq_cov, begin, end)

    hits = [
        expected_hit("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        expected_hit("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        expected_hit("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        expected_hit("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        expected_hit("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        expected_hit("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]

    expected_parts = [
        f"# gene: {c_gene.name} extract from {report_path} hmm output\n",
        f"# profile length= {len(c_gene.profile):d}\n",
        f"# i_evalue threshold= {self.cfg.i_evalue_sel():.3f}\n",
        f"# coverage threshold= {self.cfg.coverage_profile():.3f}\n",
        "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score "
        "profile_coverage sequence_coverage begin end\n",
    ]
    # Hits follow the header directly, each rendered with str().
    expected_parts.extend(str(h) for h in hits)
    self.assertMultiLineEqual(str(report), "".join(expected_parts))
def test_extract(self):
    """Check extract() on the gspD search file, then on a pre-filled report.

    First, a fresh report must yield exactly the six expected ``CoreHit``
    objects, in order. Second, a new report whose ``hits`` attribute was
    assigned beforehand must return ``None`` from ``extract()``.
    """
    gene_name = "gspD"
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), search_file)

    report = GembaseHMMReport(c_gene, report_path, self.cfg)
    report.extract()
    self.assertEqual(len(report.hits), 6)

    # Row layout: hit_id, hit_seq_length, replicon_name, pos_hit, i_eval,
    #             score, profile_coverage, begin_match, end_match
    rows = (
        ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    )
    expected_hits = []
    for hit_id, seq_len, replicon, position, i_eval, score, profile_cov, begin, end in rows:
        # sequence_coverage is derived as (end - begin + 1) / hit_seq_length.
        seq_cov = (float(end) - float(begin) + 1) / seq_len
        expected_hits.append(
            CoreHit(c_gene, hit_id, seq_len, replicon, position,
                    i_eval, score, profile_cov, seq_cov, begin, end)
        )
    self.assertListEqual(expected_hits, report.hits)

    # Second scenario: hits are assigned before extract() is called.
    prefilled_report = GembaseHMMReport(c_gene, report_path, self.cfg)
    prefilled_report.hits = expected_hits
    self.assertIsNone(prefilled_report.extract())
def test_str(self):
    """Check that str(report) is the header followed by each expected hit.

    After ``extract()``, ``str(report)`` must equal five header lines (gene
    and source file, profile length, i-evalue threshold, coverage threshold,
    column names) followed by ``str(hit)`` for each of the five expected hits.
    """
    gene_name = "gspD"
    system = System(self.cfg, "T2SS", 10)
    gene = Gene(self.cfg, gene_name, system, self.profile_registry)

    # Stage the search result file inside the working directory.
    search_file = gene_name + self.cfg.res_search_suffix
    shutil.copy(os.path.join(self._data_dir, "hmm", search_file),
                self.cfg.working_dir)
    report_path = os.path.join(self.cfg.working_dir, search_file)

    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()

    # Row layout: hit_id, hit_seq_length, replicon_name, pos_hit, i_eval,
    #             score, profile_coverage, begin_match, end_match
    rows = [
        ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    # sequence_coverage is derived as (end - begin + 1) / hit_seq_length;
    # the float() casts keep the division a true division on any interpreter.
    hits = [
        Hit(gene, system, hit_id, seq_len, replicon, position, i_eval, score,
            profile_cov, (float(end) - float(begin) + 1) / seq_len, begin, end)
        for (hit_id, seq_len, replicon, position, i_eval, score,
             profile_cov, begin, end) in rows
    ]

    # Assemble the expected text in one pass; this drops the dead initial
    # assignment and the repeated += concatenation. The %-format literals
    # are unchanged, so the thresholds keep the default "%f" precision.
    header_lines = [
        "# gene: %s extract from %s hmm output\n" % (gene.name, report_path),
        "# profile length= %d\n" % len(gene.profile),
        "# i_evalue threshold= %f\n" % self.cfg.i_evalue_sel,
        "# coverage threshold= %f\n" % self.cfg.coverage_profile,
        "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score profile_coverage sequence_coverage begin end\n",
    ]
    expected = "".join(header_lines) + "".join(str(h) for h in hits)
    self.assertEqual(str(report), expected)