Code example #1
File: test_region.py  Project: zachcp/antismash
 def test_genbank(self):
     dummy_record = Record(Seq("A" * 100, generic_dna))
     clusters = [
         create_cluster(3, 20, "prodA"),
         create_cluster(25, 41, "prodB")
     ]
     for cluster in clusters:
         dummy_record.add_cluster(cluster)
     subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
     dummy_record.add_subregion(subregion)
     supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
     dummy_record.add_supercluster(supercluster)
     region = Region(superclusters=[supercluster], subregions=[subregion])
     dummy_record.add_region(region)
     with NamedTemporaryFile(suffix=".gbk") as output:
         region.write_to_genbank(output.name)
         bio = list(seqio.parse(output.name))
     assert len(bio) == 1
     rec = Record.from_biopython(bio[0], taxon="bacteria")
     assert len(rec.get_regions()) == 1
     new = rec.get_region(0)
     assert new.location.start == 3 - region.location.start
     assert new.location.end == 71 - region.location.start
     assert new.products == region.products
     assert new.probabilities == region.probabilities
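The location assertions near the end expect the exported GenBank file to be re-based to the region's own coordinates, i.e. every absolute position is shifted down by region.location.start. A minimal sketch of that arithmetic with plain integers (region_start here is a placeholder value; in the test it depends on how the region's extent is calculated, and the names below are not part of the test):

# Hypothetical illustration of the re-basing checked above: exported
# coordinates are the original coordinates minus the region's start.
region_start = 3                  # placeholder, not necessarily the test's value
absolute_span = (3, 71)           # first cluster start and subregion end above
rebased_span = (absolute_span[0] - region_start, absolute_span[1] - region_start)
assert rebased_span == (0, 68)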
Code example #2
File: __init__.py  Project: zachcp/antismash
def regenerate_previous_results(results: Dict[str, Any], record: Record,
                                _options: ConfigType) -> Optional[ClusterFinderRuleResults]:
    """ Regenerate previous results. """
    if not results:
        return None

    regenerated = ClusterFinderRuleResults.from_json(results, record)
    if not regenerated:
        return None
    for cluster in regenerated.get_predicted_clusters():
        record.add_cluster(cluster)
    return regenerated
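For context, this hook only does something useful when a driver feeds it the module's section of a previously saved results file. A minimal, hypothetical call pattern (the key name "clusterfinder_rule" and the previous_run_json/record/options variables are assumptions for illustration, not antiSMASH's actual driver code):

# Hypothetical usage sketch; the key name and surrounding variables are assumed.
stored = previous_run_json.get("clusterfinder_rule", {})
regenerated = regenerate_previous_results(stored, record, options)
if regenerated is None:
    # nothing was stored, or it could not be regenerated for this record,
    # so the module would need to run from scratch
    pass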
Code example #3
class HmmDetectionTest(unittest.TestCase):
    def setUp(self):
        self.rules_file = path.get_full_path(__file__, "..", "cluster_rules.txt")
        self.signature_file = path.get_full_path(__file__, "..", "data", "hmmdetails.txt")
        self.signature_names = {sig.name for sig in core.get_signature_profiles()}
        self.filter_file = path.get_full_path(__file__, "..", "filterhmmdetails.txt")
        self.results_by_id = {
            "GENE_1": [
                FakeHSPHit("modelA", "GENE_1", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_1", 0, 10, 50, 0)
            ],
            "GENE_2": [
                FakeHSPHit("modelC", "GENE_2", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_2", 0, 10, 50, 0)
            ],
            "GENE_3": [
                FakeHSPHit("modelC", "GENE_3", 0, 10, 50, 0),
                FakeHSPHit("modelF", "GENE_3", 0, 10, 50, 0)
            ],
            "GENE_4": [
                FakeHSPHit("modelA", "GENE_4", 0, 10, 50, 0),
                FakeHSPHit("modelE", "GENE_4", 0, 10, 50, 0)
            ],
            "GENE_5": [
                FakeHSPHit("modelA", "GENE_5", 0, 10, 50, 0),
                FakeHSPHit("modelG", "GENE_5", 0, 10, 50, 0)
            ]
        }
        self.feature_by_id = {
            "GENE_1": DummyCDS(0, 30000, locus_tag="GENE_1"),
            "GENE_2": DummyCDS(30000, 50000, locus_tag="GENE_2"),
            "GENE_3": DummyCDS(70000, 90000, locus_tag="GENE_3"),
            "GENE_X": DummyCDS(95000, 100000, locus_tag="GENE_X"),  # no hits
            "GENE_4": DummyCDS(125000, 140000, locus_tag="GENE_4"),
            "GENE_5": DummyCDS(130000, 150000, locus_tag="GENE_5")
        }

        self.test_names = {"modelA", "modelB", "modelC", "modelF", "modelG",
                           "a", "b", "c", "d"}

        self.rules = rule_parser.Parser("\n".join([
                "RULE MetaboliteA CUTOFF 10 EXTENT 5 CONDITIONS modelA",
                "RULE MetaboliteB CUTOFF 10 EXTENT 5 CONDITIONS cds(modelA and modelB)",
                "RULE MetaboliteC CUTOFF 10 EXTENT 5 CONDITIONS (modelA and modelB)",
                "RULE MetaboliteD CUTOFF 20 EXTENT 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
                "RULE Metabolite0 CUTOFF 1 EXTENT 3 CONDITIONS modelF",
                "RULE Metabolite1 CUTOFF 1 EXTENT 3 CONDITIONS modelG"]), self.test_names).rules
        self.features = []
        for gene_id in self.feature_by_id:
            self.features.append(self.feature_by_id[gene_id])
        self.features.sort(key=lambda x: x.location.start)  # dict iteration order is only guaranteed from py3.7
        self.record = Record()
        self.record._record.seq = Seq("A"*150000)
        for feature in self.features:
            self.record.add_cds_feature(feature)

    def tearDown(self):
        # clear out any leftover config adjustments
        get_config().__dict__.clear()

    def test_overlaps_but_not_contains(self):
        # should get gene2 and gene3
        rules = rule_parser.Parser("\n".join([
                "RULE Overlap CUTOFF 25 EXTENT 5 CONDITIONS modelB and modelF "
                "RULE OverlapImpossible CUTOFF 25 EXTENT 5 CONDITIONS modelA and modelF"]), self.test_names).rules
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(self.record, self.results_by_id, rules)
        assert detected_types == {"GENE_2": {"Overlap": {"modelB"}},
                                  "GENE_3": {"Overlap": {"modelF"}}}

        assert cluster_type_hits == {"Overlap": {"GENE_2", "GENE_3"}}

        # only 1 cluster should be found, since it requires both genes
        # if forming clusters by .is_contained_by(), 2 clusters will be formed
        # if finding rule hits uses .is_contained_by(), no clusters will be formed
        rules_by_name = {rule.name: rule for rule in rules}
        clusters = hmm_detection.find_clusters(self.record, cluster_type_hits, rules_by_name)
        assert len(clusters) == 1
        assert clusters[0].product == "Overlap"
        assert clusters[0].core_location.start == 30000
        assert clusters[0].core_location.end == 90000

    def test_core(self):
        # should be no failing prerequisites
        assert core.check_prereqs() == []
        # always runs
        assert core.is_enabled(None)

    def test_apply_cluster_rules(self):
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(self.record, self.results_by_id,
                                                                              self.rules)
        for gid in detected_types:
            detected_types[gid] = set(detected_types[gid])
        expected_types = {
            "GENE_1": set(["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]),
            "GENE_2": set(["MetaboliteC", "MetaboliteD"]),
            "GENE_3": set(["Metabolite0"]),
            "GENE_4": set(["MetaboliteA"]),
            "GENE_5": set(["Metabolite1", "MetaboliteA"])
        }
        assert detected_types == expected_types

        assert cluster_type_hits == {"MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
                                     "MetaboliteB": {"GENE_1"},
                                     "MetaboliteC": {"GENE_1", "GENE_2"},
                                     'MetaboliteD': {'GENE_1', 'GENE_2'},
                                     'Metabolite0': {'GENE_3'},
                                     'Metabolite1': {'GENE_5'}}

    def test_find_clusters(self):
        cds_features_by_type = {"MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
                                "MetaboliteB": {"GENE_1"},
                                "MetaboliteC": {"GENE_1", "GENE_2"},
                                'MetaboliteD': {'GENE_1', 'GENE_2'},
                                'Metabolite0': {'GENE_3'},
                                'Metabolite1': {'GENE_5'}}
        rules = {rule.name: rule for rule in self.rules}
        for cluster in hmm_detection.find_clusters(self.record, cds_features_by_type, rules):
            self.record.add_cluster(cluster)
        assert len(self.record.get_clusters()) == 7
        cluster_products = sorted([cluster.product for cluster in self.record.get_clusters()])
        assert cluster_products == sorted(["Metabolite%s" % i for i in "01AABCD"])
        self.record.create_superclusters()
        assert len(self.record.get_superclusters()) == 3
        self.record.create_regions()
        assert len(self.record.get_regions()) == 3
        result_regions = []
        for region in self.record.get_regions():
            result_regions.append(sorted(cds.get_name() for cds in region.cds_children))

        expected_regions = [
            ["GENE_1", "GENE_2"],
            ["GENE_3"],
            ["GENE_4", "GENE_5"]
        ]
        assert result_regions == expected_regions

    def test_create_rules(self):
        rules = hmm_detection.create_rules(self.rules_file, self.signature_names)
        assert len(rules) == 52
        t1pks_rules = [rule for rule in rules if rule.name == "T1PKS"]
        assert len(t1pks_rules) == 1
        rule = t1pks_rules[0]
        assert rule.cutoff == 20000
        assert rule.extent == 20000

    def test_profiles_parsing(self):
        profiles = signatures.get_signature_profiles()
        assert len(profiles) == 241  # ensures we don't delete any by accident

    def test_filter(self):
        # fake HSPs all in one CDS with overlap > 20 and query_ids from the same equivalence group

        # not overlapping by > 20
        first = FakeHSPHit("AMP-binding", "A", 50, 90, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # overlapping, in same group
        first.hit_end = 91
        assert hmm_detection.hsp_overlap_size(first, second) == 21
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [second]
        assert by_id == {"A": [second]}

        # overlapping, not in same group
        second.query_id = "none"
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not in the same CDS, but in the same group
        second.hit_id = "B"
        second.query_id = "A-OX"
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first], "B": [second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_filter_multiple(self):
        # all in one CDS, no overlap, and the same query_ids -> cull all but the best score

        # not overlapping, not same query_id
        first = FakeHSPHit("AMP-binding", "A", 50, 60, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        both = [first, second]
        by_id = {"A": [first, second]}
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not overlapping, same query_id
        first.query_id = "A-OX"
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [second]
        assert by_id == {"A": [second]}

        # not in same CDS, same query_id
        second.hit_id = "B"
        by_id = {"A": [first], "B": [second]}
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_equivalence_groups(self):
        group_file = path.get_full_path(os.path.dirname(__file__), "filterhmmdetails.txt")
        sets = []
        with open(group_file) as group_lines:
            sets = [set(line.strip().split(',')) for line in group_lines]

        # ensure they have at least two elements
        assert all(len(s) > 1 for s in sets)

        # ensure that the groups are disjoint
        for i, group in enumerate(sets):
            for other in sets[i + 1:]:
                assert group.isdisjoint(other)

    def test_hsp_overlap_size(self):
        overlap_size = hmm_detection.hsp_overlap_size
        first = FakeHSPHit("A", "A", 50, 60, 0., None)
        second = FakeHSPHit("B", "B", 70, 100, 0., None)
        # no overlap
        assert overlap_size(first, second) == 0
        first.hit_end = 70
        # still no overlap, end isn't inclusive
        assert overlap_size(first, second) == 0
        # a mix of second starting inside first
        for i in range(1, 30):
            first.hit_end += 1
            assert overlap_size(first, second) == i
        # second wholly contained
        first.hit_end = 110
        assert overlap_size(first, second) == 30

        # first inside second
        first.hit_start = 75
        assert overlap_size(first, second) == 25

        # first inside second, but direction reversed
        first.hit_end = 50
        with self.assertRaises(AssertionError):
            overlap_size(first, second)

    def test_hmm_files_and_details_match(self):
        data_dir = path.get_full_path(os.path.dirname(__file__), "data", "")
        details_files = {prof.path for prof in signatures.get_signature_profiles()}
        details_files = {filepath.replace(data_dir, "") for filepath in details_files}
        data_dir_contents = set(glob.glob(data_dir + "*.hmm"))
        data_dir_contents = {filepath.replace(data_dir, "") for filepath in data_dir_contents}
        # ignore bgc_seeds.hmm for the sake of comparison, it's a generated aggregate
        data_dir_contents.discard("bgc_seeds.hmm")
        missing_files = details_files - data_dir_contents
        assert not missing_files
        extra_files = data_dir_contents - details_files
        assert not extra_files
        # finally, just to be sure
        assert data_dir_contents == details_files
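test_filter above exercises a single culling rule from several angles: two hits in the same CDS are reduced to the better-scoring one only when they overlap by more than 20 positions and their profiles share an equivalence group. A conceptual sketch of that decision, where pick_better and the score attribute are invented names rather than antiSMASH's API:

# Conceptual sketch only, not hmm_detection.filter_results itself.
# Both hits survive unless they sit in the same CDS, overlap by more than 20
# positions and belong to one equivalence group; then only the higher-scoring
# hit is kept (which is why 0.5 beats 0.1 in test_filter above).
def pick_better(first, second, overlap, same_cds, same_group):
    if same_cds and same_group and overlap > 20:
        return [max(first, second, key=lambda hit: hit.score)]
    return [first, second]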
Code example #4
class TestSuperCluster(unittest.TestCase):
    def setUp(self):
        self.record = Record(Seq("A" * 100))
        clusters = [create_cluster(0, 0, 10, 10)]
        for cluster in clusters:
            self.record.add_cluster(cluster)

    def test_kinds_attachment(self):
        assert SuperCluster.kinds == SuperClusterKind

    def test_record_linkage(self):
        cluster = SuperCluster(SuperCluster.kinds.INTERLEAVED,
                               self.record.get_clusters())
        with self.assertRaisesRegex(ValueError,
                                    "SuperCluster not contained in record"):
            cluster.get_supercluster_number()
        self.record.add_supercluster(cluster)
        assert cluster.get_supercluster_number() == 1

    def test_bad_kind(self):
        with self.assertRaisesRegex(TypeError, "should be SuperClusterKind"):
            SuperCluster("berf", self.record.get_clusters())

    def test_no_clusters(self):
        with self.assertRaisesRegex(ValueError,
                                    "cannot exist without at least one"):
            SuperCluster(SuperCluster.kinds.INTERLEAVED, [])

    def test_rules(self):
        cluster = SuperCluster(SuperCluster.kinds.INTERLEAVED,
                               self.record.get_clusters())
        assert cluster.detection_rules == [
            cluster.detection_rule for cluster in self.record.get_clusters()
        ]

    def test_smiles_and_polymer(self):
        cluster = SuperCluster(SuperCluster.kinds.INTERLEAVED,
                               self.record.get_clusters())
        assert cluster.smiles_structure is None
        assert cluster.polymer is None

    def test_conversion(self):
        kind = SuperClusterKind.INTERLEAVED
        original = SuperCluster(kind,
                                self.record.get_clusters(),
                                smiles="dummy smiles",
                                polymer="dummy polymer")
        self.record.add_supercluster(original)
        assert original.products == ["a"]
        assert len(original.clusters) == 1
        bios = original.to_biopython()
        assert len(bios) == 1
        bio = bios[0]
        assert bio.qualifiers["product"] == ["a"]
        assert bio.qualifiers["kind"] == [str(kind)]
        assert bio.qualifiers["candidate_cluster_number"] == [
            str(original.get_supercluster_number())
        ]
        assert bio.qualifiers["SMILES"] == ["dummy smiles"]
        assert bio.qualifiers["polymer"] == ["dummy polymer"]
        assert bio.qualifiers["contig_edge"] == ["True"]
        regenerated = SuperCluster.from_biopython(bio)
        assert isinstance(regenerated, TemporarySuperCluster)
        assert regenerated.products == original.products
        assert regenerated.location == original.location
        assert regenerated.smiles_structure == original.smiles_structure
        assert regenerated.polymer == original.polymer
        assert regenerated.clusters == [
            cluster.get_cluster_number()
            for cluster in self.record.get_clusters()
        ]
        assert regenerated.kind == original.kind

        real = regenerated.convert_to_real_feature(self.record)
        assert isinstance(real, SuperCluster)
        assert len(real.clusters) == len(self.record.get_clusters())
        for reference, record_cluster in zip(real.clusters,
                                             self.record.get_clusters()):
            assert reference is record_cluster

        # attempt a conversion with a record missing the cluster
        self.record.clear_clusters()
        with self.assertRaisesRegex(ValueError,
                                    "Not all referenced clusters are present"):
            regenerated.convert_to_real_feature(self.record)
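test_conversion walks through the usual secmet round-trip: serialise the feature to a Biopython SeqFeature, rebuild a temporary stand-in from it, then resolve the stand-in's cluster numbers against a record. Condensed into one hypothetical helper (roundtrip is an invented name; the three method calls are exactly the ones used in the test above):

# Sketch of the round-trip exercised above; `roundtrip` itself is hypothetical.
def roundtrip(supercluster, record):
    bio = supercluster.to_biopython()[0]              # serialise to a SeqFeature
    temporary = SuperCluster.from_biopython(bio)      # holds cluster numbers only, no references yet
    return temporary.convert_to_real_feature(record)  # resolve the numbers against the record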
Code example #5
File: main.py  Project: zachcp/antismash
def run_detection(
    record: Record, options: ConfigType,
    module_results: Dict[str, Union[ModuleResults, Dict[str, Any]]]
) -> Dict[str, float]:
    """ Detect different secondary metabolite clusters, PFAMs, and domains.

        Arguments:
            record: the Record to run detection over
            options: antiSMASH config
            module_results: a dictionary mapping each module's name to its results
                            from a previous run, either as a ModuleResults subclass
                            or in JSON form

        Returns:
            the time taken by each detection module as a dictionary
    """
    # strip any existing antismash results first
    record_processing.strip_record(record)

    timings = {}  # type: Dict[str, float]

    # run full genome detections
    for module in [full_hmmer]:
        run_module(record, cast(AntismashModule, module), options,
                   module_results, timings)
        results = module_results.get(module.__name__)
        if results:
            assert isinstance(results, ModuleResults)
            logging.debug("Adding detection results from %s to record",
                          module.__name__)
            results.add_to_record(record)

    # generate cluster predictions
    logging.info("Detecting secondary metabolite clusters")
    for module in [
            hmm_detection, cassis, clusterfinder_probabilistic,
            clusterfinder_rule
    ]:
        run_module(record, cast(AntismashModule, module), options,
                   module_results, timings)
        results = module_results.get(module.__name__)
        if results:
            assert isinstance(results, DetectionResults)
            for cluster in results.get_predicted_clusters():
                record.add_cluster(cluster)
            for region in results.get_predicted_subregions():
                record.add_subregion(region)

    logging.debug("%d clusters found", len(record.get_clusters()))
    logging.debug("%d subregions found", len(record.get_subregions()))

    # create superclusters and regions
    record.create_superclusters()
    record.create_regions()

    if not record.get_regions():
        logging.info("No regions detected, skipping record")
        record.skip = "No regions detected"
        return timings

    logging.info("%d region(s) detected in record", len(record.get_regions()))

    # finally, run any detection limited to genes in clusters
    for module in [nrps_pks_domains, cluster_hmmer, genefunctions]:
        run_module(record, cast(AntismashModule, module), options,
                   module_results, timings)
        results = module_results.get(module.__name__)
        if results:
            assert isinstance(results, ModuleResults)
            logging.debug("Adding detection results from %s to record",
                          module.__name__)
            results.add_to_record(record)

    return timings
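A minimal call pattern for run_detection, assuming a parsed Record and a populated ConfigType are already available (parsed_record and config are placeholder names for illustration, not part of antiSMASH):

# Hypothetical call sketch; record parsing and option handling are elided.
module_results = {}  # type: Dict[str, Union[ModuleResults, Dict[str, Any]]]
timings = run_detection(parsed_record, config, module_results)
for module_name, seconds in sorted(timings.items()):
    logging.debug("%s took %.2f seconds", module_name, seconds)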