def test_trim(self):
    """ Checks that parsing the nisin genbank file with start=10 and
        end=5000 gives a single Record holding 2 CDS features and a
        sequence of length 4990.
    """
    source = helpers.get_path_to_nisin_genbank()
    parsed = record_processing.parse_input_sequence(source, start=10,
                                                    end=5000)
    assert len(parsed) == 1
    trimmed = parsed[0]
    assert isinstance(trimmed, Record)
    assert len(trimmed.get_cds_features()) == 2
    assert len(trimmed.seq) == 4990
Example #2
0
 def check_add_to_record(self, input_file, results):
     """ Adds the given results to the first record parsed from the input
         file, then checks that each PFAM domain with gene ontologies has
         the same ontology ids as the results report.

         Arguments:
             input_file: the path of the sequence file to parse
             results: the results object to add to the record
     """
     parsed = record_processing.parse_input_sequence(input_file)
     target = parsed[0]
     results.add_to_record(target)
     for pfam in target.get_pfam_domains():
         # domains without ontologies are not checked
         if not pfam.gene_ontologies:
             continue
         found = sorted(pfam.gene_ontologies.ids)
         expected = sorted(results.get_all_gos())
         assert found == expected
Example #3
0
 def test_nisin(self):
     """ Checks that the nisin genbank file parses to a single Record with
         11 CDS features and a sequence of length 15016.
     """
     parsed = record_processing.parse_input_sequence(
         helpers.get_path_to_nisin_genbank())
     assert len(parsed) == 1
     nisin = parsed[0]
     assert isinstance(nisin, Record)
     assert len(nisin.get_cds_features()) == 11
     assert len(nisin.seq) == 15016
Example #4
0
    def test_nisin_fasta_only(self):
        """ Checks that, for a FASTA input without CDS features, genefinding
            is only invoked when the configured tool is not "none", and that
            pre-processing raises the "all records skipped" error either way.
        """
        skipped_pattern = "all records skipped"

        config.update_config({"genefinding_tool": "none"})
        fasta = path.get_full_path(__file__, "data", "nisin.fasta")
        records = record_processing.parse_input_sequence(fasta)
        assert len(records) == 1
        assert not records[0].get_cds_features()

        # with the tool set to 'none', genefinding must be left alone
        with self.assertRaisesRegex(AntismashInputError, skipped_pattern):
            record_processing.pre_process_sequences(
                records, self.options, self.genefinding)
        assert not self.genefinding.was_run
        assert not records[0].get_cds_features()

        # with any other tool, genefinding has to be invoked
        records[0].skip = False
        config.update_config({"genefinding_tool": "not-none"})
        # no genes end up being marked, so the same error is raised again
        with self.assertRaisesRegex(AntismashInputError, skipped_pattern):
            record_processing.pre_process_sequences(
                records, self.options, self.genefinding)
        # ... even though genefinding did run this time
        assert self.genefinding.was_run
        # the dummy genefinding adds no features at all
        for processed in records:
            assert not processed.get_cds_features()
            assert processed.skip.lower() == "no genes found"
Example #5
0
def read_data(sequence_file, options) -> serialiser.AntismashResults:
    """ Loads the data for an analysis run, either from a sequence file
        (fasta/genbank) or from a file of prior results.

        Arguments:
            sequence_file: A fasta/genbank file to read (or None)
            options: An antismash Config instance

        Returns:
            an AntismashResults instance, populated only if reusing results

        Raises:
            ValueError: if neither a sequence file nor prior results were
                        provided, or if the prior results file is empty
    """
    if not (sequence_file or options.reuse_results):
        raise ValueError("No sequence file or prior results to read")

    if sequence_file:
        records = record_processing.parse_input_sequence(
            sequence_file,
            options.taxon,
            options.minlength,
            options.start,
            options.end,
        )
        # only the final path component is kept as the input file name
        input_name = sequence_file.rsplit(os.sep, 1)[-1]
        # one empty results dictionary per parsed record
        blank_results = [{} for _ in records]
        return serialiser.AntismashResults(input_name, records, blank_results,
                                           __version__)

    logging.debug("Attempting to reuse previous results in: %s",
                  options.reuse_results)
    # an empty results file is rejected before trying to load it
    with open(options.reuse_results) as handle:
        if not handle.read():
            raise ValueError("No results contained in file: %s" %
                             options.reuse_results)
    return serialiser.AntismashResults.from_file(options.reuse_results,
                                                 options.taxon)
Example #6
0
 def test_nisin(self):
     """ Checks that the nisin FASTA record starts with no features and has
         12 features, all of them CDS features, after pre-processing.
     """
     unannotated = parse_input_sequence(get_path_to_nisin_fasta())[0]
     assert unannotated.get_feature_count() == 0
     processed = pre_process_sequences([unannotated], self.options,
                                       genefinding)
     annotated = processed[0]
     assert annotated.get_feature_count() == 12
     # every one of those features has to be a CDS
     assert len(annotated.get_cds_features()) == 12
Example #7
0
    def test_add_to_record(self):
        """ Checks that pfam2go results for a manually added PFAM domain
            are attached to that domain when added to the record.
        """
        genbank = helpers.get_path_to_nisin_genbank()
        record = record_processing.parse_input_sequence(genbank)[0]
        assert not record.get_pfam_domains()

        # build a single PFAM domain for testing and add it to the record
        location = FeatureLocation(2, 5)
        test_pfam = PFAMDomain(location, description="test", protein_start=5,
                               protein_end=10, identifier="PF00005",
                               domain="PF00005", tool="test")
        test_pfam.domain_id = "test"
        record.add_pfam_domain(test_pfam)
        assert len(record.get_pfam_domains()) == 1

        # generate the pfam2go results for the record
        results = pfam2go.run_on_record(record, None, self.options)
        assert test_pfam in results.pfam_domains_with_gos

        # the ontologies only appear once the results are added
        assert not test_pfam.gene_ontologies
        results.add_to_record(record)
        assert test_pfam.gene_ontologies

        # each domain's annotation has to match what the results report
        for annotated in record.get_pfam_domains():
            assert annotated.gene_ontologies
            found = sorted(annotated.gene_ontologies.ids)
            expected = sorted(results.get_all_gos(annotated))
            assert found == expected
    def test_reuse(self):
        """ Checks regeneration of cluster_hmmer results from their JSON
            form: less restrictive thresholds give no results, while more
            restrictive thresholds give the matching subset of the
            original hits.
        """
        def by_evalue(hit):
            return hit["evalue"]

        def by_score(hit):
            return hit["score"]

        nisin = helpers.get_path_to_nisin_genbank()
        record = record_processing.parse_input_sequence(nisin)[0]

        results = helpers.run_and_regenerate_results_for_module(
            nisin, cluster_hmmer, self.options)
        previous = results.to_json()
        assert len(results.hits) == 24
        self.check_add_to_record(nisin, results)

        # a less restrictive score threshold can't reuse the old results
        self.set_min_score(self.original_min_score - .1)
        regenerated = cluster_hmmer.regenerate_previous_results(
            previous, record, self.options)
        assert regenerated is None
        self.set_min_score(self.original_min_score)

        # the same goes for a less restrictive evalue threshold
        self.set_max_evalue(self.original_max_evalue + .1)
        regenerated = cluster_hmmer.regenerate_previous_results(
            previous, record, self.options)
        assert regenerated is None
        self.set_max_evalue(self.original_max_evalue)

        # a more restrictive evalue threshold keeps a subset of the hits
        stricter_evalue = sorted(by_evalue(hit) for hit in results.hits)[12]
        assert stricter_evalue < self.original_max_evalue
        expected = sorted(
            (hit for hit in results.hits if by_evalue(hit) <= stricter_evalue),
            key=by_evalue)
        assert len(expected) < 24

        self.set_max_evalue(stricter_evalue)
        regenerated = cluster_hmmer.regenerate_previous_results(
            previous, record, self.options)
        self.set_max_evalue(self.original_max_evalue)
        assert sorted(regenerated.hits, key=by_evalue) == expected
        self.check_add_to_record(nisin, results)

        # a more restrictive score threshold also keeps a subset of the hits
        stricter_score = sorted(by_score(hit) for hit in results.hits)[12]
        assert stricter_score > cluster_hmmer.MIN_SCORE
        expected = sorted(
            (hit for hit in results.hits if by_score(hit) >= stricter_score),
            key=by_score)
        assert len(expected) < 24

        self.set_min_score(stricter_score)
        regenerated = cluster_hmmer.regenerate_previous_results(
            previous, record, self.options)
        self.set_min_score(self.original_min_score)
        assert sorted(regenerated.hits, key=by_score) == expected
        self.check_add_to_record(nisin, results)
 def test_nisin_fasta_gff(self):
     """ Checks that a FASTA input paired with a GFF3 file gets its 11 CDS
         features without genefinding being run.
     """
     data_files = {
         "fasta": path.get_full_path(__file__, "data", "nisin.fasta"),
         "gff": path.get_full_path(__file__, "data", "nisin.gff3"),
     }
     config.update_config({"genefinding_gff3": data_files["gff"]})
     records = record_processing.parse_input_sequence(
         data_files["fasta"], gff_file=data_files["gff"])
     record_processing.pre_process_sequences(records, self.options,
                                             self.genefinding)
     assert not self.genefinding.was_run
     assert len(records[0].get_cds_features()) == 11
 def test_fumigatus_cluster(self):
     """ Checks that the fumigatus cluster FASTA, parsed with the fungal
         taxon, has no features before pre-processing and 11 features, all
         of them CDS features, afterwards.
     """
     fasta = self.data_file('fumigatus.cluster1.fna')
     unannotated = parse_input_sequence(fasta, taxon="fungi")[0]
     assert unannotated.get_feature_count() == 0
     processed = pre_process_sequences([unannotated], self.options,
                                       genefinding)
     annotated = processed[0]
     assert annotated.get_feature_count() == 11
     # every one of those features has to be a CDS
     assert len(annotated.get_cds_features()) == 11
Example #11
0
 def test_shotgun(self):
     """ Checks that pre-processing the records of the WGS genbank file
         raises an AntismashInputError about incomplete shotgun records.
     """
     expected_error = ("incomplete whole genome shotgun records "
                       "are not supported")
     wgs_file = path.get_full_path(__file__, "data", "wgs.gbk")
     records = record_processing.parse_input_sequence(wgs_file)
     with self.assertRaisesRegex(AntismashInputError, expected_error):
         record_processing.pre_process_sequences(
             records, self.options, self.genefinding)
Example #12
0
 def setUp(self):
     """ Parses the test genbank file and adds a single protocluster
         spanning the whole record, then builds the candidate clusters
         and regions from it.
     """
     genbank = path.get_full_path(__file__, 'data',
                                  'NC_003888.3.cluster011.gbk')
     self.record = record_processing.parse_input_sequence(genbank)[0]
     record_length = len(self.record.seq)
     # core and surrounding locations both cover the full record
     self.cluster = Protocluster(
         FeatureLocation(0, record_length),
         surrounding_location=FeatureLocation(0, record_length),
         cutoff=20,
         neighbourhood_range=0,
         tool="test",
         product="T2PKS",
         detection_rule="dummy rule",
     )
     self.record.add_protocluster(self.cluster)
     self.record.create_candidate_clusters()
     self.record.create_regions()
    def test_minimum_length(self):
        """ Checks the minimum_length filter of parse_input_sequence: the
            nisin record is kept for minimum lengths of -16 and 15016,
            dropped for 15017, and non-int values raise a TypeError.
        """
        nisin_path = helpers.get_path_to_nisin_genbank()
        records = record_processing.parse_input_sequence(nisin_path,
                                                         minimum_length=-16)
        assert len(records) == 1

        records = record_processing.parse_input_sequence(nisin_path,
                                                         minimum_length=15016)
        assert len(records) == 1

        records = record_processing.parse_input_sequence(nisin_path,
                                                         minimum_length=15017)
        assert not records

        for bad_len in [5.6, None, "5"]:
            with self.assertRaisesRegex(TypeError,
                                        "minimum_length must be an int"):
                # use the real input file rather than the `path` module, so
                # the only invalid argument is the minimum length itself
                record_processing.parse_input_sequence(nisin_path,
                                                       minimum_length=bad_len)
 def test_records_with_bad_names(self):
     """ Checks that CDS features are still found for the fumigatus
         record when its id is replaced with a problematic one.
     """
     problem_ids = (
         # changes due to glimmerhmm
         ".",
         # could cause a fasta file to be created that is interpreted
         # as an arg
         "-bad",
     )
     for problem_id in problem_ids:
         fasta = self.data_file('fumigatus.cluster1.fna')
         renamed = parse_input_sequence(fasta, taxon="fungi")[0]
         renamed.id = problem_id
         processed = pre_process_sequences([renamed], self.options,
                                           genefinding)
         assert processed[0].get_cds_features()
 def test_multiple_biosynthetic_enzymes(self):
     """ Checks that the specific analysis of the CP013129.1 section
         reports one cluster containing two loci, with the expected
         peptide subclass for the first motif of each locus.
     """
     genbank = path.get_full_path(__file__, 'data',
                                  'CP013129.1.section.gbk')
     parsed = record_processing.parse_input_sequence(genbank,
                                                     taxon="bacteria")
     rec = parsed[0]
     rec.clear_cds_motifs()
     assert rec.get_cluster(0).products == ("lanthipeptide", "nrps")
     assert rec.get_cluster(0).cds_children

     result = run_specific_analysis(rec)
     assert len(result.clusters) == 1
     assert result.clusters[1] == {"AQF52_7190", "AQF52_7168"}

     expected_subclasses = (
         ("AQF52_7190", "Class II"),
         ("AQF52_7168", "Class III"),
     )
     for locus, subclass in expected_subclasses:
         motif = result.motifs_by_locus[locus][0]
         assert motif.peptide_subclass == subclass
Example #16
0
def read_data(sequence_file: Optional[str],
              options: ConfigType) -> serialiser.AntismashResults:
    """ Reads in the data to be used in the analysis run. Can be provided
        as a sequence file (fasta/genbank) or as a file of prior results.

        In both cases the "input_file" config value is updated to the
        results' input file name with its extension removed. When reusing
        results with a taxon differing from the configured one, the config
        is updated to use the taxon of the prior results.

        Arguments:
            sequence_file: A fasta/genbank file to read (or None)
            options: An antismash Config instance

        Returns:
            an AntismashResults instance, populated only if reusing results

        Raises:
            ValueError: if neither a sequence file nor prior results were
                        provided, or if the prior results file is empty
    """
    if not sequence_file and not options.reuse_results:
        raise ValueError("No sequence file or prior results to read")

    if sequence_file:
        records = record_processing.parse_input_sequence(
            sequence_file,
            options.taxon,
            options.minlength,
            options.start,
            options.end,
            gff_file=options.genefinding_gff3)
        # only the final path component is kept as the input file name,
        # with one empty results dictionary per parsed record
        results = serialiser.AntismashResults(
            sequence_file.rsplit(os.sep, 1)[-1],
            records,
            [{} for _ in records],
            __version__,
            taxon=options.taxon)
    else:
        logging.debug("Attempting to reuse previous results in: %s",
                      options.reuse_results)
        # an empty results file is rejected before trying to load it
        with open(options.reuse_results) as handle:
            contents = handle.read()
            if not contents:
                raise ValueError("No results contained in file: %s" %
                                 options.reuse_results)
        results = serialiser.AntismashResults.from_file(options.reuse_results)
        for record in results.records:
            record.strip_antismash_annotations()
        if options.taxon != results.taxon:
            logging.info("Reusing taxon %s from prior results", results.taxon)
            update_config({"taxon": results.taxon})

    # set once for both branches: the input file name without its extension
    # (an earlier, sequence-file-only write stored the extension itself and
    # was always overwritten by this one)
    update_config({"input_file": os.path.splitext(results.input_file)[0]})
    return results
Example #17
0
    def test_regeneration(self):
        """ Checks that regenerated active site finder results contain
            valid pairings, that existing results are reused as-is, and
            that results of the wrong type are rejected.
        """
        datafile = helpers.get_path_to_balhymicin_genbank()
        results = helpers.run_and_regenerate_results_for_module(
            datafile, active_site_finder, self.options)
        assert results.pairings
        for domain, labels in results.pairings:
            for label in labels:
                assert label
                assert isinstance(label, str)
            assert isinstance(domain, secmet.AntismashDomain)
        # parse_input_sequence returns a list of records, so pass a single
        # record on to run_on_record rather than the whole list
        record = parse_input_sequence(datafile)[0]

        # check the reuse portion works
        rerun = active_site_finder.run_on_record(record, results, self.options)
        assert rerun is results  # specifically checking it's the same object

        with self.assertRaisesRegex(AssertionError, "str"):
            active_site_finder.run_on_record(record, "invalid", self.options)
Example #18
0
    def test_nisin(self):
        """ Checks that running tta on the nisin record yields 174 features,
            which only reach the record once the results are added to it.
        """
        expected_features = 174
        nisin_file = helpers.get_path_to_nisin_with_detection()
        record = parse_input_sequence(nisin_file)[0]
        clusters = record.get_clusters()
        assert clusters
        for cluster in clusters:
            assert cluster.cds_children
        assert record.get_cds_features_within_clusters()
        initial_count = record.get_feature_count()

        assert tta.check_prereqs() == []
        assert tta.check_options(self.options) == []
        assert tta.is_enabled(self.options)

        # no previous results are supplied for the run
        results = tta.run_on_record(record, None, self.options)
        assert isinstance(results, ModuleResults)
        assert len(results.features) == expected_features
        # running alone must not alter the record
        assert record.get_feature_count() == initial_count
        results.add_to_record(record)
        assert record.get_feature_count() == initial_count + expected_features
    def test_nisin_fasta_only(self):
        """ Checks that, for a FASTA input without CDS features, genefinding
            is only invoked when the configured tool is not "none".
        """
        config.update_config({"genefinding_tool": "none"})
        fasta = path.get_full_path(__file__, "data", "nisin.fasta")
        records = record_processing.parse_input_sequence(fasta)
        assert len(records) == 1
        assert not records[0].get_cds_features()

        # with the tool set to 'none', genefinding must be left alone
        record_processing.pre_process_sequences(
            records, self.options, self.genefinding)
        assert not self.genefinding.was_run
        assert not records[0].get_cds_features()

        # with any other tool, genefinding has to be invoked
        records[0].skip = False
        config.update_config({"genefinding_tool": "not-none"})
        record_processing.pre_process_sequences(
            records, self.options, self.genefinding)
        assert self.genefinding.was_run
        # the dummy genefinding adds no features at all
        assert not records[0].get_cds_features()
Example #20
0
 def setUp(self):
     """ Parses the test genbank file, adds a single cluster spanning the
         whole record, builds superclusters and regions, and mocks the
         t2pks hmmscan and starter unit blastp calls with fixed results.
     """
     genbank = path.get_full_path(__file__, 'data',
                                  'NC_003888.3.cluster011.gbk')
     self.record = record_processing.parse_input_sequence(genbank)[0]
     record_length = len(self.record.seq)
     # core and surrounding locations both cover the full record
     self.cluster = Cluster(
         FeatureLocation(0, record_length),
         surrounding_location=FeatureLocation(0, record_length),
         cutoff=20,
         neighbourhood_range=0,
         tool="test",
         product="T2PKS",
         detection_rule="dummy rule",
     )
     self.record.add_cluster(self.cluster)
     self.record.create_superclusters()
     self.record.create_regions()

     # one hit per key: (key, HMMResult positional arguments, evalue, bitscore)
     single_hits = (
         ('SCO5072', ("KR", 1, 265), 3.1e-49, 159.4),
         ('SCO5079', ("DIMER", 4, 293), 8.7e-131, 426.8),
         ('SCO5080', ("OXY", 8, 377), 2.1e-14, 44.7),
         ('SCO5086', ("KR_C9", 0, 261), 1.9e-134, 438.4),
         ('SCO5087', ("KS", 44, 463), 3.5e-234, 768.6),
         ('SCO5088', ("CLF_7", 1, 401), 1.2e-226, 743.5),
         ('SCO5089', ("ACP", 4, 86), 5e-36, 114.2),
         ('SCO5090', ("CYC_C7-C12", 1, 312), 7.8e-124, 404),
         ('SCO5091', ("CYC_C5-C14", 3, 297), 4.4e-143, 467.3),
         ('SCO5094', ("MET", 40, 155), 9.8e-11, 32.7),
         ('SCO5097', ("KR", 3, 247), 3.3e-40, 129.8),
     )
     hmm_results = {
         locus: [HMMResult(*hit_args, evalue=evalue, bitscore=bitscore)]
         for locus, hit_args, evalue, bitscore in single_hits
     }
     mock("t2pks_analysis.run_t2pks_hmmscan", returns=hmm_results)
     mock("t2pks_analysis.run_starter_unit_blastp", returns={})
Example #21
0
    def test_add_to_record(self):
        """ Checks that pfam2go results for a manually added dummy PFAM
            domain are attached to that domain when added to the record.
        """
        genbank = helpers.get_path_to_nisin_genbank()
        record = record_processing.parse_input_sequence(genbank)[0]
        assert not record.get_pfam_domains()

        # build a single dummy PFAM domain and add it to the record
        dummy_pfam = helpers.DummyPFAMDomain(identifier="PF00005",
                                             domain="PF00005")
        record.add_pfam_domain(dummy_pfam)
        assert len(record.get_pfam_domains()) == 1

        # generate the pfam2go results for the record
        results = pfam2go.run_on_record(record, None, self.options)
        assert dummy_pfam in results.pfam_domains_with_gos

        # the ontologies only appear once the results are added
        assert not dummy_pfam.gene_ontologies
        results.add_to_record(record)
        assert dummy_pfam.gene_ontologies

        # each domain's annotation has to match what the results report
        for annotated in record.get_pfam_domains():
            assert annotated.gene_ontologies
            found = sorted(annotated.gene_ontologies.ids)
            expected = sorted(results.get_all_gos(annotated))
            assert found == expected
Example #22
0
 def check_add_to_record(self, input_file, results):
     """ Checks that adding the results to a freshly parsed record, which
         has no PFAM domains, creates one PFAM domain per hit.

         Arguments:
             input_file: the path of the sequence file to parse
             results: the results object to add to the record
     """
     parsed = record_processing.parse_input_sequence(input_file)
     target = parsed[0]
     assert not target.get_pfam_domains()
     results.add_to_record(target)
     domain_count = len(target.get_pfam_domains())
     assert domain_count == len(results.hits)
Example #23
0
 def test_empty(self):
     """ Checks that parsing an empty temporary genbank file raises an
         AntismashInputError reporting that no valid records were found.
     """
     expected_error = "no valid records found"
     with NamedTemporaryFile(suffix=".gbk") as empty_file, \
             self.assertRaisesRegex(AntismashInputError, expected_error):
         record_processing.parse_input_sequence(empty_file.name)
Example #24
0
 def test_nonexistant(self):
     """ Checks that parsing "does_not_exist.gbk" raises an
         AntismashInputError matching "No such file or directory".
     """
     missing_file = "does_not_exist.gbk"
     expected_error = "No such file or directory"
     with self.assertRaisesRegex(AntismashInputError, expected_error):
         record_processing.parse_input_sequence(missing_file)
Example #25
0
 def read_nisin(self):
     """ Parses the nisin genbank file and checks it held exactly one record.

         Returns:
             the list of parsed records
     """
     nisin_path = helpers.get_path_to_nisin_genbank()
     nisin_records = record_processing.parse_input_sequence(nisin_path)
     assert len(nisin_records) == 1
     return nisin_records
 def test_empty(self):
     """ Checks that parsing an empty temporary genbank file raises a
         RuntimeError reporting that no records could be read.
     """
     expected_error = "No records could be read from file"
     with NamedTemporaryFile(suffix=".gbk") as empty_file, \
             self.assertRaisesRegex(RuntimeError, expected_error):
         record_processing.parse_input_sequence(empty_file.name)
 def test_nonexistant(self):
     """ Checks that parsing "does_not_exist.gbk" raises a ValueError
         reporting that the sequence file was not found.
     """
     missing_file = "does_not_exist.gbk"
     expected_error = "Sequence file not found: .*"
     with self.assertRaisesRegex(ValueError, expected_error):
         record_processing.parse_input_sequence(missing_file)
Example #28
0
 def check_add_to_record(self, input_file, results):
     """ Checks that adding the results to a freshly parsed record, which
         has no "tigrfam" domains, creates one such domain per hit.

         Arguments:
             input_file: the path of the sequence file to parse
             results: the results object to add to the record
     """
     tool = "tigrfam"
     parsed = record_processing.parse_input_sequence(input_file)
     target = parsed[0]
     assert not target.get_antismash_domains_by_tool(tool)
     results.add_to_record(target)
     domain_count = len(target.get_antismash_domains_by_tool(tool))
     assert domain_count == len(results.hits)