def test_trim(self):
    # parsing with start=10 and end=5000 is expected to give a single
    # record of 4990 bases that keeps only two CDS features
    genbank = helpers.get_path_to_nisin_genbank()
    parsed = record_processing.parse_input_sequence(genbank, start=10, end=5000)
    assert len(parsed) == 1
    trimmed = parsed[0]
    assert isinstance(trimmed, Record)
    cds_count = len(trimmed.get_cds_features())
    assert cds_count == 2
    assert len(trimmed.seq) == 4990
def check_add_to_record(self, input_file, results):
    """ Adds the given results to a freshly parsed record and checks that
        each PFAM domain which received gene ontologies carries the same
        ontology IDs that the results report for that domain.

        Arguments:
            input_file: the path of the sequence file to parse
            results: the results object to add to the record

        Returns:
            None
    """
    record = record_processing.parse_input_sequence(input_file)[0]
    results.add_to_record(record)
    for domain in record.get_pfam_domains():
        # domains that received no ontologies have nothing to compare
        if not domain.gene_ontologies:
            continue
        # the lookup has to be for this specific domain, the same comparison
        # elsewhere in these tests passes the domain in as well
        expected_ids = sorted(results.get_all_gos(domain))
        assert sorted(domain.gene_ontologies.ids) == expected_ids
def test_nisin(self):
    # without any trimming, the nisin genbank is expected to give a single
    # record of 15016 bases with 11 CDS features
    parsed = record_processing.parse_input_sequence(helpers.get_path_to_nisin_genbank())
    assert len(parsed) == 1
    nisin = parsed[0]
    assert isinstance(nisin, Record)
    cds_count = len(nisin.get_cds_features())
    assert cds_count == 11
    assert len(nisin.seq) == 15016
def test_nisin_fasta_only(self):
    # checks when genefinding is triggered for an input parsed from FASTA,
    # which has no CDS features after parsing
    def preprocess_expecting_all_skipped():
        # pre-processing is expected to fail as no record ends up usable
        with self.assertRaisesRegex(AntismashInputError, "all records skipped"):
            record_processing.pre_process_sequences(records, self.options, self.genefinding)

    config.update_config({"genefinding_tool": "none"})
    fasta_path = path.get_full_path(__file__, "data", "nisin.fasta")
    records = record_processing.parse_input_sequence(fasta_path)
    assert len(records) == 1
    assert not records[0].get_cds_features()

    # make sure genefinding wasn't run with default options
    preprocess_expecting_all_skipped()
    assert not self.genefinding.was_run
    assert not records[0].get_cds_features()

    # make sure genefinding was run when not 'none'
    records[0].skip = False
    config.update_config({"genefinding_tool": "not-none"})
    # due to no genes actually being marked, it'll raise an error
    preprocess_expecting_all_skipped()
    # but genefinding was still run
    assert self.genefinding.was_run
    # still no features because we used dummy genefinding
    for processed in records:
        assert not processed.get_cds_features()
        skip_reason = processed.skip.lower()
        assert skip_reason == "no genes found"
def read_data(sequence_file, options) -> serialiser.AntismashResults:
    """ Loads the data for an analysis run, either by parsing a sequence file
        (fasta/genbank) or by reading the results file of a previous run.
        A sequence file takes priority when both are available.

        Arguments:
            sequence_file: A fasta/genbank file to read (or None)
            options: An antismash Config instance

        Returns:
            a AntismashResults instance, populated only if reusing results

        Raises:
            ValueError: if neither input is given or if the file of prior
                        results is empty
    """
    if not (sequence_file or options.reuse_results):
        raise ValueError("No sequence file or prior results to read")

    if not sequence_file:
        logging.debug("Attempting to reuse previous results in: %s", options.reuse_results)
        with open(options.reuse_results) as handle:
            contents = handle.read()
        # an empty file can't hold any results
        if not contents:
            raise ValueError("No results contained in file: %s" % options.reuse_results)
        return serialiser.AntismashResults.from_file(options.reuse_results, options.taxon)

    records = record_processing.parse_input_sequence(
        sequence_file, options.taxon, options.minlength, options.start, options.end)
    # only the final path component is kept as the input name
    input_name = sequence_file.rsplit(os.sep, 1)[-1]
    # one empty results dictionary for each parsed record
    empty_results = [{} for _ in records]
    return serialiser.AntismashResults(input_name, records, empty_results, __version__)
def test_nisin(self):
    # the FASTA input has no features at all until pre-processing has run
    unannotated = parse_input_sequence(get_path_to_nisin_fasta())[0]
    assert unannotated.get_feature_count() == 0
    processed = pre_process_sequences([unannotated], self.options, genefinding)
    annotated = processed[0]
    assert annotated.get_feature_count() == 12
    # and make sure they're all CDS features
    cds_count = len(annotated.get_cds_features())
    assert cds_count == 12
def test_add_to_record(self):
    # the gene ontologies are expected on the domain only after the results
    # have been added to the record
    nisin_path = helpers.get_path_to_nisin_genbank()
    nisin_record = record_processing.parse_input_sequence(nisin_path)[0]
    assert not nisin_record.get_pfam_domains()

    # add a test PFAM
    test_domain = PFAMDomain(
        FeatureLocation(2, 5),
        description="test",
        protein_start=5,
        protein_end=10,
        identifier="PF00005",
        domain="PF00005",
        tool="test",
    )
    test_domain.domain_id = "test"
    nisin_record.add_pfam_domain(test_domain)
    assert len(nisin_record.get_pfam_domains()) == 1

    # run pfam2go and add the results
    go_results = pfam2go.run_on_record(nisin_record, None, self.options)
    assert test_domain in go_results.pfam_domains_with_gos
    assert not test_domain.gene_ontologies
    go_results.add_to_record(nisin_record)
    assert test_domain.gene_ontologies

    # check the contents of the annotation
    for annotated in nisin_record.get_pfam_domains():
        assert annotated.gene_ontologies
        found_ids = sorted(annotated.gene_ontologies.ids)
        assert found_ids == sorted(go_results.get_all_gos(annotated))
def test_reuse(self):
    # checks that previous results are discarded when thresholds are
    # loosened and filtered when thresholds are tightened
    nisin = helpers.get_path_to_nisin_genbank()
    record = record_processing.parse_input_sequence(nisin)[0]

    results = helpers.run_and_regenerate_results_for_module(nisin, cluster_hmmer, self.options)
    as_json = results.to_json()
    assert len(results.hits) == 24
    self.check_add_to_record(nisin, results)

    def regenerate():
        # rebuilds results from the stored JSON under the current thresholds
        return cluster_hmmer.regenerate_previous_results(as_json, record, self.options)

    def by_evalue(hit):
        return hit["evalue"]

    def by_score(hit):
        return hit["score"]

    # test regeneration when thresholds are less restrictive
    self.set_min_score(self.original_min_score - .1)
    regenerated = regenerate()
    assert regenerated is None
    self.set_min_score(self.original_min_score)

    self.set_max_evalue(self.original_max_evalue + .1)
    regenerated = regenerate()
    assert regenerated is None
    self.set_max_evalue(self.original_max_evalue)

    # test regeneration when evalue threshold is more restrictive
    evalue_cutoff = sorted(by_evalue(hit) for hit in results.hits)[12]
    assert evalue_cutoff < self.original_max_evalue
    expected_hits = sorted((hit for hit in results.hits if by_evalue(hit) <= evalue_cutoff),
                           key=by_evalue)
    assert len(expected_hits) < 24

    self.set_max_evalue(evalue_cutoff)
    regenerated = regenerate()
    self.set_max_evalue(self.original_max_evalue)
    assert sorted(regenerated.hits, key=by_evalue) == expected_hits
    self.check_add_to_record(nisin, results)

    # test regeneration when score threshold is more restrictive
    score_cutoff = sorted(by_score(hit) for hit in results.hits)[12]
    assert score_cutoff > cluster_hmmer.MIN_SCORE
    expected_hits = sorted((hit for hit in results.hits if by_score(hit) >= score_cutoff),
                           key=by_score)
    assert len(expected_hits) < 24

    self.set_min_score(score_cutoff)
    regenerated = regenerate()
    self.set_min_score(self.original_min_score)
    assert sorted(regenerated.hits, key=by_score) == expected_hits
    self.check_add_to_record(nisin, results)
def test_nisin_fasta_gff(self):
    # with a GFF3 file given alongside the FASTA, genefinding is expected
    # to stay unused while the record still ends up with 11 CDS features
    fasta_path = path.get_full_path(__file__, "data", "nisin.fasta")
    gff_path = path.get_full_path(__file__, "data", "nisin.gff3")
    config.update_config({"genefinding_gff3": gff_path})
    parsed = record_processing.parse_input_sequence(fasta_path, gff_file=gff_path)
    record_processing.pre_process_sequences(parsed, self.options, self.genefinding)
    assert not self.genefinding.was_run
    cds_count = len(parsed[0].get_cds_features())
    assert cds_count == 11
def test_fumigatus_cluster(self):
    # a fungal FASTA input has no features until pre-processing has run
    fasta_path = self.data_file('fumigatus.cluster1.fna')
    unannotated = parse_input_sequence(fasta_path, taxon="fungi")[0]
    assert unannotated.get_feature_count() == 0
    processed = pre_process_sequences([unannotated], self.options, genefinding)
    annotated = processed[0]
    assert annotated.get_feature_count() == 11
    # and make sure they're all CDS features
    cds_count = len(annotated.get_cds_features())
    assert cds_count == 11
def test_shotgun(self):
    # pre-processing the records parsed from wgs.gbk is expected to be
    # rejected as an input error
    wgs_path = path.get_full_path(__file__, "data", "wgs.gbk")
    wgs_records = record_processing.parse_input_sequence(wgs_path)
    expected_message = "incomplete whole genome shotgun records are not supported"
    with self.assertRaisesRegex(AntismashInputError, expected_message):
        record_processing.pre_process_sequences(wgs_records, self.options, self.genefinding)
def setUp(self):
    """ Parses the test genbank file and adds a single T2PKS protocluster
        spanning the whole record, then builds the candidate clusters and
        regions from it
    """
    genbank_path = path.get_full_path(__file__, 'data', 'NC_003888.3.cluster011.gbk')
    self.record = record_processing.parse_input_sequence(genbank_path)[0]

    record_length = len(self.record.seq)
    # core and surrounding locations both cover the full record, but are
    # kept as separate location objects
    core = FeatureLocation(0, record_length)
    surrounds = FeatureLocation(0, record_length)
    self.cluster = Protocluster(
        core,
        surrounding_location=surrounds,
        cutoff=20,
        neighbourhood_range=0,
        tool="test",
        product="T2PKS",
        detection_rule="dummy rule",
    )

    self.record.add_protocluster(self.cluster)
    self.record.create_candidate_clusters()
    self.record.create_regions()
def test_minimum_length(self):
    # the nisin record has 15016 bases, so it should be kept for any
    # minimum length up to that size and dropped for anything larger
    nisin_path = helpers.get_path_to_nisin_genbank()
    records = record_processing.parse_input_sequence(nisin_path, minimum_length=-16)
    assert len(records) == 1

    records = record_processing.parse_input_sequence(nisin_path, minimum_length=15016)
    assert len(records) == 1

    records = record_processing.parse_input_sequence(nisin_path, minimum_length=15017)
    assert not records

    # anything that isn't an int has to be rejected
    for bad_len in [5.6, None, "5"]:
        with self.assertRaisesRegex(TypeError, "minimum_length must be an int"):
            # use the real input file and not the 'path' module, so that the
            # minimum length is the only invalid argument
            record_processing.parse_input_sequence(nisin_path, minimum_length=bad_len)
def test_records_with_bad_names(self):
    # reuse fumigatus and change the id to bad ids
    bad_ids = (
        ".",  # changes due to glimmerhmm
        "-bad",  # could cause a fasta file to be created that is interpreted as an arg
    )
    for bad_id in bad_ids:
        fasta_path = self.data_file('fumigatus.cluster1.fna')
        renamed = parse_input_sequence(fasta_path, taxon="fungi")[0]
        renamed.id = bad_id
        processed = pre_process_sequences([renamed], self.options, genefinding)
        # pre-processing should still have given the record CDS features
        assert processed[0].get_cds_features()
def test_multiple_biosynthetic_enzymes(self):
    # a single hybrid cluster is expected to give motifs of two different
    # lanthipeptide classes, one for each of the two loci
    genbank_path = path.get_full_path(__file__, 'data', 'CP013129.1.section.gbk')
    record = record_processing.parse_input_sequence(genbank_path, taxon="bacteria")[0]
    record.clear_cds_motifs()
    assert record.get_cluster(0).products == ("lanthipeptide", "nrps")
    assert record.get_cluster(0).cds_children

    analysis = run_specific_analysis(record)
    assert len(analysis.clusters) == 1
    assert analysis.clusters[1] == {"AQF52_7190", "AQF52_7168"}

    class_two = analysis.motifs_by_locus["AQF52_7190"][0]
    assert class_two.peptide_subclass == "Class II"
    class_three = analysis.motifs_by_locus["AQF52_7168"][0]
    assert class_three.peptide_subclass == "Class III"
def read_data(sequence_file: Optional[str], options: ConfigType) -> serialiser.AntismashResults:
    """ Reads in the data to be used in the analysis run. Can be provided as
        a sequence file (fasta/genbank) or as a file of prior results.
        In both cases the name of the input file, without its extension, is
        stored in the config as "input_file".

        Arguments:
            sequence_file: A fasta/genbank file to read (or None)
            options: An antismash Config instance

        Returns:
            a AntismashResults instance, populated only if reusing results

        Raises:
            ValueError: if neither a sequence file nor prior results are
                        given, or if the file of prior results is empty
    """
    if not sequence_file and not options.reuse_results:
        raise ValueError("No sequence file or prior results to read")

    if sequence_file:
        records = record_processing.parse_input_sequence(
            sequence_file, options.taxon, options.minlength, options.start,
            options.end, gff_file=options.genefinding_gff3)
        # only the final path component is kept as the input name, with one
        # empty results dictionary for each parsed record
        results = serialiser.AntismashResults(sequence_file.rsplit(os.sep, 1)[-1],
                                              records, [{} for _ in records],
                                              __version__, taxon=options.taxon)
    else:
        logging.debug("Attempting to reuse previous results in: %s", options.reuse_results)
        with open(options.reuse_results) as handle:
            contents = handle.read()
        # an empty file can't hold any results
        if not contents:
            raise ValueError("No results contained in file: %s" % options.reuse_results)
        results = serialiser.AntismashResults.from_file(options.reuse_results)
        for record in results.records:
            record.strip_antismash_annotations()
        # the taxon of the prior results overrides a differing option
        if options.taxon != results.taxon:
            logging.info("Reusing taxon %s from prior results", results.taxon)
            update_config({"taxon": results.taxon})

    # splitext() returns (root, extension), the root is what's wanted here,
    # regardless of where the results came from
    update_config({"input_file": os.path.splitext(results.input_file)[0]})
    return results
def test_regeneration(self):
    # checks the regenerated results look sane and that rerunning with
    # existing results hands back those same results
    datafile = helpers.get_path_to_balhymicin_genbank()
    results = helpers.run_and_regenerate_results_for_module(
        datafile, active_site_finder, self.options)
    assert results.pairings
    for domain, labels in results.pairings:
        for label in labels:
            assert label
            assert isinstance(label, str)
        assert isinstance(domain, secmet.AntismashDomain)

    # parsing returns a sequence of records, while the module runs on a
    # single record, so take the first one
    record = parse_input_sequence(datafile)[0]

    # check the reuse portion works
    rerun = active_site_finder.run_on_record(record, results, self.options)
    assert rerun is results  # specifically checking it's the same object

    # results of the wrong type are expected to be rejected
    with self.assertRaisesRegex(AssertionError, "str"):
        active_site_finder.run_on_record(record, "invalid", self.options)
def test_nisin(self):
    # running the module must not alter the record, the 174 features are
    # only expected in the record once the results have been added to it
    nisin_path = helpers.get_path_to_nisin_with_detection()
    record = parse_input_sequence(nisin_path)[0]
    clusters = record.get_clusters()
    assert clusters
    for cluster in clusters:
        assert cluster.cds_children
    assert record.get_cds_features_within_clusters()

    features_before = record.get_feature_count()

    assert tta.check_prereqs() == []
    assert tta.check_options(self.options) == []
    assert tta.is_enabled(self.options)

    # there are no earlier results to build on
    tta_results = tta.run_on_record(record, None, self.options)
    assert isinstance(tta_results, ModuleResults)
    assert len(tta_results.features) == 174
    assert record.get_feature_count() == features_before

    tta_results.add_to_record(record)
    features_after = record.get_feature_count()
    assert features_after == features_before + 174
def test_nisin_fasta_only(self):
    # checks when genefinding is triggered for an input parsed from FASTA,
    # which has no CDS features after parsing
    def preprocess():
        record_processing.pre_process_sequences(records, self.options, self.genefinding)

    config.update_config({"genefinding_tool": "none"})
    fasta_path = path.get_full_path(__file__, "data", "nisin.fasta")
    records = record_processing.parse_input_sequence(fasta_path)
    assert len(records) == 1
    assert not records[0].get_cds_features()

    # make sure genefinding wasn't run with default options
    preprocess()
    assert not self.genefinding.was_run
    assert not records[0].get_cds_features()

    # make sure genefinding was run when not 'none'
    records[0].skip = False
    config.update_config({"genefinding_tool": "not-none"})
    preprocess()
    assert self.genefinding.was_run
    # still no features because we used dummy genefinding
    assert not records[0].get_cds_features()
def setUp(self):
    """ Parses the test genbank file, adds a single T2PKS cluster spanning
        the whole record, builds the superclusters and regions from it, then
        mocks both the hmmscan and the starter unit blastp steps of the
        T2PKS analysis with fixed results
    """
    def single_hit(hit_id, start, end, evalue, bitscore):
        # each gene in the mocked results has exactly one hit
        return [HMMResult(hit_id, start, end, evalue=evalue, bitscore=bitscore)]

    genbank_path = path.get_full_path(__file__, 'data', 'NC_003888.3.cluster011.gbk')
    self.record = record_processing.parse_input_sequence(genbank_path)[0]

    record_length = len(self.record.seq)
    # core and surrounding locations both cover the full record, but are
    # kept as separate location objects
    core = FeatureLocation(0, record_length)
    surrounds = FeatureLocation(0, record_length)
    self.cluster = Cluster(
        core,
        surrounding_location=surrounds,
        cutoff=20,
        neighbourhood_range=0,
        tool="test",
        product="T2PKS",
        detection_rule="dummy rule",
    )
    self.record.add_cluster(self.cluster)
    self.record.create_superclusters()
    self.record.create_regions()

    hmm_results = {
        'SCO5072': single_hit("KR", 1, 265, 3.1e-49, 159.4),
        'SCO5079': single_hit("DIMER", 4, 293, 8.7e-131, 426.8),
        'SCO5080': single_hit("OXY", 8, 377, 2.1e-14, 44.7),
        'SCO5086': single_hit("KR_C9", 0, 261, 1.9e-134, 438.4),
        'SCO5087': single_hit("KS", 44, 463, 3.5e-234, 768.6),
        'SCO5088': single_hit("CLF_7", 1, 401, 1.2e-226, 743.5),
        'SCO5089': single_hit("ACP", 4, 86, 5e-36, 114.2),
        'SCO5090': single_hit("CYC_C7-C12", 1, 312, 7.8e-124, 404),
        'SCO5091': single_hit("CYC_C5-C14", 3, 297, 4.4e-143, 467.3),
        'SCO5094': single_hit("MET", 40, 155, 9.8e-11, 32.7),
        'SCO5097': single_hit("KR", 3, 247, 3.3e-40, 129.8),
    }

    mock("t2pks_analysis.run_t2pks_hmmscan", returns=hmm_results)
    mock("t2pks_analysis.run_starter_unit_blastp", returns={})
def test_add_to_record(self):
    # the gene ontologies are expected on the domain only after the results
    # have been added to the record
    nisin_path = helpers.get_path_to_nisin_genbank()
    nisin_record = record_processing.parse_input_sequence(nisin_path)[0]
    assert not nisin_record.get_pfam_domains()

    # add a test PFAM
    test_domain = helpers.DummyPFAMDomain(identifier="PF00005", domain="PF00005")
    nisin_record.add_pfam_domain(test_domain)
    assert len(nisin_record.get_pfam_domains()) == 1

    # run pfam2go and add the results
    go_results = pfam2go.run_on_record(nisin_record, None, self.options)
    assert test_domain in go_results.pfam_domains_with_gos
    assert not test_domain.gene_ontologies
    go_results.add_to_record(nisin_record)
    assert test_domain.gene_ontologies

    # check the contents of the annotation
    for annotated in nisin_record.get_pfam_domains():
        assert annotated.gene_ontologies
        found_ids = sorted(annotated.gene_ontologies.ids)
        assert found_ids == sorted(go_results.get_all_gos(annotated))
def check_add_to_record(self, input_file, results):
    """ Checks that adding the results to a freshly parsed record, which
        must start without PFAM domains, creates one PFAM domain per hit

        Arguments:
            input_file: the path of the sequence file to parse
            results: the results to add, providing add_to_record() and hits

        Returns:
            None
    """
    fresh = record_processing.parse_input_sequence(input_file)[0]
    assert not fresh.get_pfam_domains()
    results.add_to_record(fresh)
    domain_count = len(fresh.get_pfam_domains())
    assert domain_count == len(results.hits)
def test_empty(self):
    # parsing a newly created, and so empty, genbank file is expected to
    # be reported as an input error
    expected_message = "no valid records found"
    with NamedTemporaryFile(suffix=".gbk") as empty_file, \
            self.assertRaisesRegex(AntismashInputError, expected_message):
        record_processing.parse_input_sequence(empty_file.name)
def test_nonexistant(self):
    # a path to a missing file is expected to be reported as an input error
    missing_file = "does_not_exist.gbk"
    expectation = self.assertRaisesRegex(AntismashInputError, "No such file or directory")
    with expectation:
        record_processing.parse_input_sequence(missing_file)
def read_nisin(self):
    """ Parses the nisin genbank file and checks that exactly one record
        was read

        Returns:
            the parsed records
    """
    nisin_path = helpers.get_path_to_nisin_genbank()
    parsed = record_processing.parse_input_sequence(nisin_path)
    assert len(parsed) == 1
    return parsed
def test_empty(self):
    # parsing a newly created, and so empty, genbank file is expected to
    # raise a RuntimeError
    expected_message = "No records could be read from file"
    with NamedTemporaryFile(suffix=".gbk") as empty_file, \
            self.assertRaisesRegex(RuntimeError, expected_message):
        record_processing.parse_input_sequence(empty_file.name)
def test_nonexistant(self):
    # a path to a missing file is expected to raise a ValueError
    missing_file = "does_not_exist.gbk"
    expectation = self.assertRaisesRegex(ValueError, "Sequence file not found: .*")
    with expectation:
        record_processing.parse_input_sequence(missing_file)
def check_add_to_record(self, input_file, results):
    """ Checks that adding the results to a freshly parsed record, which
        must start without domains from the "tigrfam" tool, creates one such
        domain per hit

        Arguments:
            input_file: the path of the sequence file to parse
            results: the results to add, providing add_to_record() and hits

        Returns:
            None
    """
    tool = "tigrfam"
    fresh = record_processing.parse_input_sequence(input_file)[0]
    assert not fresh.get_antismash_domains_by_tool(tool)
    results.add_to_record(fresh)
    domain_count = len(fresh.get_antismash_domains_by_tool(tool))
    assert domain_count == len(results.hits)