def test_limit_to_record_complete(self):
    records = self.read_double_nisin()
    config.update_config({"limit_to_record": "bad_id"})
    with self.assertRaisesRegex(AntismashInputError, "no sequences matched filter"):
        record_processing.pre_process_sequences(records, self.options, self.genefinding)
def test_nisin_fasta_only(self):
    config.update_config({"genefinding_tool": "none"})
    filepath = path.get_full_path(__file__, "data", "nisin.fasta")
    records = record_processing.parse_input_sequence(filepath)
    assert len(records) == 1
    assert not records[0].get_cds_features()
    # make sure genefinding wasn't run with default options
    with self.assertRaisesRegex(AntismashInputError, "all records skipped"):
        record_processing.pre_process_sequences(records, self.options, self.genefinding)
    assert not self.genefinding.was_run
    assert not records[0].get_cds_features()

    # make sure genefinding was run when not 'none'
    records[0].skip = False
    config.update_config({"genefinding_tool": "not-none"})
    # due to no genes actually being marked, it'll raise an error
    with self.assertRaisesRegex(AntismashInputError, "all records skipped"):
        record_processing.pre_process_sequences(records, self.options, self.genefinding)
    # but genefinding was still run
    assert self.genefinding.was_run
    # still no features because we used dummy genefinding
    for record in records:
        assert not record.get_cds_features()
        assert record.skip.lower() == "no genes found"
def test_shotgun(self):
    filepath = path.get_full_path(__file__, "data", "wgs.gbk")
    records = record_processing.parse_input_sequence(filepath)
    with self.assertRaisesRegex(AntismashInputError,
                                "incomplete whole genome shotgun records are not supported"):
        record_processing.pre_process_sequences(records, self.options, self.genefinding)
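
# For context: an "incomplete whole genome shotgun" record is a WGS master
# entry that only references its contigs rather than carrying real sequence.
# A rough, illustrative check follows; it's an assumption about how such
# records could be detected, not antiSMASH's own code. Biopython's GenBank
# parser is believed to expose the WGS/WGS_SCAFLD header lines via the
# record's annotations dictionary.
from Bio import SeqIO


def is_incomplete_wgs(record) -> bool:
    """ Returns True if the record looks like a WGS master record """
    return bool(record.annotations.get("wgs") or record.annotations.get("wgs_scafld"))

# usage sketch: fail fast instead of running genefinding on placeholder sequence
#   for wgs_record in SeqIO.parse("wgs.gbk", "genbank"):
#       if is_incomplete_wgs(wgs_record):
#           raise AntismashInputError("incomplete whole genome shotgun records are not supported")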
def test_limit_to_record_partial(self):
    records = self.read_double_nisin()
    assert all(rec.skip is None for rec in records)
    config.update_config({"limit_to_record": records[0].id})
    records[0].id += "_changed"
    record_processing.pre_process_sequences(records, self.options, self.genefinding)
    assert not records[1].skip
    assert records[0].skip.startswith("did not match filter")
def test_nisin_fasta_gff(self):
    fasta = path.get_full_path(__file__, "data", "nisin.fasta")
    gff = path.get_full_path(__file__, "data", "nisin.gff3")
    config.update_config({"genefinding_gff3": gff})
    records = record_processing.parse_input_sequence(fasta, gff_file=gff)
    record_processing.pre_process_sequences(records, self.options, self.genefinding)
    assert not self.genefinding.was_run
    assert len(records[0].get_cds_features()) == 11
def test_limit(self):
    records = self.read_double_nisin()
    assert all(rec.skip is None for rec in records)
    assert not self.options.triggered_limit
    config.update_config({"limit": 1})
    record_processing.pre_process_sequences(records, self.options, self.genefinding)
    assert records[0].skip is None
    assert records[1].skip.startswith("skipping all but first 1")
    assert self.options.triggered_limit
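
# The limiting behaviour tested above is simple to sketch: records beyond the
# limit are kept, not dropped, and only marked with a skip reason so later
# stages ignore them. This helper is illustrative only; its name and message
# are assumptions matched to the assertions above, not antiSMASH's code.
def apply_record_limit(records, limit: int) -> bool:
    """ Marks all records after the first `limit` as skipped,
        returning True if any record was marked """
    triggered = False
    for record in records[limit:]:
        record.skip = f"skipping all but first {limit} records"
        triggered = True
    return triggered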
def test_duplicate_record_ids(self):
    records = self.read_double_nisin()
    assert records[0].id == records[1].id
    records = record_processing.pre_process_sequences(records, self.options,
                                                      self.genefinding)
    assert len(records) == 2
    assert records[0].id != records[1].id
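
# A minimal sketch of the deduplication this test expects: shared ids are
# renamed with a numeric suffix so every id becomes unique. The exact renaming
# scheme is an assumption here; the test only asserts uniqueness.
from collections import Counter, defaultdict


def make_record_ids_unique(records) -> None:
    """ Renames records with duplicated ids by appending a counter """
    counts = Counter(record.id for record in records)
    seen = defaultdict(int)
    for record in records:
        if counts[record.id] < 2:
            continue  # already unique, leave untouched
        seen[record.id] += 1
        record.id = f"{record.id}_{seen[record.id]}"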
def test_nisin(self):
    record = parse_input_sequence(get_path_to_nisin_fasta())[0]
    assert record.get_feature_count() == 0
    record = pre_process_sequences([record], self.options, genefinding)[0]
    assert record.get_feature_count() == 12
    # and make sure they're all CDS features
    assert len(record.get_cds_features()) == 12
def test_fumigatus_cluster(self):
    record = parse_input_sequence(self.data_file('fumigatus.cluster1.fna'),
                                  taxon="fungi")[0]
    assert record.get_feature_count() == 0
    record = pre_process_sequences([record], self.options, genefinding)[0]
    assert record.get_feature_count() == 11
    # and make sure they're all CDS features
    assert len(record.get_cds_features()) == 11
def test_records_with_bad_names(self):
    # reuse fumigatus and change the id to bad ids
    for bad in [
        ".",  # changes due to glimmerhmm
        "-bad",  # could cause a fasta file to be created that is interpreted as an arg
    ]:
        record = parse_input_sequence(self.data_file('fumigatus.cluster1.fna'),
                                      taxon="fungi")[0]
        record.id = bad
        record = pre_process_sequences([record], self.options, genefinding)[0]
        assert record.get_cds_features()
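
# Ids like these are dangerous because they end up in filenames and FASTA
# headers handed to external binaries. A rough illustration of the kind of
# sanitising involved (a hypothetical helper, not antiSMASH's actual one):
import re


def sanitise_record_id(record_id: str) -> str:
    """ Replaces characters that external tools like glimmerhmm alter or
        that could be misread as a command line option """
    cleaned = re.sub(r"[^A-Za-z0-9_]", "_", record_id)
    return cleaned or "unknown"

# e.g. sanitise_record_id(".") == "_" and sanitise_record_id("-bad") == "_bad",
# so neither can be mistaken for an argument or mangled by the tool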
def _run_antismash(sequence_file: Optional[str], options: ConfigType) -> int:
    """ The real run_antismash, assumes logging is set up around it """
    logging.info("antiSMASH version: %s", options.version)
    _log_found_executables(options)

    detection_modules = get_detection_modules()
    analysis_modules = get_analysis_modules()
    output_modules = get_output_modules()
    modules = detection_modules + analysis_modules + output_modules

    if options.list_plugins:
        list_plugins(modules)
        return 0

    options.all_enabled_modules = list(filter(lambda x: x.is_enabled(options), modules))

    if options.check_prereqs_only:
        try:
            check_prerequisites(modules, options)
        except RuntimeError:
            print("Some module prerequisites not satisfied")
            return 1
        print("All prerequisites satisfied")
        return 0
    else:
        check_prerequisites(options.all_enabled_modules, options)

    # start up profiling if relevant
    if options.profile:
        profiler = cProfile.Profile()
        profiler.enable()

    # ensure the provided options are valid
    if not verify_options(options, options.all_enabled_modules):
        return 1

    # check that at least one module will run
    if not options.all_enabled_modules:
        raise ValueError("No detection or analysis modules enabled")

    start_time = datetime.now()

    results = read_data(sequence_file, options)

    # reset module timings
    results.timings_by_record.clear()

    prepare_output_directory(options.output_dir, sequence_file or options.reuse_results)

    results.records = record_processing.pre_process_sequences(results.records, options,
                                                              cast(AntismashModule, genefinding))
    for record, module_results in zip(results.records, results.results):
        # skip if we're not interested in it
        if record.skip:
            continue
        logging.info("Analysing record: %s", record.id)
        timings = run_detection(record, options, module_results)
        # and skip analysis if detection didn't find anything
        if not record.get_regions():
            continue
        analysis_timings = analyse_record(record, options, analysis_modules, module_results)
        timings.update(analysis_timings)
        results.timings_by_record[record.id] = timings

    # Write results
    json_filename = os.path.join(options.output_dir, results.input_file)
    json_filename = os.path.splitext(json_filename)[0] + ".json"
    logging.debug("Writing json results to '%s'", json_filename)
    results.write_to_file(json_filename)

    # now that the json is out of the way, annotate the record
    # otherwise we could double annotate some areas
    annotate_records(results)

    # create relevant output files
    write_outputs(results, options)

    # save profiling data
    if options.profile:
        profiler.disable()
        write_profiling_results(profiler,
                                os.path.join(options.output_dir, "profiling_results"))

    running_time = datetime.now() - start_time

    # display module runtimes before total time
    if options.debug:
        log_module_runtimes(results.timings_by_record)

    logging.debug("antiSMASH calculation finished at %s; runtime: %s",
                  datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str(running_time))
    logging.info("antiSMASH status: SUCCESS")
    return 0
def run_antismash(sequence_file: Optional[str], options) -> int:
    """ The complete antismash pipeline. Reads in data, runs detection and
        analysis modules over any records found, then outputs the results to
        file.

        Arguments:
            sequence_file: the sequence file to read in records from, can be
                           None if reusing results
            options: command line options as an argparse.Namespace

        Returns:
            0 if requested operations completed successfully, otherwise 1
            Exceptions may also be raised
    """
    logfile = options.logfile
    setup_logging(logfile=logfile, verbose=options.verbose, debug=options.debug)

    detection_modules = get_detection_modules()
    analysis_modules = get_analysis_modules()
    modules = detection_modules + analysis_modules

    if options.list_plugins:
        list_plugins(modules)
        return 0

    options.all_enabled_modules = [module for module in modules
                                   if module.is_enabled(options)]
    # converts from a namespace to an antismash.config.Config instance so
    # modules can't fiddle with it
    options = update_config(options)

    if options.check_prereqs_only:
        try:
            check_prerequisites(modules)
        except RuntimeError:
            print("Some module prerequisites not satisfied")
            return 1
        print("All prerequisites satisfied")
        return 0
    else:
        check_prerequisites(options.all_enabled_modules)

    # start up profiling if relevant
    if options.profile:
        profiler = cProfile.Profile()
        profiler.enable()

    # ensure the provided options are valid
    if not verify_options(options, options.all_enabled_modules):
        return 1  # TODO: change to a raise?

    # check that at least one module will run
    if not options.all_enabled_modules:
        raise ValueError("No detection or analysis modules enabled")

    start_time = datetime.now()

    results = read_data(sequence_file, options)

    # reset module timings
    results.timings_by_record = {}

    prepare_output_directory(options.output_dir, sequence_file or options.reuse_results)

    results.records = record_processing.pre_process_sequences(results.records, options,
                                                              genefinding)
    for seq_record, previous_result in zip(results.records, results.results):
        # skip if we're not interested in it
        if seq_record.skip:
            continue
        timings = run_detection(seq_record, options, previous_result)
        # and skip analysis if detection didn't find anything
        if not seq_record.get_clusters():
            continue
        analysis_timings = analyse_record(seq_record, options, analysis_modules,
                                          previous_result)
        timings.update(analysis_timings)
        results.timings_by_record[seq_record.id] = timings

    # Write results
    json_filename = os.path.join(options.output_dir, results.input_file)
    json_filename = os.path.splitext(json_filename)[0] + ".json"
    logging.debug("Writing json results to '%s'", json_filename)
    results.write_to_file(json_filename)

    # now that the json is out of the way, annotate the record
    # otherwise we could double annotate some areas
    annotate_records(results)

    # create relevant output files
    write_outputs(results, options)

    # save profiling data
    if options.profile:
        profiler.disable()
        write_profiling_results(profiler,
                                os.path.join(options.output_dir, "profiling_results"))

    running_time = datetime.now() - start_time

    # display module runtimes before total time
    if options.debug:
        log_module_runtimes(results.timings_by_record)

    logging.debug("antiSMASH calculation finished at %s; runtime: %s",
                  str(datetime.now()), str(running_time))
    logging.info("antiSMASH status: SUCCESS")
    return 0
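
# Both pipeline variants above wrap the run in the same stdlib cProfile
# enable/disable pattern. write_profiling_results itself isn't shown in this
# excerpt; the following is a self-contained sketch of what such a helper
# might do, using only the standard library (sorting by cumulative time is
# an assumption, not necessarily antiSMASH's choice):
import cProfile
import io
import pstats


def write_profiling_results(profiler: cProfile.Profile, target: str) -> None:
    """ Dumps human-readable stats from the given profiler to the given path """
    stream = io.StringIO()
    stats = pstats.Stats(profiler, stream=stream)
    stats.sort_stats("cumulative")  # assumption: most expensive call chains first
    stats.print_stats()
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(stream.getvalue())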
def run_on_records(self, records):
    record_processing.pre_process_sequences(records, self.options, self.genefinding)