def test_check_prereqs_missing_executables(self):
    """ Prerequisite checking must raise when the config holds no executables """
    options = build_config(
        ["--check-prereqs"],
        isolated=True,
        modules=get_all_modules(),
    )
    # set the config's executables to an empty namespace
    update_config({"executables": Namespace()})
    mock("antismash.config.get_config", returns=options)

    # the attribute has to exist, but must not contain anything
    assert hasattr(get_config(), "executables")
    assert not get_config().executables.__dict__

    with self.assertRaisesRegex(RuntimeError, "failing prereq"):
        antismash.main.check_prerequisites(get_all_modules(), options)
def run_diamond(subcommand: str, opts: Optional[List[str]] = None) -> RunResult:
    """ Runs the given diamond subcommand with the configured diamond
        executable and CPU count, using a temporary directory as diamond's
        --tmpdir.

        Arguments:
            subcommand: the diamond subcommand to run
            opts: a list of additional argument strings to pass to diamond

        Returns:
            the RunResult of running diamond

        Raises:
            RuntimeError: if the execution was not successful
    """
    options = get_config()
    with TemporaryDirectory() as scratch_dir:
        command = [
            options.cb_diamond_executable,
            subcommand,
            "--threads", str(options.cpus),
            "--tmpdir", scratch_dir,
        ]
        command.extend(opts or [])
        # diamond has to run while the temporary directory still exists
        outcome = execute(command)

    if outcome.successful():
        return outcome
    # only the tail of stderr is reported, to keep the message short
    raise RuntimeError("diamond failed to run: %s -> %s" % (subcommand, outcome.stderr[-100:]))
def store_percentage_identities(seq_record):
    """ Annotates CDS features with their percentage identities to the other
        members of their CD-HIT group.

        For each cluster feature of the record, the CDS features carrying a
        'sec_met' qualifier are grouped via utils.get_cdhit_table(), using the
        configured cdh_display_cutoff. For every group with more than one gene,
        each member gets a "Percentage identity: ..." entry in its 'sec_met'
        qualifier, replacing any such entry already present.

        Arguments:
            seq_record: the record containing the cluster and CDS features

        Returns:
            None
    """
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [feature
                    for feature in utils.get_cluster_cds_features(cluster, seq_record)
                    if 'sec_met' in feature.qualifiers]
        cdhit_table, _ = utils.get_cdhit_table(features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            genes = cdhit_cluster["genes"]
            # a group of one has nothing to be compared with
            if len(genes) <= 1:
                continue
            cl_features = [feature for feature in features
                           if utils.get_gene_id(feature) in genes]
            pct_table = utils.get_pct_identity_table(cl_features)
            for cds in cl_features:
                identities = pct_table[utils.get_gene_id(cds)]
                result = ",".join("%s=%s" % (othercds, identity)
                                  for othercds, identity in identities.items())
                annotations = cds.qualifiers['sec_met']
                # Drop stale entries in place. The previous 'del ann' inside a
                # loop only unbound the loop variable and never touched the
                # list, so old entries piled up.
                annotations[:] = [ann for ann in annotations
                                  if not ann.startswith("Percentage identity")]
                annotations.append("Percentage identity: %s" % (result))
def create_rules_dict(enabled_clustertypes):
    """ Create a cluster rules dictionary from the cluster rules file(s)

        One "cluster_rules.txt" is read per enabled detection model: the
        "default" model's file sits next to this module, any other model's
        file sits in a subdirectory named after the model and its rule names
        are prefixed with "<model>/".

        Each tab-separated line is read as: name, rules, cutoff, extension.
        Cutoff and extension are multiplied by 1000 and by the configured
        cutoff_multiplier, then truncated to int.

        Arguments:
            enabled_clustertypes: a container of the (prefixed) rule names to
                                  keep, all other rules are ignored

        Returns:
            a dictionary mapping rule name to a tuple of
            (rules, cutoff, extension)
    """
    rulesdict = {}
    # NOTE: the legend flag is shared by all files, so only the first line of
    # the first file read is unconditionally skipped (original behaviour kept)
    first = True
    cfg = config.get_config()
    for hmm_model in cfg.enabled_detection_models:
        dir_path = path.dirname(path.abspath(__file__))
        prefix = ""
        if hmm_model != "default":
            dir_path = path.join(dir_path, hmm_model)
            prefix = hmm_model + "/"
        # TODO: We should move all user-customizable files into config subdirectory;
        # the rulefiles are redundant also in hmm_detection_dblookup
        # the context manager closes the handle, which was previously leaked
        with open(path.join(dir_path, "cluster_rules.txt"), "r") as handle:
            for line in handle:
                # skip the first line with the legend
                if first:
                    first = False
                    continue
                parts = line.split('\t')
                if len(parts) < 3:
                    continue
                key = prefix + parts[0]
                if key not in enabled_clustertypes:
                    continue
                rules = parts[1]
                cutoff = int(float(parts[2]) * 1000.00 * cfg.cutoff_multiplier)
                extension = int(float(parts[3]) * 1000.00 * cfg.cutoff_multiplier)
                rulesdict[key] = (rules, cutoff, extension)
    return rulesdict
def test_canonical_base_filename(self):
    """ The canonical base filename must not depend on the input's directory
        or extension, and must respect --output-basename when given
    """
    # without an explicit basename, the input's name is used
    options = build_parser(modules=self.all_modules).parse_args([])
    expected = os.path.join("out", "foo.1_example")
    res = main.canonical_base_filename("foo.1_example.gbk", "out", options)
    assert res == expected
    assert get_config().output_basename == os.path.basename(expected)
    for filename in ("/some/long/path/foo.1_example.gbff",
                     "foo.1_example.fa",
                     "foo.1_example.gbff.gz"):
        res = main.canonical_base_filename(filename, "out", options)
        assert res == expected

    # with an explicit basename, that is used regardless of the input
    options = build_parser(modules=self.all_modules).parse_args(
        ["--output-basename", "foo.1"])
    expected = os.path.join("out", "foo.1")
    for filename in ("foo.1_example.gbk",
                     "foo.1_example.gbff",
                     "foo.1_example.fa",
                     "foo.1_example.gbff.gz"):
        res = main.canonical_base_filename(filename, "out", options)
        assert res == expected
def __init__(self, cluster_feature: secmet.Cluster,
             ranking: List[Tuple[ReferenceCluster, Score]],
             reference_proteins: Dict[str, Protein], prefix: str) -> None:
    """ Builds the query cluster, the colour lookup and the hit clusters
        for the given cluster feature and its ranked reference clusters.

        Arguments:
            cluster_feature: the cluster feature the hits are for
            ranking: pairs of reference cluster and its score
            reference_proteins: passed on to Cluster.from_reference_cluster(),
                                must not be empty if a ranking is given
            prefix: stored unchanged on the instance
    """
    if ranking:
        assert reference_proteins
    self.prefix = prefix
    self.query_cluster = QueryCluster(cluster_feature)
    query_cluster_number = cluster_feature.get_cluster_number()
    cluster_limit = get_config().cb_nclusters
    self.colour_lookup = build_colour_groups(list(cluster_feature.cds_children),
                                             ranking[:cluster_limit])
    self.hits = []  # type: List[Cluster]
    record_prefix = cluster_feature.parent_record.id.split(".", 1)[0]
    queries = set()
    for reference, score in ranking:
        # skip references whose accession prefix matches the record's id prefix
        if reference.accession.split("_", 1)[0] == record_prefix:
            continue
        # determine overall strand direction of hits
        strand = determine_strand_of_cluster(cluster_feature, score.scored_pairings)
        hit_genes = set()
        for query, subject in score.scored_pairings:
            queries.add(query.id)
            hit_genes.add(subject.name)
        # the rank passed on is 1-based, counting only the hits kept so far
        self.hits.append(Cluster.from_reference_cluster(reference, query_cluster_number,
                                                        score, reference_proteins,
                                                        len(self.hits) + 1,
                                                        len(hit_genes), strand))
        # obey the cluster display limit from options
        if len(self.hits) >= cluster_limit:
            break
    self.max_length = self._size_of_largest_cluster()
    self._organise_strands()
def prepare_output_directory(name: str, input_file: str) -> None:
    """ Ensure the output directory exists and is usable.

        If no name is given, the absolute path of the input file's canonical
        base name is used and stored in the config as "output_dir".
        An existing directory has its "*.region???.gbk" files removed, a
        missing directory is created.

        Arguments:
            name: the path of the directory
            input_file: the path of the input file

        Returns:
            None

        Raises:
            RuntimeError: if the path exists and is not a directory, or if the
                          input is not a ".json" file and the directory
                          contains entries not ignored by _ignore_patterns()
    """
    # if not supplied, set the output directory to be the sequence name
    input_prefix = os.path.basename(canonical_base_filename(input_file, "", get_config()))
    if not name:
        name = os.path.abspath(input_prefix)
        update_config({"output_dir": name})

    if not os.path.exists(name):
        logging.debug("Creating output directory: %s", name)
        os.mkdir(name)
        return

    if not os.path.isdir(name):
        raise RuntimeError("Output directory %s exists and is not a directory" % name)

    # not empty (apart from a possible input dir), and not reusing its results
    reusing = input_file.endswith(".json")
    if not reusing and any(_ignore_patterns(entry)
                           for entry in glob.glob(os.path.join(name, "*"))):
        raise RuntimeError("Output directory contains other files, aborting for safety")

    # --reuse
    logging.debug("Removing existing region genbank files")
    for stale_genbank in glob.glob(os.path.join(name, "*.region???.gbk")):
        os.remove(stale_genbank)
    logging.debug("Reusing output directory: %s", name)
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Prepare the databases.

        The known-cluster data is handled by prepare_known_data(). The general
        clusterblast data in "<database_dir>/clusterblast" is then checked,
        unless that path contains "mounted_at_runtime".

        Arguments:
            logging_only: passed through to prepare_known_data() and
                          check_clusterblast_files()

        Returns:
            a list of failure messages, empty if nothing failed
    """
    # known
    failure_messages = list(prepare_known_data(logging_only))

    # general
    clusterblastdir = os.path.join(get_config().database_dir, "clusterblast")
    if "mounted_at_runtime" in clusterblastdir:  # can't prepare these
        return failure_messages

    cluster_defs = os.path.join(clusterblastdir, 'clusters.txt')
    protein_seqs = os.path.join(clusterblastdir, "proteins.fasta")
    db_file = os.path.join(clusterblastdir, "proteins.dmnd")

    # check the DBv3 region info exists instead of single cluster numbers
    with open(protein_seqs) as handle:
        sample = handle.readline()
    fields = sample.split("|", 3)
    # an empty or malformed first line has no second field at all, which
    # previously raised an IndexError instead of reporting a failure
    if len(fields) < 2 or "-" not in fields[1]:
        failure_messages.append("clusterblast database out of date, update with download-databases")
        # and don't bother pressing them
        return failure_messages

    failure_messages.extend(check_clusterblast_files(cluster_defs, protein_seqs, db_file,
                                                     logging_only=logging_only))
    return failure_messages
def setUp(self):
    """ Builds an isolated minimal config with TTA enabled and a threshold of 0 """
    args = ["--minimal", "--enable-tta", "--tta-threshold", "0"]
    built = build_config(args, isolated=True, modules=antismash.get_all_modules())
    # keep hold of the current config's attribute dict before updating it
    self.old_config = get_config().__dict__
    self.options = update_config(built)
def run_hmmpfam2(query_hmmfile: str, target_sequence: str) -> List:  # TODO cleanup
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO

        Raises:
            RuntimeError: if hmmpfam2 did not run successfully
    """
    config = get_config()
    command = ["hmmpfam2", "--cpu", str(config.cpus), query_hmmfile, '-']

    # Allow to disable multithreading for HMMer2 calls in the command line
    # TODO fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            not config.hmmer2.multithreading:
        # drop the "--cpu <n>" pair from the command
        command = command[0:1] + command[3:]

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r',
                      result.return_code, result.stderr, query_hmmfile)
        # exceptions don't interpolate logging-style arguments, so the message
        # has to be formatted explicitly
        raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))

    res_stream = StringIO(result.stdout)
    results = list(SearchIO.parse(res_stream, 'hmmer2-text'))
    return results
def run_hmmpfam2(query_hmmfile: str, target_sequence: str, extra_args: List[str] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Runs hmmpfam2 with the given HMM file, feeding the fasta input via stdin

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run
            extra_args: any further arguments to pass to hmmpfam2,
                        placed before the HMM file argument

        Returns:
            a list of results as parsed by SearchIO

        Raises:
            RuntimeError: if hmmpfam2 did not run successfully
    """
    config = get_config()

    # Allow to disable multithreading for HMMer2 calls in the command line
    # TODO fix options for this
    multithreaded = (config.get('hmmer2')
                     and 'multithreading' in config.hmmer2
                     and config.hmmer2.multithreading)

    command = ["hmmpfam2"]
    if multithreaded:
        command += ["--cpu", str(config.cpus)]
    if extra_args:
        command += extra_args
    command += [query_hmmfile, '-']

    result = execute(command, stdin=target_sequence)
    if result.successful():
        return list(SearchIO.parse(StringIO(result.stdout), 'hmmer2-text'))

    logging.debug('hmmpfam2 returned %d: %r while searching %r',
                  result.return_code, result.stderr, query_hmmfile)
    raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: List[str] = None, results_file: str = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Runs blastp with a single sequence (via stdin) against a database,
        returning the results as parsed by Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO

        Raises:
            ValueError: if the query sequence is empty
            RuntimeError: if blastp did not run successfully
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    command = [
        "blastp",
        "-num_threads", str(get_config().cpus),
        "-db", target_blastp_database,
    ]
    if opts is not None:
        command.extend(opts)

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        # newlines are stripped from stderr and the query is truncated
        details = (result.return_code, result.stderr.replace("\n", ""), query_sequence[:100])
        raise RuntimeError('blastp returned %d: %r while scanning %r' % details)

    output = result.stdout
    if results_file is not None:
        with open(results_file, 'w') as fh:
            fh.write(output)

    return list(SearchIO.parse(StringIO(output), 'blast-text'))
def ensure_database_pressed(filepath: str, return_not_raise: bool = False) -> List[str]:
    """ Runs hmmpress on the given HMMer database if any of the hmmpress
        generated files (.h3f, .h3i, .h3m, .h3p) are reported as outdated.

        Arguments:
            filepath: the path to the HMMer database
            return_not_raise: whether to return error messages as strings
                              instead of raising them

        Returns:
            any encountered error messages, will never be populated without
            return_not_raise == True

        Raises:
            RuntimeError: if pressing was required and failed, unless
                          return_not_raise is set
    """
    components = ["%s%s" % (filepath, ext) for ext in ('.h3f', '.h3i', '.h3m', '.h3p')]
    if not path.is_outdated(components, filepath):
        return []

    logging.info("%s components missing or obsolete, re-pressing database", filepath)

    problem = None
    if "hmmpress" not in get_config().executables:
        problem = "Failed to hmmpress {!r}: cannot find executable for hmmpress".format(filepath)
    else:
        result = subprocessing.run_hmmpress(filepath)
        if not result.successful():
            problem = "Failed to hmmpress {!r}: {}".format(filepath, result.stderr)

    if problem is None:
        return []
    if return_not_raise:
        return [problem]
    raise RuntimeError(problem)
def ensure_cds_info(genefinding: Callable[[Record, Any], None], sequence: Record) -> Record:
    """ Ensures the given record has CDS features, running gene finding on it
        if it has none, no GFF3 file is configured, and the configured
        genefinding tool is not "none".

        Records still without CDS features afterwards have their skip flag set.
        Records already marked as skipped are returned untouched.

        Arguments:
            genefinding: the relevant run_on_record(record, options) function to
                         use for finding genes if no GFF file being used
            sequence: the Record instance to ensure CDS features for

        Returns:
            the Record instance provided
    """
    options = get_config()
    # nothing to do for skipped records or those already having CDS features
    if sequence.skip or sequence.get_cds_features():
        return sequence

    may_find_genes = not options.genefinding_gff3 and options.genefinding_tool != "none"
    if may_find_genes:
        logging.info("No CDS features found in record %r, running gene finding.", sequence.id)
        genefinding(sequence, options)

    if not sequence.get_cds_features():
        logging.info("No genes found, skipping record")
        sequence.skip = "No genes found"
    return sequence
def check_prereqs() -> List[str]:
    """ Checks that the required binaries can be located and that the
        required files exist in "<database_dir>/clusterblast", then runs the
        known-cluster and sub-cluster prerequisite checks.

        Returns:
            a list of failure messages, empty if nothing failed
    """
    options = get_config()
    # each entry is (name, optional)
    required_binaries = (
        ('blastp', False),
        ('makeblastdb', False),
        ('diamond', False),
    )
    required_files = (
        ('geneclusterprots.dmnd', False),
        ('geneclusterprots.fasta', False),
        ('geneclusters.txt', False),
    )
    clusterblastdir = os.path.join(options.database_dir, "clusterblast")

    failure_messages = [
        "Failed to locate file: %r" % binary_name
        for binary_name, optional in required_binaries
        if path.locate_executable(binary_name) is None and not optional
    ]
    failure_messages += [
        "Failed to locate file: %r" % file_name
        for file_name, optional in required_files
        if path.locate_file(os.path.join(clusterblastdir, file_name)) is None and not optional
    ]

    failure_messages += check_known_prereqs(options)
    failure_messages += check_sub_prereqs(options)
    return failure_messages
def test_classification_with_colon(self):
    """ SMCOG descriptions containing a colon must survive classification
        and annotation of the record
    """
    # since SMCOG id and description are stored in a string separated by :,
    # ensure that descriptions containing : are properly handled
    # test gene is AQF52_5530 from CP013129.1
    expected_hit = "SMCOG1212:sodium:dicarboxylate symporter"
    translation = (
        "MDTHQREEDPVAARRDRTHYLYLAVIGAVLLGIAVGFLAPGVAVELKPLGTGFVN"
        "LIKMMISPIIFCTIVLGVGSVRKAAKVGAVGGLALGYFLVMSTVALAIGLLVGNL"
        "LEPGSGLHLTKEIAEAGAKQAEGGGESTPDFLLGIIPTTFVSAFTEGEVLQTLLV"
        "ALLAGFALQAMGAAGEPVLRGIGHIQRLVFRILGMIMWVAPVGAFGAIAAVVGAT"
        "GAAALKSLAVIMIGFYLTCGLFVFVVLGAVLRLVAGINIWTLLRYLGREFLLILS"
        "TSSSESALPRLIAKMEHLGVSKPVVGITVPTGYSFNLDGTAIYLTMASLFVAEAM"
        "GDPLSIGEQISLLVFMIIASKGAAGVTGAGLATLAGGLQSHRPELVDGVGLIVGI"
        "DRFMSEARALTNFAGNAVATVLVGTWTKEIDKARVTEVLAGNIPFDEKTLVDDHA"
        "PVPVPDQRAEGGEEKARAGV"
    )
    cds = helpers.DummyCDS(0, len(translation))
    cds.translation = translation

    results = smcogs.classify("test", [cds], get_config())
    assert results.best_hits[cds.get_name()].hit_id == expected_hit

    record = helpers.DummyRecord(seq=translation)
    record.add_cds_feature(cds)
    record.add_protocluster(helpers.DummyProtocluster(0, len(translation)))

    # if multiple colons aren't handled right, this line will crash
    results.add_to_record(record)

    gene_functions = cds.gene_functions.get_by_tool("smcogs")
    assert len(gene_functions) == 1
    description = str(gene_functions[0])
    assert description.startswith("transport (smcogs) SMCOG1212:sodium:dicarboxylate symporter"
                                  " (Score: 416; E-value: 2.3e-126)")
def run_diamond(query_file: str, database_file: str, mode: str = "blastp",
                opts: Optional[List[str]] = None) -> str:
    """ Runs diamond in the given mode, comparing the query file to the
        database, with the configured CPU count and a temporary directory
        as diamond's --tmpdir.

        Arguments:
            query_file: the path of query sequence file
            database_file: the path of the database to compare to
            mode: the mode to use (defaults to blastp)
            opts: any extra options to pass to diamond

        Returns:
            the output from running diamond

        Raises:
            RuntimeError: if diamond did not run successfully
    """
    with TemporaryDirectory() as scratch_dir:
        command = ["diamond", mode]
        command += ["--db", database_file]
        command += ["--threads", str(get_config().cpus)]
        command += ["--query", query_file]
        command += ["--tmpdir", scratch_dir]
        command += opts or []
        # diamond has to run while the temporary directory still exists
        outcome = execute(command)

    if outcome.successful():
        return outcome.stdout
    # only the tail of stderr is reported, to keep the message short
    raise RuntimeError("diamond failed to run: %s -> %s" % (command, outcome.stderr[-100:]))
def parallel_execute(commands: List[List[str]], cpus: Optional[int] = None,
                     timeout: Optional[int] = None, verbose: bool = True) -> List[int]:
    """ Runs the given commands through a multiprocessing pool.
        Limited return vals, only returns return codes.

        Arguments:
            commands: the commands to run, one argument list per command
            cpus: the size of the pool, if not given (or 0) the configured
                  CPU count is used
            timeout: the time in seconds to wait for all results,
                     or None to wait indefinitely
            verbose: whether to use verbose_child_process instead of
                     child_process as the runner

        Returns:
            the values returned by the runner, one per command

        Raises:
            RuntimeError: if the results weren't available within the timeout
    """
    runner = verbose_child_process if verbose else child_process
    os.setpgid(0, 0)

    pool_size = cpus or get_config().cpus
    assert isinstance(pool_size, int)

    pool = multiprocessing.Pool(pool_size)
    pending = pool.map_async(runner, commands)
    try:
        return_codes = pending.get(timeout=timeout)
    except multiprocessing.TimeoutError:
        pool.terminate()
        assert isinstance(timeout, int)
        raise RuntimeError("One of %d child processes timed out after %d seconds" % (
            pool_size, timeout))
    except KeyboardInterrupt:
        logging.error("Interrupted by user")
        pool.terminate()
        raise
    else:
        pool.close()
    return return_codes
def setUp(self):
    """ Builds an isolated config from the test's arguments and confirms that
        clusterblast is usable and enabled with it
    """
    built = build_config(self.get_args(), isolated=True, modules=get_all_modules())
    # keep hold of the current config's attribute dict before updating it
    self.old_config = get_config().__dict__
    self.options = update_config(built)

    # no missing prerequisites, no option conflicts, and the module is on
    assert clusterblast.check_prereqs(self.options) == []
    assert clusterblast.check_options(self.options) == []
    assert clusterblast.is_enabled(self.options)
def setUp(self):
    """ Locates the diamond test databases and builds an isolated default config """
    def data_file(filename):
        # the test data sits in the "data" directory relative to this file
        return path.get_full_path(__file__, "data", filename)

    self.format0_file = data_file("format0.dmnd")
    self.format1_file = data_file("format1.dmnd")
    self.empty = data_file("empty.dmnd")

    built = build_config([], isolated=True, modules=get_all_modules())
    # keep hold of the current config's attribute dict before updating it
    self.old_config = get_config().__dict__
    self.options = update_config(built)
def description_text(self) -> str:
    """ Returns the Region description.

        The description holds the location's start (plus one), its end and its
        total length, each formatted with thousands separators. If the config
        has cf_create_clusters set and the region has probabilities, those are
        appended.

        Returns:
            the description as a string
    """
    description_text = 'Location: {:,d} - {:,d} nt. (total: {:,d} nt)'.format(
        self.location.start + 1, self.location.end, len(self.location))
    if get_config().cf_create_clusters and self.probabilities:
        # the location text has no trailing whitespace, so without a separator
        # the two parts would run together as "...nt)ClusterFinder ..."
        description_text += ' ClusterFinder probabilities: %s. ' % self.probabilities
    return description_text
def description_text(self) -> str:
    """ Builds the Region description from the location's start (plus one)
        and end, followed by the ClusterFinder probabilities if the config has
        cf_create_clusters set and the region has probabilities.

        Returns:
            the description as a string
    """
    first_base = self.location.start + 1
    parts = ['Location: %s - %s nt. ' % (first_base, self.location.end)]
    if get_config().cf_create_clusters and self.probabilities:
        parts.append('ClusterFinder probabilities: %s. ' % self.probabilities)
    return "".join(parts)
def test_namespace_initialisation(self):
    """ Values given to update_config() in a namespace must be visible in
        both the returned config and any later get_config()
    """
    # test initialisation from namespace
    source = Namespace()
    source.taxon = 'fungi'
    updated = update_config(source)
    assert updated.taxon == 'fungi'

    # fetching the config again should keep the value
    assert get_config().taxon == 'fungi'
def load_clusterblast_database(seq_record, searchtype="general"):
    """ Loads the gene clusters and gene cluster proteins for the given
        search type.

        Arguments:
            seq_record: the record whose CDS features are used to build the
                        accession lookup passed to load_geneclusterproteins()
            searchtype: passed on to load_geneclusters() and
                        load_geneclusterproteins(), defaults to "general"

        Returns:
            a tuple of clusters, protein locations, protein strands,
            protein annotations, and protein tags
    """
    # maps utils.get_gene_acc(cds) to utils.get_gene_accession(cds);
    # the config was previously fetched here but never used
    accessiondict = {utils.get_gene_acc(cds): utils.get_gene_accession(cds)
                     for cds in utils.get_cds_features(seq_record)}
    clusters = load_geneclusters(searchtype)
    proteinlocations, proteinstrands, proteinannotations, proteintags = \
        load_geneclusterproteins(accessiondict, searchtype)
    return clusters, proteinlocations, proteinstrands, proteinannotations, proteintags
def _ignore_patterns(entry: str) -> bool:
    """ Filter function for the "outdir is empty" check.

        Arguments:
            entry: the path to check

        Returns:
            False if the entry is a directory ending in '/input' or is the
            configured logfile, otherwise True
    """
    config = get_config()
    is_input_dir = entry.endswith('/input') and os.path.isdir(entry)
    if is_input_dir:
        return False
    # anything other than the logfile counts
    return os.path.abspath(entry) != os.path.abspath(config.logfile)
def from_json(json: Dict[str, Any], record: Record) -> Optional["TTAResults"]:
    """ Rebuilds a TTAResults instance from its JSON representation.

        Arguments:
            json: the JSON representation of the results
            record: the original record analysed

        Returns:
            a TTAResults instance, or None if the schema version differs or if
            the old threshold was above the GC content while the current
            threshold is not
    """
    if json["schema_version"] != TTAResults.schema_version:
        return None

    options = get_config()
    results = TTAResults(json["record_id"], json["gc_content"], options.tta_threshold)

    # if old results were excluding based on too low a GC content, rerun
    if json["threshold"] > results.gc_content and options.tta_threshold <= results.gc_content:
        return None

    # the codons are only restored if the GC content reaches the current threshold
    if json["gc_content"] >= get_config().tta_threshold:
        for codon in json["TTA codons"]:
            results.new_feature_from_basics(codon["start"], codon["strand"])
    return results
def _size_of_largest_cluster(self) -> int:
    """ Finds the length of the largest of the query cluster and the hits,
        limited so that the query's share of that length is not below the
        configured cb_min_homology_scale.

        Returns:
            the length as an int
    """
    query_length = len(self.query_cluster)
    largest = max([query_length] + [len(hit) for hit in self.hits])
    min_scale = get_config().cb_min_homology_scale
    # if this would shrink the query too much, use the minimum allowed
    if query_length / largest < min_scale:
        return int(query_length / min_scale)
    return largest
def setUp(self):
    """ Builds an isolated config from the test's arguments, builds the nisin
        test record, and prepares the data
    """
    built = build_config(self.get_args(), isolated=True,
                         modules=antismash.get_all_modules())
    # keep hold of the current config's attribute dict before updating it
    self.old_config = get_config().__dict__
    self.options = update_config(built)

    nisin_path = helpers.get_path_to_nisin_with_detection()
    self.record = self.build_record(nisin_path)
    prepare_data()
def test_from_json_higher_bitscore(self):
    """ Raising the cutoff between the first and second hit's scores must
        leave only the first hit when results are regenerated from JSON
    """
    json = self.create_results().to_json()
    assert get_config().rre_cutoff == 25.

    raised_cutoff = 35.
    # the new cutoff has to separate the two hits
    assert self.hits[0].score > raised_cutoff
    assert self.hits[1].score < raised_cutoff
    update_config({"rre_cutoff": raised_cutoff})

    result = RREFinderResults.from_json(json, self.record)

    kept = self.hits[0]
    assert len(result.hits_by_cds) == 1
    assert result.hits_by_cds[kept.locus_tag] == [kept]
    assert len(result.hits_by_protocluster) == 1
    assert result.hits_by_protocluster[1] == [kept.locus_tag]
def test_from_json_higher_min_length(self):
    """ Raising the minimum length between the first and second hit's lengths
        must leave only the second hit when results are regenerated from JSON
    """
    json = self.create_results().to_json()
    assert get_config().rre_min_length == 50

    raised_minimum = 80
    # the new minimum has to separate the two hits
    assert len(self.hits[0]) < raised_minimum
    assert len(self.hits[1]) > raised_minimum
    update_config({"rre_min_length": raised_minimum})

    results = RREFinderResults.from_json(json, self.record)

    kept = self.hits[1]
    assert len(results.hits_by_cds) == 1
    assert results.hits_by_cds[kept.locus_tag] == [kept]
    assert len(results.hits_by_protocluster) == 1
    assert results.hits_by_protocluster[2] == [kept.locus_tag]