def test_timeout(self):
    """A ten second sleep run with a one second timeout must raise
    a RuntimeError, and must do so in less than a second and a half."""
    expected_message = "Child process .* timed out after 1 second"
    began = time.time()
    with self.assertRaisesRegex(RuntimeError, expected_message):
        subprocessing.execute(["sleep", "10"], timeout=1)
    duration = time.time() - began
    # the child must not have been allowed to run to completion
    assert duration < 1.5
def test_redirection(self):
    """Output redirected to a file must not be readable from the result,
    while the stream that was not redirected stays accessible."""
    # stdout redirected: the handle is closed as soon as the command returns
    with open(os.devnull, "w") as sink:
        result = subprocessing.execute(["echo", "test"], stdout=sink)
    with self.assertRaisesRegex(ValueError, "stdout was redirected to file, unable to access"):
        result.stdout.strip()
    assert not result.stderr
    assert not result.return_code
    assert result.get_command_string() == "echo test"

    # stderr redirected, using a command that is expected to fail
    with open(os.devnull, "w") as sink:
        result = subprocessing.execute(["cat", "--bad-option"], stderr=sink)
    assert result.stdout.strip() == ""
    with self.assertRaisesRegex(ValueError, "stderr was redirected to file, unable to access"):
        assert result.stderr.startswith("cat: unrecognized")
    assert result.return_code
def test_piping(self):
    """Exercises stdout, stderr and stdin piping for a succeeding command,
    a failing command and a command that is fed its input."""
    # a command that succeeds and only writes to stdout
    listing = subprocessing.execute(["pwd"])
    assert listing.stdout.strip() == os.getcwd()
    assert listing.stderr.strip() == ""
    assert not listing.return_code
    assert listing.successful()

    # a command that fails and only writes to stderr
    failure = subprocessing.execute(["cat", "--bad-option"])
    assert failure.stdout.strip() == ""
    assert failure.stderr.startswith("cat: unrecognized")
    assert failure.return_code
    assert not failure.successful()

    # a command that echoes whatever is provided as stdin
    echoed = subprocessing.execute(["cat"], stdin="fish")
    assert echoed.stdout.strip() == "fish"
    assert not echoed.stderr
    assert not echoed.return_code
    assert echoed.successful()
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        Arguments:
            record: the Record to annotate, every gene reported by prodigal
                    is added to it as a CDSFeature
            options: antismash config, read for the optional prodigal basedir
                     and for the genefinding tool selected

        Returns:
            None

        Raises:
            RuntimeError: if the stderr output of prodigal contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        # the record id, minus any leading dashes, names the working files
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # meta mode is used on request and for sequences under 20000 in length
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])
        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        # the context manager ensures the result file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue
                try:
                    # a wrong number of fields is as malformatted as a bad
                    # coordinate, so the unpacking belongs inside the try
                    gene_id, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                    if prodigal_strand == "+":
                        strand = 1
                    else:
                        strand = -1
                except ValueError:
                    logging.error('Malformatted prodigal output line %r', line.rstrip())
                    continue

                # coordinates in the wrong order are swapped and forced to the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc, locus_tag='ctg%s_%s' % (record.record_index, gene_id),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
def _insert_missing_cutoffs(lines, cutoffs):
    """ Builds a copy of the given profile lines in which every profile with
        an accession present in `cutoffs` gains GA, TC and NC lines directly
        after its CKSUM line.

        Arguments:
            lines: the lines of the profile file, each with its line ending
            cutoffs: a dictionary mapping accession to the cutoff value to add

        Returns:
            a new list of lines
    """
    updated = []
    accession = None
    for line in lines:
        updated.append(line)
        if line.startswith("ACC"):
            parts = line.split()
            accession = parts[1] if len(parts) > 1 else None
        elif line.startswith("CKSUM") and accession in cutoffs:
            value = cutoffs[accession]
            for cutoff in ["GA", "TC", "NC"]:
                updated.append("%s %d.00 %d.00\n" % (cutoff, value, value))
            accession = None
        elif line.startswith("//"):
            # end of a profile record, the accession must not carry over
            accession = None
    return updated


def download_resfam(db_dir: str) -> None:
    """ Download and sanitise the Resfam database.

        The downloaded archive is unpacked, passed through hmmconvert and
        rewritten with cutoffs added for the accessions known to lack them,
        then the database is pressed.

        Arguments:
            db_dir: the directory containing the "resfam" database directory

        Returns:
            None

        Raises:
            RuntimeError: if hmmconvert does not run successfully
    """
    archive_filename = os.path.join(db_dir, "resfam", "Resfams.hmm.gz")
    filename = os.path.splitext(archive_filename)[0]
    url = RESFAM_URL

    # checksum of existing not matched because it has a convert timestamp in it
    # So check size and line count as an approximation
    # NOTE(review): the cutoff lines are now really written, so confirm that
    # RESFAM_SIZE and RESFAM_LINES describe a file that contains them
    if present_and_size_matches(filename, RESFAM_SIZE) and \
       present_and_line_count_matches(filename, RESFAM_LINES):
        print("Resfams database present and checked")
        return

    print("Downloading Resfam database")
    check_diskspace(url)
    download_if_not_present(url, archive_filename, RESFAM_ARCHIVE_CHECKSUM)
    filename = unzip_file(archive_filename, gzip, gzip.zlib.error)  # type: ignore
    delete_file(filename + ".gz")
    # remove tabs
    converted = execute(["hmmconvert", filename])
    # the file is about to be overwritten, so never do that with failed output
    if not converted.successful():
        raise RuntimeError("hmmconvert failed to run: %s" % converted.stderr[-100:])

    print("Ensuring all cutoffs are present")
    # add TC to those entries missing them
    # calculated as 10% less than the minimum scoring hit in their own group
    missing_cutoffs = {
        "RF0174": int(374 * 0.9),
        "RF0172": int(85 * 0.9),
        "RF0173": int(295 * 0.9),
        "RF0168": int(691 * 0.9),
    }
    # stdout is a single string, it has to be split into lines (keeping the
    # line endings) before any line can be matched by its prefix
    lines = converted.stdout.splitlines(keepends=True)
    with open(filename, "w") as handle:
        handle.writelines(_insert_missing_cutoffs(lines, missing_cutoffs))

    ensure_database_pressed(filename)
def run_diamond(query: str, database: str, tempdir: str, options: ConfigType) -> str:
    """ Compares the given query to the given database with `diamond blastp`

        Arguments:
            query: the path of the query sequence file
            database: the path of the database to compare to
            tempdir: the path of a temporary directory, given to diamond as --tmpdir
            options: antismash Config, read for the number of cpus to use

        Returns:
            the name of the output file created, always "input.out"

        Raises:
            RuntimeError: if diamond does not run successfully
    """
    logging.debug("Running external command: diamond")
    output_name = "input.out"
    command = ["diamond", "blastp"]
    command.extend(["--db", database])
    command.extend(["--threads", str(options.cpus)])
    command.extend(["--query", query])
    command.extend(["--compress", "0"])
    command.extend(["--max-target-seqs", "10000"])
    command.extend(["--evalue", "1e-05"])
    command.extend(["--out", output_name])
    # 6 is blast tabular format, just as in blastp
    command.extend(["--outfmt", "6"])
    command.extend(["--tmpdir", tempdir])

    result = subprocessing.execute(command)
    if result.successful():
        return output_name
    # only the tail end of stderr is included in the message
    raise RuntimeError("diamond failed to run: %s -> %s" % (command, result.stderr[-100:]))
def get_git_version() -> str:
    """ Finds the short sha1 of the commit that git reports as HEAD

        Returns:
            the stripped output of `git rev-parse --short HEAD`,
            or an empty string if running git raised an OSError
    """
    command = ['git', 'rev-parse', '--short', 'HEAD']
    try:
        sha = execute(command).stdout.strip()
    except OSError:
        sha = ""
    return sha
def get_git_version(fallback_filename: Optional[str] = GIT_VERSION_FALLBACK_FILENAME) -> str:
    """ Finds the short sha1 of the commit that git reports as HEAD

        If git provides no version and a fallback filename was given,
        the stripped contents of that file are used instead, provided
        the file can be located.

        Arguments:
            fallback_filename: the file to read a version from when git
                               provides none, may be None or empty to skip this

        Returns:
            the version, with "(changed)" appended when `git status`
            reports anything, or an empty string if no version was found
    """
    sha = ""
    try:
        rev_parse = execute(['git', 'rev-parse', '--short', 'HEAD'])
        porcelain = execute(['git', 'status', '--porcelain'])
        if rev_parse.successful() and porcelain.successful():
            sha = rev_parse.stdout.strip()
            # any line of status output at all is treated as a change
            if porcelain.stdout.splitlines():
                sha += "(changed)"
    except OSError:
        pass

    # git had an answer, or there is nowhere else to look
    if sha or not fallback_filename:
        return sha
    if not locate_file(fallback_filename, silent=True):
        return sha
    with open(fallback_filename, 'rt') as handle:
        return handle.read().strip()
def run_external(fasta_filename: str) -> str:
    """ Runs glimmerhmm over a fasta file, using the "data/train_crypto"
        path found relative to this file as the second argument

        Arguments:
            fasta_filename: the path of the fasta file to run glimmerhmm on

        Returns:
            the stdout output of glimmerhmm

        Raises:
            RuntimeError: if the stderr output of glimmerhmm contains 'ERROR'
    """
    training_data = path.get_full_path(__file__, "data/train_crypto")
    run_result = execute(['glimmerhmm', fasta_filename, training_data, "-g"])
    if 'ERROR' not in run_result.stderr:
        return run_result.stdout
    logging.error("Failed to run GlimmerHMM: %r", run_result.stderr)
    raise RuntimeError("Failed to run GlimmerHMM: %s" % run_result.stderr)
def alignsmcogs(smcog: str, input_number: int) -> str:
    """ Runs muscle in profile mode, aligning the numbered input file to the
        reference alignment of the given smCOG

        Arguments:
            smcog: the smCOG, its lowercased name selects the reference fasta
            input_number: the number used in the names of the input and output files

        Returns:
            the name of the fasta file muscle was told to write

        Raises:
            RuntimeError: if muscle exits with a non-zero return code
    """
    reference_name = "%s_muscle.fasta" % str(smcog).lower()
    reference = path.get_full_path(__file__, "data", reference_name)
    output_filename = "muscle%d.fasta" % input_number
    query_filename = "input" + str(input_number) + ".fasta"
    musclecommand = ["muscle", "-quiet", "-profile"]
    musclecommand += ["-in1", reference]
    musclecommand += ["-in2", query_filename]
    musclecommand += ["-out", output_filename]

    result = subprocessing.execute(musclecommand)
    if not result.return_code:
        return output_filename
    raise RuntimeError("Muscle failed to run: %s, %s" % (musclecommand, result.stderr[-100:]))
def run_nrpspredictor(a_domains: List[AntismashDomain], options: ConfigType) -> Dict[str, Prediction]:
    """ Runs the NRPSPredictor2 SVMs over the signatures of the given A domains

        Arguments:
            a_domains: a list of AntismashDomains, one for each A domain
            options: antismash options, read for the taxon

        Returns:
            the result of read_output() for the lines of the NRPSPredictor2
            output file, with the header line removed

        Raises:
            RuntimeError: if NRPSPredictor2 does not run successfully
    """
    # locate the bundled NRPSPredictor2 and everything it needs on its classpath
    predictor_dir = path.get_full_path(__file__, "external", "NRPSPredictor2")
    data_dir = os.path.join(predictor_dir, 'data')
    lib_dir = os.path.join(predictor_dir, 'lib')
    jars = [os.path.join(predictor_dir, 'build', 'NRPSpredictor2.jar')]
    for library in ['java-getopt-1.0.13.jar', 'Utilities.jar', 'libsvm.jar']:
        jars.append(os.path.join(lib_dir, library))
    # the separator for classpath entries is different on windows
    java_separator = ";" if sys.platform == "win32" else ":"
    classpath = java_separator.join(jars)

    input_filename = "signatures.fa"
    output_filename = "svm_output.txt"
    if options.taxon == "bacteria":
        bacterial = "1"
    else:
        bacterial = '0'

    signatures = [get_34_aa_signature(a_domain) for a_domain in a_domains]

    with TemporaryDirectory(change=True):
        # one tab separated line per domain: the signature, then the domain name
        with open(input_filename, "w") as handle:
            for domain, sig in zip(a_domains, signatures):
                handle.write("%s\t%s\n" % (sig, domain.get_name()))

        # Run NRPSPredictor2 SVM
        commands = ['java', '-Ddatadir=%s' % data_dir, '-cp', classpath]
        commands.append('org.roettig.NRPSpredictor2.NRPSpredictor2')
        commands.extend(['-i', input_filename])
        commands.extend(['-r', output_filename])
        commands.extend(['-s', '1'])
        commands.extend(['-b', bacterial])
        result = subprocessing.execute(commands)
        if not result.successful():
            raise RuntimeError("NRPSPredictor2 failed: %s" % result.stderr)

        with open(output_filename) as handle:
            content = handle.read()
        lines = content.splitlines()[1:]  # strip the header

    return read_output(lines)
def hmmsearch(fasta_format: str, hmm: str) -> float:
    """ Runs `hmmsearch --noali` with the given text as stdin, taking only a
        single value from the output

        Arguments:
            fasta_format: the text to provide to hmmsearch as its stdin
            hmm: the profile argument given to hmmsearch

        Returns:
            0. if the output reports no targets, otherwise the third
            whitespace separated field on the fifth line of the domain
            annotation section of the output, as a float

        Raises:
            RuntimeError: if hmmsearch does not run successfully
    """
    command = ["hmmsearch", "--noali", hmm, "-"]
    result = subprocessing.execute(command, stdin=fasta_format)
    if not result.successful():
        logging.error("hmmsearch stderr: %s", result.stderr)
        raise RuntimeError("hmmsearch exited non-zero")

    text = result.stdout
    if "[No targets detected" in text:
        return 0.

    # the section of interest lies between these two headings
    section_start = text.find('Domain annotation for each sequence:')
    section_length = text[section_start:].find('Internal pipeline statistics summary:')
    section = text[section_start:section_start + section_length]
    value_line = section.splitlines()[4]
    return float(value_line.split()[2])
def make_blastdb(inputfile: str, db_prefix: str) -> subprocessing.RunResult:
    """ Builds a blast protein database from the input file with makeblastdb

        makeblastdb will create 3 files with the given prefix and the extensions:
            .pin, .phr, .psq

        Arguments:
            inputfile: the name of the file to build the database from
            db_prefix: the prefix to use for the files of the created database

        Returns:
            the subprocessing.RunResult instance of the makeblastdb run

        Raises:
            RuntimeError: if makeblastdb does not run successfully
    """
    command = ["makeblastdb"]
    command += ["-in", inputfile]
    command += ["-out", db_prefix]
    command += ["-dbtype", "prot"]

    result = subprocessing.execute(command)
    if result.successful():
        return result
    raise RuntimeError("makeblastdb failed to run: %s -> %s" % (command, result.stderr[-100:]))
def draw_tree(input_number: int, output_dir: str, tag: str) -> str:
    """ Builds a tree from the numbered trimmed alignment with fasttree and
        draws it as a PNG

        Arguments:
            input_number: the number in the name of the trimmed alignment file
            output_dir: the directory in which to save the image
            tag: the label to draw in green, also used to name the image

        Returns:
            the filename of the image generated, or an empty string if the
            fasttree output could not be read as a newick tree

        Raises:
            RuntimeError: if fasttree does not run successfully
    """
    matplotlib.use('Agg')
    alignment = "trimmed_alignment%d.fasta" % input_number
    command = ["fasttree", "-quiet", "-fastest", "-noml", alignment]
    run_result = subprocessing.execute(command)
    if not run_result.successful():
        raise RuntimeError("Fasttree failed to run successfully:", run_result.stderr)

    newick = StringIO(run_result.stdout)
    tree_filename = os.path.join(output_dir, tag + '.png')
    try:
        tree = Phylo.read(newick, 'newick')
    except NewickError:
        logging.debug('Invalid newick tree for %r', tag)
        return ''

    # enforce a minimum distance between branches, a twentieth of the
    # largest distance to any terminal is added to every branch
    max_size = max(tree.distance(node) for node in tree.get_terminals())
    padding = max_size / 20
    for clade in tree.get_nonterminals() + tree.get_terminals():
        if clade.branch_length:
            clade.branch_length = abs(clade.branch_length) + padding
        else:
            clade.branch_length = padding

    def readable_label(node) -> str:
        """ Swaps the pipes in a node label for spaces """
        return str(node).replace("|", " ")

    # change the colour of the query gene
    Phylo.draw(tree, do_show=False, label_colors={tag: 'green'},
               label_func=readable_label)

    fig = matplotlib.pyplot.gcf()
    # the height of the figure grows with the number of terminals
    fig.set_size_inches(20, (tree.count_terminals() / 3))
    matplotlib.pyplot.axis('off')
    fig.savefig(os.path.join(output_dir, tag + '.png'), bbox_inches='tight')
    matplotlib.pyplot.close(fig)
    return tree_filename
def run_blast(query: str, database: str) -> str:
    """ Compares the given query with the given database using blastp

        The output file takes the name of the query, with everything after
        the last '.' replaced by "out".

        Arguments:
            query: the path of the query sequence file
            database: the path of the database to compare to

        Returns:
            the name of the created output file

        Raises:
            RuntimeError: if blastp does not run successfully
    """
    stem = query.rsplit(".", 1)[0]
    out_file = stem + ".out"
    command = ["blastp"]
    command += ["-db", database]
    command += ["-query", query]
    command += ["-outfmt", "6"]
    command += ["-max_target_seqs", "10000"]
    command += ["-evalue", "1e-05"]
    command += ["-out", out_file]

    res = subprocessing.execute(command)
    if res.successful():
        return out_file
    # only the tail end of stderr is included in the message
    raise RuntimeError("blastp run failed: %s..." % res.stderr[-200:])
def compile_pfam(filename):
    """ Runs `hmmpress -f` on the HMMer database with the given filename.

        The result of the command is not inspected.
    """
    hmmpress_args = ['-f', filename]
    execute(['hmmpress'] + hmmpress_args)