Exemple #1
0
 def test_timeout(self):
     """ A child exceeding its timeout must raise, and well before the
         child itself would have finished
     """
     expected_message = "Child process .* timed out after 1 second"
     began = time.time()
     with self.assertRaisesRegex(RuntimeError, expected_message):
         # the child would sleep for 10 seconds if left alone
         subprocessing.execute(["sleep", "10"], timeout=1)
     duration = time.time() - began
     assert duration < 1.5
    def test_redirection(self):
        """ Streams redirected to a file must refuse access via the result,
            while the stream that was not redirected stays readable
        """
        # the devnull handles are opened in context managers so they are
        # closed again as soon as the child has finished with them
        with open(os.devnull, "w") as sink:
            result = subprocessing.execute(["echo", "test"], stdout=sink)
        with self.assertRaisesRegex(ValueError, "stdout was redirected to file, unable to access"):
            result.stdout.strip()
        assert not result.stderr
        assert not result.return_code
        assert result.get_command_string() == "echo test"

        with open(os.devnull, "w") as sink:
            result = subprocessing.execute(["cat", "--bad-option"], stderr=sink)
        assert result.stdout.strip() == ""
        with self.assertRaisesRegex(ValueError, "stderr was redirected to file, unable to access"):
            # accessing stderr is what is expected to raise
            assert result.stderr.startswith("cat: unrecognized")
        assert result.return_code
Exemple #3
0
    def test_piping(self):
        """ Checks captured stdout/stderr, the return code and successful()
            for a passing command, a failing command, and a command fed by stdin
        """
        # a command that succeeds and only writes to stdout
        good = subprocessing.execute(["pwd"])
        assert good.stdout.strip() == os.getcwd()
        assert good.stderr.strip() == ""
        assert not good.return_code
        assert good.successful()

        # a command that fails and only writes to stderr
        bad = subprocessing.execute(["cat", "--bad-option"])
        assert bad.stdout.strip() == ""
        assert bad.stderr.startswith("cat: unrecognized")
        assert bad.return_code
        assert not bad.successful()

        # a command that echoes whatever it is given on stdin
        piped = subprocessing.execute(["cat"], stdin="fish")
        assert piped.stdout.strip() == "fish"
        assert not piped.stderr
        assert not piped.return_code
        assert piped.successful()
Exemple #4
0
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        Arguments:
            record: the record to run prodigal over, every gene found is
                    added to it as a CDS feature
            options: antismash Config

        Returns:
            None

        Raises:
            RuntimeError: if prodigal's stderr contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # use the 'meta' procedure when explicitly requested or for short records
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if err.find('Error') > -1:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)
        found = 0
        # the context manager ensures the results file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue

                # the unpacking is inside the try so that a line with the
                # wrong number of fields is reported and skipped in the same
                # way as a line with non-numeric coordinates
                try:
                    gene_id, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1

                # reversed coordinates always mean the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                # the location is built with the start shifted down by one
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc,
                                     locus_tag='ctg%s_%s' %
                                     (record.record_index, gene_id),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
Exemple #5
0
def _add_missing_cutoffs(lines: list, missing_cutoffs: dict) -> list:
    """Return a copy of the HMM lines with GA/TC/NC cutoff lines added after
    the CKSUM line of every profile whose accession is in missing_cutoffs."""
    output = []
    pending = None  # cutoff value for the current profile, if it needs one
    for line in lines:
        output.append(line)
        if line.startswith("ACC"):
            fields = line.split()
            pending = missing_cutoffs.get(fields[1]) if len(fields) > 1 else None
        elif line.startswith("CKSUM") and pending is not None:
            # add the cutoffs in the same place as other profiles have them
            for cutoff in ("GA", "TC", "NC"):
                output.append("%s    %d.00 %d.00\n" % (cutoff, pending, pending))
            pending = None
        elif line.startswith("//"):
            # end of a profile, never carry a cutoff over into the next one
            pending = None
    return output


def download_resfam(db_dir: str) -> None:
    """Download and sanitise the Resfam database.

    The downloaded archive is unpacked, run through hmmconvert, and rewritten
    with cutoffs added for the profiles known to be missing them, before the
    database is pressed.

    Arguments:
        db_dir: the directory containing the "resfam" database directory

    Raises:
        RuntimeError: if hmmconvert does not run successfully
    """
    archive_filename = os.path.join(db_dir, "resfam", "Resfams.hmm.gz")
    filename = os.path.splitext(archive_filename)[0]
    url = RESFAM_URL

    # checksum of existing not matched because it has a convert timestamp in it
    # So check size and line count as an approximation
    if present_and_size_matches(filename, RESFAM_SIZE) and \
       present_and_line_count_matches(filename, RESFAM_LINES):
        print("Resfams database present and checked")
        return

    print("Downloading Resfam database")
    check_diskspace(url)
    download_if_not_present(url, archive_filename, RESFAM_ARCHIVE_CHECKSUM)
    filename = unzip_file(archive_filename, gzip, gzip.zlib.error)  # type: ignore
    delete_file(filename + ".gz")
    # remove tabs
    converted = execute(["hmmconvert", filename])
    # check before the file is opened for writing, otherwise a failed
    # conversion would replace the database with whatever was output
    if not converted.successful():
        raise RuntimeError("hmmconvert failed to run: %s" % converted.stderr[-100:])
    print("Ensuring all cutoffs are present")
    # add TC to those entries missing them
    # calculated as 10% less than the minimum scoring hit in their own group
    missing_cutoffs = {
        "RF0174": int(374 * 0.9),
        "RF0172": int(85 * 0.9),
        "RF0173": int(295 * 0.9),
        "RF0168": int(691 * 0.9),
    }
    # stdout is a single string, so it has to be split into lines explicitly,
    # iterating over it directly would give single characters
    lines = converted.stdout.splitlines(keepends=True)
    with open(filename, "w") as handle:
        handle.writelines(_add_missing_cutoffs(lines, missing_cutoffs))

    ensure_database_pressed(filename)
Exemple #6
0
def run_diamond(query: str, database: str, tempdir: str, options: ConfigType) -> str:
    """ Compares the given query to the given database with diamond's blastp

        Arguments:
            query: the path of query sequence file
            database: the path of the database to compare to
            tempdir: the path of a temporary directory for diamond to use
            options: antismash Config

        Returns:
            the name of the output file created

        Raises:
            RuntimeError: if diamond does not run successfully
    """
    output_filename = "input.out"
    logging.debug("Running external command: diamond")
    arguments = [
        ("--db", database),
        ("--threads", str(options.cpus)),
        ("--query", query),
        ("--compress", "0"),
        ("--max-target-seqs", "10000"),
        ("--evalue", "1e-05"),
        ("--out", output_filename),
        ("--outfmt", "6"),  # 6 is blast tabular format, just as in blastp
        ("--tmpdir", tempdir),
    ]
    command = ["diamond", "blastp"]
    for flag, value in arguments:
        command.extend([flag, value])

    result = subprocessing.execute(command)
    if result.successful():
        return output_filename
    # only the tail of stderr is included to keep the message short
    raise RuntimeError("diamond failed to run: %s -> %s" % (command, result.stderr[-100:]))
Exemple #7
0
def get_git_version() -> str:
    """ Finds the short sha1 of the current git HEAD

        Returns:
            the stripped output of 'git rev-parse --short HEAD', or an empty
            string if running the command raised an OSError
    """
    version = ""
    try:
        version = execute(['git', 'rev-parse', '--short', 'HEAD']).stdout.strip()
    except OSError:
        # best effort only, the empty default is returned instead
        pass
    return version
def get_git_version(fallback_filename: Optional[str] = GIT_VERSION_FALLBACK_FILENAME) -> str:
    """ Finds the short sha1 of the current git HEAD, marking it if the
        working tree has changes

        Arguments:
            fallback_filename: a file to read the version from if it could
                               not be obtained from git, skipped if empty or None

        Returns:
            the version string, with "(changed)" appended if 'git status'
            reported anything, or an empty string if no version was found
    """
    version = ""
    try:
        revision = execute(['git', 'rev-parse', '--short', 'HEAD'])
        status = execute(['git', 'status', '--porcelain'])
        if revision.successful() and status.successful():
            version = revision.stdout.strip()
            # any line at all in the porcelain output is treated as a change
            if status.stdout.splitlines():
                version = version + "(changed)"
    except OSError:
        pass

    # git gave an answer or there's no fallback to try
    if version or not fallback_filename:
        return version

    if locate_file(fallback_filename, silent=True):
        with open(fallback_filename, 'rt') as handle:
            version = handle.read().strip()
    return version
def run_external(fasta_filename: str) -> str:
    """ Runs glimmerhmm over a fasta file

        Arguments:
            fasta_filename: the path of the fasta file to pass to glimmerhmm

        Returns:
            the stdout output of glimmerhmm

        Raises:
            RuntimeError: if glimmerhmm's stderr contains 'ERROR'
    """
    training_data = path.get_full_path(__file__, "data/train_crypto")
    result = execute(['glimmerhmm', fasta_filename, training_data, "-g"])
    if 'ERROR' in result.stderr:
        logging.error("Failed to run GlimmerHMM: %r", result.stderr)
        raise RuntimeError("Failed to run GlimmerHMM: %s" % result.stderr)
    return result.stdout
Exemple #10
0
def alignsmcogs(smcog: str, input_number: int) -> str:
    """ Runs a muscle profile alignment of an input file against the
        reference alignment of the given smCOG

        Arguments:
            smcog: the smCOG name, lowercased to find its reference alignment
            input_number: the number used in both the input and output filenames

        Returns:
            the name of the fasta file created

        Raises:
            RuntimeError: if muscle exits with a non-zero return code
    """
    reference_name = "%s_muscle.fasta" % str(smcog).lower()
    reference = path.get_full_path(__file__, "data", reference_name)
    query_filename = "input%s.fasta" % str(input_number)
    output_filename = "muscle%d.fasta" % input_number
    musclecommand = [
        "muscle", "-quiet", "-profile",
        "-in1", reference,
        "-in2", query_filename,
        "-out", output_filename,
    ]
    result = subprocessing.execute(musclecommand)
    if not result.return_code:
        return output_filename
    raise RuntimeError("Muscle failed to run: %s, %s" % (musclecommand, result.stderr[-100:]))
def run_nrpspredictor(a_domains: List[AntismashDomain],
                      options: ConfigType) -> Dict[str, Prediction]:
    """ Runs NRPSPredictor2 over the provided A domains.

        Arguments:
            a_domains: a list of AntismashDomains, one for each A domain
            options: antismash options

        Returns:
            a dictionary mapping each domain name to a prediction,
            as constructed by read_output()

        Raises:
            RuntimeError: if NRPSPredictor2 does not run successfully
    """
    # NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain,
    # extract 8 Angstrom residues and insert this into NRPSPredictor
    base_dir = path.get_full_path(__file__, "external", "NRPSPredictor2")
    data_dir = os.path.join(base_dir, 'data')
    lib_dir = os.path.join(base_dir, 'lib')

    # java uses a different classpath separator on windows
    separator = ";" if sys.platform == "win32" else ":"
    jars = [os.path.join(base_dir, 'build', 'NRPSpredictor2.jar')]
    for jar_name in ('java-getopt-1.0.13.jar', 'Utilities.jar', 'libsvm.jar'):
        jars.append(os.path.join(lib_dir, jar_name))
    classpath = separator.join(jars)

    input_filename = "signatures.fa"
    output_filename = "svm_output.txt"
    if options.taxon == "bacteria":
        bacterial = "1"
    else:
        bacterial = '0'

    signatures = []
    for a_domain in a_domains:
        signatures.append(get_34_aa_signature(a_domain))

    with TemporaryDirectory(change=True):
        # the signature file is the input for the NRPSPredictor2 SVMs,
        # with one line per domain of signature and domain name
        with open(input_filename, "w") as handle:
            for signature, domain in zip(signatures, a_domains):
                handle.write("%s\t%s\n" % (signature, domain.get_name()))

        # Run NRPSPredictor2 SVM
        commands = ['java', '-Ddatadir=%s' % data_dir, '-cp', classpath,
                    'org.roettig.NRPSpredictor2.NRPSpredictor2']
        commands.extend(['-i', input_filename, '-r', output_filename])
        commands.extend(['-s', '1', '-b', bacterial])
        result = subprocessing.execute(commands)
        if not result.successful():
            raise RuntimeError("NRPSPredictor2 failed: %s" % result.stderr)

        # the results have to be read before the temporary directory is left
        with open(output_filename) as handle:
            all_lines = handle.read().splitlines()
        lines = all_lines[1:]  # strip the header

    return read_output(lines)
Exemple #12
0
def hmmsearch(fasta_format: str, hmm: str) -> float:
    """ Runs hmmsearch, only taking a single value from the output

        Arguments:
            fasta_format: the text passed to hmmsearch on stdin
            hmm: the profile argument given to hmmsearch

        Returns:
            0. if hmmsearch reported no targets, otherwise the third
            whitespace-separated field of the fifth line of the domain
            annotation section, as a float

        Raises:
            RuntimeError: if hmmsearch does not run successfully
    """
    command = ["hmmsearch", "--noali", hmm, "-"]
    result = subprocessing.execute(command, stdin=fasta_format)
    if not result.successful():
        logging.error("hmmsearch stderr: %s", result.stderr)
        raise RuntimeError("hmmsearch exited non-zero")

    output = result.stdout
    if "[No targets detected" in output:
        return 0.

    # cut out the section between the two section headers
    section_start = output.find('Domain annotation for each sequence:')
    section_length = output[section_start:].find('Internal pipeline statistics summary:')
    section = output[section_start:section_start + section_length]
    fields = section.splitlines()[4].split()
    return float(fields[2])
Exemple #13
0
def make_blastdb(inputfile: str, db_prefix: str) -> subprocessing.RunResult:
    """ Builds a blast protein database from the given input with makeblastdb

        makeblastdb will create 3 files with the given prefix and the extensions:
            .pin, .phr, .psq

        Arguments:
            inputfile: the input filename
            db_prefix: the prefix to use for the created database

        Returns:
            a subprocessing.RunResult instance

        Raises:
            RuntimeError: if makeblastdb does not run successfully
    """
    command = ["makeblastdb"]
    command.extend(["-in", inputfile])
    command.extend(["-out", db_prefix])
    command.extend(["-dbtype", "prot"])
    result = subprocessing.execute(command)
    if result.successful():
        return result
    raise RuntimeError("makeblastdb failed to run: %s -> %s" % (command, result.stderr[-100:]))
Exemple #14
0
def draw_tree(input_number: int, output_dir: str, tag: str) -> str:
    """ Builds a tree with fasttree and draws it as a PNG

        Arguments:
            input_number: the number in the name of the trimmed alignment
                          file given to fasttree
            output_dir: the directory in which to create the image
            tag: the name used for the image file (without extension),
                 the label matching it is drawn in green

        Returns:
            the filename of the image generated, or an empty string if the
            fasttree output could not be parsed as a newick tree

        Raises:
            RuntimeError: if fasttree does not run successfully
    """
    def make_label(node: object) -> str:
        """ Builds a label for a node with pipes replaced by spaces """
        return str(node).replace("|", " ")

    matplotlib.use('Agg')
    alignment_filename = "trimmed_alignment%d.fasta" % input_number
    command = ["fasttree", "-quiet", "-fastest", "-noml", alignment_filename]
    run_result = subprocessing.execute(command)
    if not run_result.successful():
        raise RuntimeError("Fasttree failed to run successfully:",
                           run_result.stderr)

    newick = StringIO(run_result.stdout)
    tree_filename = os.path.join(output_dir, tag + '.png')
    try:
        tree = Phylo.read(newick, 'newick')
    except NewickError:
        logging.debug('Invalid newick tree for %r', tag)
        return ''

    # enforce a minimum distance between branches
    max_size = max(tree.distance(node) for node in tree.get_terminals())
    padding = max_size / 20
    for clade in tree.get_nonterminals() + tree.get_terminals():
        if clade.branch_length:
            clade.branch_length = abs(clade.branch_length) + padding
        else:
            clade.branch_length = padding

    # change the colour of the query gene
    Phylo.draw(tree,
               do_show=False,
               label_colors={tag: 'green'},
               label_func=make_label)
    figure = matplotlib.pyplot.gcf()
    # the height of the image scales with the number of terminals
    figure.set_size_inches(20, (tree.count_terminals() / 3))
    matplotlib.pyplot.axis('off')
    figure.savefig(tree_filename, bbox_inches='tight')
    matplotlib.pyplot.close(figure)
    return tree_filename
Exemple #15
0
def run_blast(query: str, database: str) -> str:
    """ Compares the given query with the given database using blastp

        An output file will be created, using the name of the query but with the
        extension changed to .out

        Arguments:
            query: the path of query sequence file
            database: the path of the database to compare to

        Returns:
            the name of the created output file

        Raises:
            RuntimeError: if blastp does not run successfully
    """
    # everything after the final '.' of the query name is replaced
    out_file = "%s.out" % query.rsplit(".", 1)[0]
    command = ["blastp"]
    command.extend(["-db", database, "-query", query])
    command.extend(["-outfmt", "6"])
    command.extend(["-max_target_seqs", "10000", "-evalue", "1e-05"])
    command.extend(["-out", out_file])
    result = subprocessing.execute(command)
    if result.successful():
        return out_file
    raise RuntimeError("blastp run failed: %s..." % result.stderr[-200:])
def compile_pfam(filename):
    """Compile a HMMer database with hmmpress.

    Arguments:
        filename: the path of the HMM file to pass to hmmpress

    Raises:
        RuntimeError: if hmmpress does not run successfully
    """
    command = ['hmmpress', '-f', filename]
    result = execute(command)
    # a failure here would otherwise pass unnoticed and leave the database
    # unusable for whatever tries to use it next
    if not result.successful():
        raise RuntimeError("hmmpress failed to run: %s -> %s" % (command, result.stderr[-100:]))