Example #1
def run_maxcluster_cluster(predictions,
                           method="average",
                           rmsd=True,
                           clustering_threshold=None,
                           binary="maxcluster"):
    """
    Cluster a set of predicted structures using maxcluster.

    To compare predictions against an experimental structure,
    use the run_maxcluster_compare() function.

    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be compared against experiment
    method : {"single", "average", "maximum", "pairs_min", "pairs_abs"}, optional (default: "average")
        Clustering method (single / average / maximum linkage,
        or min / absolute size neighbour pairs
    clustering_threshold : float (optional, default: None)
        Initial clustering threshold (maxcluster -T option)
    rmsd : bool, optional (default: True)
        Use RMSD-based clustering (faster)
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Clustering result table (see parse_maxcluster_clustering
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    method_map = {
        "single": 1,
        "average": 2,
        "maximum": 3,
        "pairs_min": 4,
        "pairs_abs": 5,
    }

    if method not in method_map:
        raise InvalidParameterError("Method must be one of the following: " +
                                    ", ".join(method_map.keys()))

    cmd = [binary, "-l", list_file, "-C", str(method_map[method])]

    if rmsd:
        cmd += ["-rmsd"]

    if clustering_threshold is not None:
        cmd += ["-T", str(clustering_threshold)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_clustering(stdout)
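A minimal usage sketch of the clustering wrapper above (model file names and the threshold value are hypothetical; maxcluster must be on PATH):

# hypothetical list of predicted structures to cluster
models = ["model_001.pdb", "model_002.pdb", "model_003.pdb"]

# average-linkage clustering on RMSD, with a 4 A starting threshold
clusters = run_maxcluster_cluster(
    models, method="average", rmsd=True, clustering_threshold=4.0
)
print(clusters.head())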
Example #2
def run_maxcluster_compare(predictions,
                           experiment,
                           normalization_length=None,
                           distance_cutoff=None,
                           binary="maxcluster"):
    """
    Compare a set of predicted structures to an experimental structure
    using maxcluster.
    
    For clustering functionality, use the run_maxcluster_cluster() function.
    
    For a high-level wrapper around this function that removes
    problematic atoms and compares multiple models, please look at 
    evcouplings.fold.protocol.compare_models_maxcluster().
    
    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be compared against experiment
    experiment : str
        Path of experimental structure PDB file. Note that the numbering
        and residues in this file must agree with the predicted structure,
        and that the structure may not contain duplicate atoms (multiple
        models, or alternative locations for the same atom).
    normalization_length : int, optional (default: None)
        Use this length to normalize the Template Modeling (TM)
        score (-N option of maxcluster). If None, will normalize
        by length of experiment.
    distance_cutoff : float, optional (default: None)
        Distance cutoff for MaxSub search (-d option of maxcluster).
        If None, will use maxcluster auto-calibration.
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Comparison result table (see parse_maxcluster_comparison
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    cmd = [binary, "-l", list_file, "-e", experiment]

    # normalization length for TM score calculation
    if normalization_length is not None:
        cmd += ["-N", str(normalization_length)]

    # distance cutoff for MaxSub search
    if distance_cutoff is not None:
        cmd += ["-d", str(distance_cutoff)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_comparison(stdout)
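A minimal usage sketch (file names are hypothetical; as noted above, the numbering of the experimental structure must match the predictions):

models = ["model_001.pdb", "model_002.pdb"]

# compare predictions to experiment, letting maxcluster
# auto-calibrate the MaxSub distance cutoff
comparison = run_maxcluster_compare(
    models, experiment="experiment.pdb"
)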
Example #3
def run_hhfilter(input_file,
                 output_file,
                 threshold=95,
                 columns="a2m",
                 binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format)
    threshold : int, optional (default: 95)
        Maximum pairwise sequence identity threshold
        (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str, optional (default: "hhfilter")
        Path to hhfilter binary

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    if columns not in ["first", "a2m"]:
        raise ValueError("Invalid column selection: {}".format(columns))

    verify_resources("Alignment file does not exist or is empty", input_file)

    create_prefix_folders(output_file)

    cmd = [
        binary, "-i", input_file, "-o", output_file,
        "-id", str(threshold), "-M", columns, "-v", "2"
    ]

    return_code, stdout, stderr = run(cmd)

    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, output_file),
        output_file)

    return output_file
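A minimal usage sketch (file names and the identity threshold are hypothetical; hhfilter must be on PATH):

# reduce the alignment to at most 90% pairwise sequence identity,
# defining match columns from upper-case (a2m) positions
filtered_file = run_hhfilter(
    "alignment.a2m", "alignment_filtered.a3m",
    threshold=90, columns="a2m"
)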
Example #4
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str, optional (default: None)
        Output path for binary file containing model
        parameters (folder will be created). If None,
        parameters will not be stored.
    focus_seq : str, optional (default: None)
        Name of focus sequence; if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        L2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        L2 regularization strength on couplings.
        If None, plmc default will be used.
    lambda_g : float, optional (default: None)
        Group L1 regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int or str, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in OpenMP
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError, ResourceError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
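A minimal usage sketch (paths, the focus sequence name, and parameter values are hypothetical; plmc must be on PATH):

result = run_plmc(
    "alignment.a2m", "couplings.csv", param_file="model.params",
    focus_seq="QUERY_1", theta=0.8, lambda_J=16.0, cpu="max"
)
print(result.couplings_file)

Note that theta=0.8 here is passed to plmc as -t 0.2, following the 1 - theta convention described in the docstring.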
Example #5
def run_jackhmmer(query,
                  database,
                  prefix,
                  use_bitscores,
                  domain_threshold,
                  seq_threshold,
                  iterations=5,
                  nobias=False,
                  cpu=None,
                  stdout_redirect=None,
                  checkpoints_hmm=False,
                  checkpoints_ali=False,
                  binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    iterations : int
        number of jackhmmer search iterations
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs to prefix.<iter>.hmm
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments to prefix.<iter>.sto
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for
        default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = JackhmmerResult(
        prefix, prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout")

    cmd = [
        binary, "-N", str(iterations),
        "-o", result.output, "-A", result.alignment,
        "--tblout", result.tblout, "--domtblout", result.domtblout,
        "--noali", "--notextw"
    ]

    # reporting thresholds are set according to
    # inclusion thresholds to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold), "--domT", str(domain_threshold),
            "--incT", str(seq_threshold), "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold), "--domE", str(domain_threshold),
            "--incE", str(seq_threshold), "--incdomE", str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    cmd += [query, database]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.alignment),
        result.alignment)

    return result
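A minimal usage sketch (file names and threshold values are hypothetical; jackhmmer must be on PATH):

# iterative search with bitscore inclusion thresholds
result = run_jackhmmer(
    query="query.fasta", database="uniref100.fasta",
    prefix="output/search", use_bitscores=True,
    domain_threshold=50, seq_threshold=50,
    iterations=5, cpu=4
)
print(result.alignment)  # output/search.sto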
Example #6
def run_hmmscan(query,
                database,
                prefix,
                use_model_threshold=True,
                threshold_type="cut_ga",
                use_bitscores=True,
                domain_threshold=None,
                seq_threshold=None,
                nobias=False,
                cpu=None,
                stdout_redirect=None,
                binary="hmmscan"):
    """
    Run hmmscan of HMMs in database against sequences in query
    to identify matches of these HMMs.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence(s)
    database : str
        File containing HMM database (prepared with hmmpress)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_model_threshold : bool, optional (default: True)
        Use model-specific inclusion thresholds from
        HMM database rather than global bitscore/E-value
        thresholds (use_bitscores, domain_threshold and
        seq_threshold are overridden by this flag).
    threshold_type : {"cut_ga", "cut_nc", "cut_tc"}, optional (default: "cut_ga")
        Use gathering (default), noise or trusted cutoff
        to define scan hits. Please refer to the HMMER manual
        for details.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
        Overridden by the use_model_threshold flag.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmscan")
        Path to hmmscan binary (put in PATH for
        default to work)

    Returns
    -------
    HmmscanResult
        namedtuple with fields corresponding to the different
        output files (prefix, output, tblout, domtblout, pfamtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    result = HmmscanResult(
        prefix,
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout", prefix + ".pfamtblout")

    cmd = [
        binary,
        "-o",
        result.output,
        "--tblout",
        result.tblout,
        "--domtblout",
        result.domtblout,
        "--pfamtblout",
        result.pfamtblout,
        "--notextw",
        "--acc",
    ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # either use model-specific threshold, or custom
    # bitscore/E-value thresholds
    if use_model_threshold:
        THRESHOLD_CHOICES = ["cut_ga", "cut_nc", "cut_tc"]
        if threshold_type not in THRESHOLD_CHOICES:
            raise ValueError("Invalid model threshold, valid choices are: " +
                             ", ".join(THRESHOLD_CHOICES))

        cmd += ["--" + threshold_type]
    else:
        if seq_threshold is None or domain_threshold is None:
            raise ValueError("Must define sequence- and domain-level reporting"
                             "thresholds, or use gathering threshold instead.")

        if use_bitscores:
            cmd += [
                "-T",
                str(seq_threshold),
                "--domT",
                str(domain_threshold),
            ]
        else:
            cmd += [
                "-E",
                str(seq_threshold),
                "--domE",
                str(domain_threshold),
            ]

    cmd += [database, query]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created a table with hits
    verify_resources(
        "hmmscan did not return results: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.domtblout),
        result.domtblout)

    return result
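A minimal usage sketch (file names are hypothetical; the HMM database must have been prepared with hmmpress):

# scan sequences against an HMM database using the model-specific
# gathering thresholds stored with each model
result = run_hmmscan(
    query="proteome.fasta", database="Pfam-A.hmm",
    prefix="output/scan", use_model_threshold=True,
    threshold_type="cut_ga", cpu=2
)
print(result.domtblout)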
Example #7
def run_cns_13(inp_script=None,
               inp_file=None,
               log_file=None,
               source_script=None,
               binary="cns"):
    """
    Run CNSsolve 1.3

    Note that the user is responsible for verifying the output products
    of CNS, since their paths are determined by .inp scripts and
    hard to check automatically and in a general way.

    Either inp_script or inp_file has to be specified.

    Parameters
    ----------
    inp_script : str, optional (default: None)
        CNS ".inp" input script (actual commands, not file)
    inp_file : str, optional (default: None)
        Path to .inp input script file. Will override
        inp_script if also specified.
    log_file : str, optional (default: None)
        Save CNS stdout output to this file
    source_script : str, optional (default: None)
        Script to set CNS environment variables.
        This should typically point to .cns_solve_env_sh
        in the CNS installation main directory (the
        shell script itself needs to be edited to
        contain the path of the installation)
    binary : str, optional (default: "cns")
        Name of CNS binary

    Raises
    ------
    ExternalToolError
        If call to CNS fails
    InvalidParameterError
        If no input script (file or string) given
    """
    # usually need to source script to set up environment for CNS
    if source_script is not None:
        cmd = "source {};".format(source_script)
    else:
        cmd = ""

    cmd += binary

    if inp_script is None and inp_file is None:
        raise InvalidParameterError(
            "Must specify either input_script or input_file")

    # read input script, this is fed into CNS using stdin
    if inp_file is not None:
        with open(inp_file) as f:
            inp_script = f.read()

    # run and store output
    return_code, stdout, stderr = run(cmd, stdin=inp_script, shell=True)

    # write stdout output to log file
    if log_file is not None:
        with open(log_file, "w") as f:
            f.write(stdout)
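A minimal usage sketch (all paths are hypothetical; source_script must point to the .cns_solve_env_sh of an actual CNS 1.3 installation):

run_cns_13(
    inp_file="anneal.inp",
    log_file="cns.log",
    source_script="/opt/cns_solve_1.3/.cns_solve_env_sh"
)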
Example #8
def run_cns(inp_script=None, inp_file=None, log_file=None, binary="cns"):
    """
    Run CNSsolve 1.21 (without worrying about environment setup)

    Note that the user is responsible for verifying the output products
    of CNS, since their paths are determined by .inp scripts and
    hard to check automatically and in a general way.

    Either inp_script or inp_file has to be specified.

    Parameters
    ----------
    inp_script : str, optional (default: None)
        CNS ".inp" input script (actual commands, not file)
    inp_file : str, optional (default: None)
        Path to .inp input script file. Will override
        inp_script if also specified.
    log_file : str, optional (default: None)
        Save CNS stdout output to this file
    binary : str, optional (default: "cns")
        Absolute path of CNS binary

    Raises
    ------
    ExternalToolError
        If call to CNS fails
    InvalidParameterError
        If no input script (file or string) given
    """
    # make sure we have absolute path
    binary = path.abspath(binary)

    # extract main installation directory
    cns_main_dir = binary
    for i in range(3):
        cns_main_dir = path.dirname(cns_main_dir)

    # create environment
    env = deepcopy(os.environ)
    library_dir = path.join(cns_main_dir, "libraries")
    module_dir = path.join(cns_main_dir, "modules")

    env["CNS_SOLVE"] = cns_main_dir
    env["CNS_LIB"] = library_dir
    env["CNS_MODULE"] = module_dir
    env["CNS_HELPLIB"] = path.join(cns_main_dir, "helplip")

    for var, subdir in [
        ("CNS_TOPPAR", "toppar"),
        ("CNS_CONFDB", "confdb"),
        ("CNS_XTALLIB", "xtal"),
        ("CNS_NMRLIB", "nmr"),
        ("CNS_XRAYLIB", "xray"),
    ]:
        env[var] = path.join(library_dir, subdir)

    for var, subdir in [
        ("CNS_XTALMODULE", "xtal"),
        ("CNS_NMRMODULE", "nmr"),
    ]:
        env[var] = path.join(module_dir, subdir)

    if inp_script is None and inp_file is None:
        raise InvalidParameterError(
            "Must specify either input_script or input_file")

    # read input script, this is fed into CNS using stdin
    if inp_file is not None:
        with open(inp_file) as f:
            inp_script = f.read()

    # run and store output
    return_code, stdout, stderr = run(binary, stdin=inp_script)

    # write stdout output to log file
    if log_file is not None:
        with open(log_file, "w") as f:
            f.write(stdout)
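A minimal usage sketch (the binary path is hypothetical). Since the function derives the installation directory by walking up three directory levels from the binary, the path must point into a standard CNS directory layout:

run_cns(
    inp_file="generate_seq.inp",
    log_file="cns.log",
    binary="/opt/cns_solve_1.21/intel-x86_64bit-linux/bin/cns"
)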
Example #9
def run_psipred(fasta_file, output_dir, binary="runpsipred"):
    """
    Run psipred secondary structure prediction

    psipred output file convention: run_psipred creates
    output files <rootname>.ss2 and <rootname>.horiz
    in the current working directory, where <rootname>
    is extracted from the basename of the input file
    (e.g. /home/test/<rootname>.fa)

    Parameters
    ----------
    fasta_file : str
        Input sequence file in FASTA format
    output_dir : str
        Directory in which output will be saved
    binary : str, optional (default: "cns")
        Path of psipred executable (runpsipred)

    Returns
    -------
    ss2_file : str
        Absolute path to prediction output in "VFORMAT"
    horiz_file : str
        Absolute path to prediction output in "HFORMAT"

    Raises
    ------
    ExternalToolError
        If call to psipred fails
    """
    # make sure we have absolute path
    binary = path.abspath(binary)
    fasta_file = path.abspath(fasta_file)
    output_dir = path.abspath(output_dir)

    # make sure input file is valid
    verify_resources("Input FASTA file is invalid", fasta_file)

    # make sure output directory exists
    makedirs(output_dir)

    # execute psipred;
    # we have to start it from output directory so
    # result files end up there (this is hardcoded
    # in runpsipred)
    return_code, stdout, stderr = run(
        [binary, fasta_file],
        working_dir=output_dir,
    )

    # determine where psipred will store output based
    # on logic from runpsipred script
    rootname, _ = path.splitext(path.basename(fasta_file))
    output_prefix = path.join(output_dir, rootname)

    # construct paths to output files in vertical and horizontal formats
    ss2_file = output_prefix + ".ss2"
    horiz_file = output_prefix + ".horiz"

    # make sure we actually predicted something
    verify_resources("psipred output is invalid", ss2_file, horiz_file)

    return ss2_file, horiz_file
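A minimal usage sketch (file and directory names are hypothetical; the runpsipred script must be installed and on PATH):

ss2_file, horiz_file = run_psipred("query.fa", "psipred_output")
# results end up as psipred_output/query.ss2 and psipred_output/query.horiz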
Example #10
def run_hmmbuild(alignment_file,
                 prefix,
                 cpu=None,
                 stdout_redirect=None,
                 symfrac=None,
                 binary="hmmbuild"):
    """
    Profile HMM construction from multiple sequence alignments
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    alignment_file : str
        File containing the multiple sequence alignment. Can be in
        Stockholm, A2M, or Clustal format, or any other format
        recognized by HMMER. Please note that ALL positions
        above the symfrac cutoff will be used in HMM
        construction (if the alignment contains columns that are
        insertions relative to the query sequence, this may be
        problematic for structure comparison)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    symfrac : float, optional (default: None)
        Range 0.0 - 1.0. hmmbuild will use columns with
        a residue (non-gap) fraction >= symfrac to construct
        the HMM. If None, the hmmbuild internal default
        of 0.5 is used.
        (Note: this is calculated after hmmbuild's internal
        sequence weighting is applied)
    binary : str (default: "hmmbuild")
        Path to hmmbuild binary (put in PATH for
        default to work)

    Returns
    -------
    HmmbuildResult
        namedtuple with fields corresponding to the different
        output files (prefix, hmmfile, output)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", alignment_file)

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmbuildResult(
        prefix,
        prefix + ".hmm",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
    )

    cmd = [
        binary,
        "-o",
        result.output,
    ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    if symfrac is not None:
        cmd += ["--symfrac", str(symfrac)]

    cmd += [result.hmmfile, alignment_file]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created an HMM profile
    verify_resources(
        "hmmbuild returned empty HMM profile: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.hmmfile),
        result.hmmfile)

    return result
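A minimal usage sketch (file names are hypothetical; hmmbuild must be on PATH):

result = run_hmmbuild(
    "alignment.sto", prefix="output/profile",
    symfrac=0.5, cpu=2
)
print(result.hmmfile)  # output/profile.hmm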
Example #11
def run_hmmsearch(hmmfile,
                  database,
                  prefix,
                  use_bitscores,
                  domain_threshold,
                  seq_threshold,
                  nobias=False,
                  cpu=None,
                  stdout_redirect=None,
                  binary="hmmsearch"):
    """
    Search profile(s) against a sequence database.
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    hmmfile : str
        File containing the profile(s)
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmsearch")
        Path to hmmsearch binary (put in PATH for
        default to work)

    Returns
    -------
    HmmsearchResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", hmmfile,
                     database)

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmsearchResult(
        prefix, prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout")

    cmd = [
        binary, "-o", result.output, "-A", result.alignment, "--tblout",
        result.tblout, "--domtblout", result.domtblout, "--noali", "--notextw"
    ]

    # reporting thresholds are set according to
    # inclusion thresholds to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold), "--domT", str(domain_threshold),
            "--incT", str(seq_threshold), "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold), "--domE", str(domain_threshold),
            "--incE", str(seq_threshold), "--incdomE", str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    cmd += [hmmfile, database]

    return_code, stdout, stderr = run(cmd)

    return result
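A minimal usage sketch (file names and thresholds are hypothetical), searching e.g. the profile built by run_hmmbuild above against a sequence database with E-value inclusion thresholds:

result = run_hmmsearch(
    hmmfile="output/profile.hmm", database="uniref90.fasta",
    prefix="output/hits", use_bitscores=False,
    domain_threshold="1E-03", seq_threshold="1E-03", cpu=4
)
print(result.alignment)  # output/hits.sto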