Beispiel #1
0
def write_fragment_sequence_profiles(df, filename=None, consensus=None):
    """Render a sequence profile in a **BLAST PSSM**-like layout, suitable
    as input for Rosetta's **SeqprofConsensus**.

    :param df: Fragments from which the matrix is built (through
        ``make_sequence_matrix(round=True)``) or an already computed matrix,
        which is copied.
    :type df: Union[:class:`.FragmentFrame`, :class:`~pandas.DataFrame`]
    :param str filename: Output file name. When provided, the content is
        written to disk and nothing is returned.
    :param str consensus: Consensus sequence to show. When :data:`None`, it is
        requested from ``df.quick_consensus_sequence()``.

    :return: :class:`str` - the file content if ``filename`` is :data:`None`.

    :raises:
        :IOError: if ``filename`` exists and :ref:`system.overwrite <options>` is
            :data:`False`.
        :ValueError: if the ``consensus`` length differs from the number of rows
            of the matrix.
    """
    # Constant trailer appended to every position line.
    trailer = "{0:>6.2f}{0:>8.2f}".format(0)

    def render_position(scores, residue, blanks):
        # One line per position: index, consensus residue, the scores and
        # a second block built from the zero-filled companion row.
        observed = "".join(scores.apply("{0:>3d}".format))
        zeroed = "".join(blanks.apply("{0:>4d}".format))
        label = "{0:>5d} {1}  ".format(scores.name + 1, residue)
        return label + observed + " " + zeroed + trailer

    matrix = (df.make_sequence_matrix(round=True)
              if isinstance(df, rc.FragmentFrame) else df.copy())
    if consensus is None:
        consensus = df.quick_consensus_sequence()
    if len(consensus) != matrix.shape[0]:
        raise ValueError('Sequence need to be the same length.')

    # Same shape as the matrix, every cell set to 0.
    zeros = matrix.copy()
    zeros[:] = 0

    positions = matrix.apply(
        lambda pos: render_position(pos, consensus[pos.name],
                                    zeros.iloc[pos.name]),
        axis=1)

    columns = list(matrix.columns)
    header = ("{0:>11}".format(" ") + "  ".join(columns) +
              "   " + "   ".join(columns))
    banner = ("Last position-specific scoring matrix computed, weighted observed percentages "
              "rounded down, information per position, and relative weight of gapless real "
              "matches to pseudocounts")
    content = "\n".join(["", banner, header] + list(positions))

    if filename is None:
        return content

    if os.path.isfile(filename) and not core.get_option("system", "overwrite"):
        raise IOError("File {} already exists".format(filename))
    with open(filename, "w") as fd:
        fd.write(content)
Beispiel #2
0
def make_rosetta_app_path(application):
    """Build the full path of a Rosetta application.

    The executable name is ``<application>.<compilation>`` and it is looked
    for inside the configured Rosetta directory.

    .. note::
        Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`.

    :param str application: Name of the application to call.

    :return: :class:`str` - path to the executable.

    :raise:
        :IOError: If the final path created does not exist.
    """
    import rstoolbox.core as core

    rosetta_dir = core.get_option("rosetta", "path")
    build_tag = core.get_option("rosetta", "compilation")
    binary_name = "{0}.{1}".format(application, build_tag)
    candidate = os.path.join(rosetta_dir, binary_name)
    if os.path.isfile(candidate):
        return candidate
    raise IOError("The expected Rosetta executable {0} is not found".format(candidate))
Beispiel #3
0
def write_clustalw(df, seqID, filename=None):
    """Write sequences of selected designs as a CLUSTALW alignment.

    If a ``reference_sequence`` exists, it is set up as the first sequence
    of the alignment. The name assigned to it will be the multiple longest common
    subsequence of all the decoys ``description``. If none is found, or if it
    actually matches one of the already existing identifiers, then it will default
    to *reference*.

    Sequences are printed in blocks of 50 residues; the number of blocks is
    taken from the first sequence, so all sequences are expected to have the
    same length. An empty selection produces only the header line.

    :param df: Data content.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.
    :type seqID: :class:`str`
    :param filename: Output file name. If it ends with ``.gz`` the file is
        written gzip-compressed.
    :type filename: :class:`str`

    :return: :class:`str` - **CLUSTALW** formatted string.

    :raises:
        :IOError: If ``filename`` exists and global option :ref:`system.overwrite <options>`
            is not :data:`True`.
        :AttributeError: |seqID_error|.

    .. note::
        Depends on :ref:`system.overwrite <options>` and :ref:`system.output <options>`.
    """
    def chunkstring(string, length):
        return [string[i:i + length] for i in range(0, len(string), length)]

    if filename is not None:
        if os.path.isfile(filename) and not core.get_option(
                "system", "overwrite"):
            raise IOError("File {} already exists".format(filename))

    data = ["CLUSTAL W(1.83) multiple sequence alignment\n"]
    names = list(df.get_id().values)
    seqs = list(df.get_sequence(seqID).values)
    if df.has_reference_sequence(seqID):
        refname = mlcs(names)
        if len(refname) > 0 and refname not in names:
            # Drop a trailing "_000"-like counter left over from the decoy names.
            if re.match(r'[\w\d\_\.]+\_[0]*$', str(refname)):
                refname = refname.rstrip("0").rstrip("_")
        else:
            refname = "reference"
        names.insert(0, refname)
        seqs.insert(0, df.get_reference_sequence(seqID))

    # Guard against an empty selection: seqs[0] / max([]) would fail.
    if seqs:
        blocks = [chunkstring(sequence, 50) for sequence in seqs]
        template = "{:<" + str(max(len(name) for name in names)) + "} {}"
        for j in range(len(blocks[0])):
            for i, name in enumerate(names):
                data.append(template.format(name, blocks[i][j]))
            data.append("\n")

    content = "\n".join(data)
    if filename is not None:
        # Text mode in both cases: writing a str to a gzip file opened as
        # "wb" raises TypeError on Python 3.
        opener = gzip.open if filename.endswith(".gz") else open
        with opener(filename, "wt") as fd:
            fd.write(content)

    return content
Beispiel #4
0
def write_fasta(df, seqID, separator=None, filename=None, split=False):
    """Writes fasta files of the selected decoys.

    It assumes that the provided data is contained in a :class:`.DesignFrame`
    or a :class:`~pandas.DataFrame`.

    Mandatory columns are:

    ====================  ===================================================
    Column Name           Data Content
    ====================  ===================================================
    **description**       Sequence identifier.
    **sequence_<seqID>**  Sequence content.
    ====================  ===================================================

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", multi=True)
           ...: print(write_fasta(df, "A"))

    When working with multiple ``seqID``, one can select which ones to be printed;
    empty sequences will be skipped.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", expand=True, multi=True)
           ...: print(write_fasta(df, "AC"))

    :param df: Data content.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.
    :param str separator: Add ``seqID`` to sequence identifier through a particular
        string separator. If multiple ``seqID`` are provided, it defaults to ``:``.
    :param str filename: Output file name. Names ending in ``.gz`` are written
        gzip-compressed.
    :param bool split: Split each fasta in a different file. ``filename`` first part of the filename
        is used as `prefix`, with a following enumeration. Empty sequences do
        not generate a file, but they still consume their enumeration number.

    :return: :class:`str` - **FASTA** formatted string.

    :raises:
        :IOError: If ``filename`` exists and global option :ref:`system.overwrite <options>`
            is not :data:`True`.
        :AttributeError: |seqID_error|.

    .. note::
        Depends on :ref:`system.overwrite <options>` and :ref:`system.output <options>`.

    .. seealso::
        :func:`~.read_fasta`
    """
    def nomenclator(row, chain, sep):
        # Missing sequences (None, NaN floats or empty strings) yield "".
        sequence = row.get_sequence(chain)
        if sequence is None or isinstance(sequence,
                                          float) or len(sequence) == 0:
            return ""
        name = ">" + row.get_id()
        if sep is not None:
            name = name + sep + chain
        return name + "\n" + sequence

    def dump(path, text):
        # Text mode in both cases: a str written to a gzip file opened as
        # "wb" raises TypeError on Python 3.
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "wt") as fd:
            fd.write(text)

    if filename is not None:
        if os.path.isfile(filename) and not core.get_option(
                "system", "overwrite"):
            raise IOError("File {} already exists".format(filename))
    if not isinstance(df, cp.DesignFrame):
        df = cp.DesignFrame(df)
    # The implicit separator only applies when several seqID are requested.
    if len(seqID) > 1 and separator is None:
        separator = ":"

    data = []
    for chain in seqID:
        eachfa = df.apply(lambda row: nomenclator(row, chain, separator),
                          axis=1)
        data.extend(eachfa.values)

    # Empty entries are skipped so that no blank lines end up between records.
    content = "\n".join(entry for entry in data if entry).strip() + "\n"

    if filename is not None:
        if not split:
            dump(filename, content)
        else:
            suffix = "_f{0:04d}"
            root, extension = os.path.splitext(filename)
            for i, sequence in enumerate(data):
                if not sequence:
                    continue
                dump(root + suffix.format(i + 1) + extension, sequence + "\n")

    return content
Beispiel #5
0
def make_structures(df,
                    outdir=None,
                    tagsfilename="tags",
                    prefix=None,
                    keep_tagfile=True):
    """Extract the selected decoys (if any).

    .. note::
        Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`.
        Depends on :ref:`system.overwrite <options>` and :ref:`system.output <options>`.

    .. attention::
        This function **REQUIRES** a local installation of **Rosetta**.

    It basically runs the ``extract_pdbs`` **Rosetta** application over the selected decoys.

    .. code-block:: bash

       extract_pdbs.linuxgccrelease -in:file:silent <silent files> -in:file:tagfile <tags>

    It requires the :class:`.DesignFrame` to have ``source_file`` attached identifying the
    silent files from which the data can be extracted. **minisilent files will not work here**.
    This should happen by default with the library, if one reads from actual silent files,
    but can be set up with::

        # (1) Read from a minisilent file that does not contain structural data: substitute
        ``df.replace_source_files(["file1", "file2", ])``
        # (2) Add files to a recently casted DesignFrame
        ``df.add_source_files(["file1", "file2", ])``

    The outcome of the execution is reported through standard output; a failed
    execution does not raise an exception.

    :param df: Selected set of decoy that have to be extracted.
    :type df: :class:`.DesignFrame`
    :param str outdir: Directory in which to save the PDB files. If none is provided,
        it will be loaded from the :ref:`system.output <options>` global option.
        It is created if it does not exist.
    :param str tagsfilename: Name of the file containing the ids of the decoys of interest.
        It will be created in the ``outdir``. A previously existing file will not be
        overwritten if the global option :ref:`system.overwrite <options>` is :data:`False`.
    :param str prefix: If provided, a prefix is added to the PDB files.
    :param bool keep_tagfile: If :data:`True`, do not delete the tag file after using it.

    :raises:
        :ValueError: if the provided data does not have a **description** column.
        :ValueError: if the **description** column has repeated identifiers.
        :AttributeError: if ``df`` is not a :class:`.DesignFrame` or it has no
            source files attached.
        :IOError: when trying to overwrite the ``tagsfilename`` if *system.overwrite* is
            :py:data:`False`.
        :IOError: if the rosetta executable is not found. Depends on *rosetta.path* and
            *rosetta.compilation*.
        :IOError: if the tag file cannot be found after writing it.
    """
    import subprocess

    # Check that the selection has at least one decoy
    if df.shape[0] == 0:
        sys.stdout.write(
            "There are no decoys that fullfill the selection criteria.")
        return

    # Check that a column named "description", from which the IDs are (must be unique)
    column = "description"
    if column not in df.columns:
        raise ValueError(
            "Identifiers of the decoys must be assigned to the column 'description'."
        )
    if df.duplicated(column).any():
        raise ValueError(
            "There are repeated identifiers. This might indicate the merging of files "
            "with identical prefixes and will be an issue with extracting "
            " the structures.")

    # Check that we have associated silent files to extract the data from
    if not isinstance(df, rc.DesignFrame) or len(df.get_source_files()) == 0:
        raise AttributeError(
            "There are not source files from where to extract the structures.")
    sfiles = list(df.get_source_files())

    # Manage output directory
    outdir = outdir if outdir is not None else core.get_option(
        "system", "output")
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    if not outdir.endswith("/"):
        outdir += "/"

    # Manage tag file name
    tagsfilename = os.path.join(outdir, tagsfilename)
    if os.path.isfile(tagsfilename) and not core.get_option(
            "system", "overwrite"):
        raise IOError(
            "Filename {0} already exists and cannot be overwrite.".format(
                tagsfilename))

    # Manage prefix
    if prefix is not None:
        outdir = os.path.join(outdir, prefix)

    # Check rosetta executable
    path = core.get_option("rosetta", "path")
    comp = core.get_option("rosetta", "compilation")
    exe = os.path.join(path, "extract_pdbs.{0}".format(comp))
    if not os.path.isfile(exe):
        raise IOError(
            "The expected Rosetta executable {0} is not found".format(exe))

    # Print the tag file
    df[[column]].to_csv(tagsfilename, index=False, header=False)
    if not os.path.isfile(tagsfilename):
        raise IOError(
            "Something went wrong writing the file {0}".format(tagsfilename))

    # Run process. The command is passed as an argument list (no shell), so
    # that paths containing spaces or shell metacharacters are kept intact.
    command = [exe, "-in:file:silent"] + sfiles + [
        "-in:file:tagfile", tagsfilename, "-out:prefix", outdir]
    sys.stdout.write("Executing Rosetta's extract_pdbs app\n")
    sys.stdout.write(
        "(depending on the total number of decoys and how many have "
        "been requested this might take a while...)\n")
    try:
        error = subprocess.call(command)
    except OSError:
        # The executable could not be launched; report it as a failed run.
        error = 1
    if not bool(error):
        sys.stdout.write("Execution has finished\n")
    else:
        sys.stdout.write("Execution has failed\n")

    # Remove extra files if requested
    if not keep_tagfile:
        os.unlink(tagsfilename)
Beispiel #6
0
def get_sequence_and_structure(pdbfile):
    """Provided a PDB file, it will run a small **RosettaScript** to capture its sequence and
    structure, i.e. dssp and phi-psi dihedrals.

    .. note::
        Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`,
        if the minisilent file does not exist yet.

    .. attention::
        This function **REQUIRES** a local installation of **Rosetta**.

    It will generate an output file called ``<name>.dssp.minisilent``, where
    ``<name>`` is ``pdbfile`` without its trailing ``.gz`` and ``.pdb``/``.cif``
    extensions. If this file already exists, it will be directly read. You can
    choose to compress it; ``<name>.dssp.minisilent.gz`` will also work.

    While running, a temporary ``dssp.xml`` script and a ``<pid>_`` silent file
    are created in the current working directory; both are removed afterwards.

    :param str pdbfile: Name of the input structure.

    :return: :class:`.DesignFrame`.

    :raises:
        :IOError: if ``pdbfile`` cannot be found.
        :IOError: if Rosetta executable cannot be found.
        :ValueError: if Rosetta execution fails

    The script that is run (and that can be run elsewhere, bringing the output back to
    your computer) is:

    .. ipython::

        In [1]: from rstoolbox.utils import baseline
           ...: print(baseline())

    .. seealso::
        :func:`.baseline`
    """
    import subprocess

    if not os.path.isfile(pdbfile):
        raise IOError("Structure {} cannot be found".format(pdbfile))

    # Strip only trailing extensions; an unanchored ".pdb" would also be
    # removed from the middle of the path (e.g. a directory name).
    minisilent = re.sub(r"\.(pdb|cif)$", "", re.sub(
        r"\.gz$", "", pdbfile)) + ".dssp.minisilent"
    selection = {"sequence": "*", "structure": "*", "dihedrals": "*"}
    for candidate in (minisilent, minisilent + ".gz"):
        if os.path.isfile(candidate):
            return parse_rosetta_file(candidate, selection)

    # Check rosetta executable before creating any temporary file
    path = core.get_option("rosetta", "path")
    comp = core.get_option("rosetta", "compilation")
    exe = os.path.join(path, "rosetta_scripts.{0}".format(comp))
    if not os.path.isfile(exe):
        raise IOError(
            "The expected Rosetta executable {0} is not found".format(exe))

    script = "dssp.xml"
    silent = str(os.getpid()) + "_"
    # Argument list (no shell): paths with spaces are kept intact.
    command = [exe, "-parser:protocol", script, "-s", pdbfile,
               "-out:file:silent", silent, "-ignore_unrecognized_res"]

    with open(script, "w") as fd:
        fd.write(baseline())
    sys.stdout.write("Running Rosetta\n")
    sys.stdout.write(" ".join(command) + "\n")
    try:
        try:
            error = subprocess.call(command)
        except OSError:
            # The executable could not be launched; treat as a failed run.
            error = 1
    finally:
        os.unlink(script)

    if bool(error) or not os.path.isfile(silent):
        raise ValueError("Execution has failed\n")

    sys.stdout.write("Execution has finished\n")
    with open(minisilent, "w") as fd:
        for line, _, _, _ in open_rosetta_file(silent):
            fd.write(line)
    os.unlink(silent)
    return parse_rosetta_file(minisilent, selection)
Beispiel #7
0
    def add_quality_measure(self, filename, pdbfile=None):
        """Add RMSD quality measure to the fragment data.

        The RMSD quality measurement is performed by the ``r_frag_quality`` application
        from **Rosetta**. It can be called as:

        .. code-block:: bash

           r_frag_quality.linuxgccrelease -in:file:native <pdb> -f <fragfile> -out:qual <output>

        :param str filename: Name of the file containing the quality measure. If ``filename``
            is :data:`None`, the expected name is that of the source file (without ``.gz``)
            with the extension ".qual". If a file with this naming schema exists (plain or
            ``.gz``), it will be automatically picked; otherwise the ``r_frag_quality``
            application is run to create it, for which a ``pdbfile`` is needed.
        :param str pdbfile: In case the quality has to be calculated. Provide the
            PDB over which to calculate it. Default is :data:`None`.

        :return: The fragment data left-merged with the ``rmsd`` column on
            ``size``, ``frame`` and ``neighbor``.

        :raises:
            :IOError: if ``filename`` does not exist.
            :IOError: if the quality has to be calculated and ``pdbfile`` is not
                provided or does not exist.
            :IOError: if the *Rosetta* executable is not found.
            :AttributeError: if ``filename`` is :data:`None` and there is no attached
                ``source_file`` to the object.
            :ValueError: if no rmsd data is assigned. Might indicate that wrong data is
                trying to be assigned.

        .. note::
            Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`,
            if the quality file is not provided.

        .. attention::
            Some configurations of this function require a local installation of **Rosetta**.
        """
        import subprocess

        if filename is None and not self.has_source_file():
            raise AttributeError(
                "No quality file is provided and no source file can be found.")

        # Make the quality fragment eval if needed.
        if filename is None:
            sofi = self._source_file
            if sofi.endswith(".gz"):
                sofi = sofi[:-len(".gz")]
            filename = sofi + ".qual"
            if os.path.isfile(filename + ".gz"):
                filename = filename + ".gz"
            elif not os.path.isfile(filename):
                # Check rosetta executable
                exe = os.path.join(
                    core.get_option("rosetta", "path"),
                    "r_frag_quality.{0}".format(
                        core.get_option("rosetta", "compilation")))
                if not os.path.isfile(exe):
                    raise IOError(
                        "The expected Rosetta executable {0} is not found".
                        format(exe))
                # os.path.isfile(None) would fail with an unrelated TypeError.
                if pdbfile is None:
                    raise IOError(
                        "A pdbfile is needed to calculate the fragment quality.")
                if not os.path.isfile(pdbfile):
                    raise IOError("{0} not found".format(pdbfile))
                # Argument list (no shell): paths with spaces are kept intact.
                command = [exe, "-in:file:native", pdbfile,
                           "-f", self._source_file, "-out:qual", filename]
                try:
                    error = subprocess.call(command)
                except OSError:
                    # The executable could not be launched; report as failure.
                    error = 1
                if not bool(error):
                    sys.stdout.write("Execution has finished\n")
                else:
                    sys.stdout.write("Execution has failed\n")

        # Load the data: whitespace separated, only the first four columns are kept.
        df = pd.read_csv(
            filename,
            header=None,
            sep=r"\s+",
            names=["size", "frame", "neighbor", "rmsd", "_null1", "_null2"],
            usecols=["size", "frame", "neighbor", "rmsd"])

        df = self.merge(df, how='left', on=["size", "frame", "neighbor"])
        if (df['rmsd'].isnull()).all():
            raise ValueError(
                'No rmsd was assigned to the fragment data. '
                'Check that the correct quality data is being assigned.')
        return df