Example #1
def run_maxcluster_cluster(predictions,
                           method="average",
                           rmsd=True,
                           clustering_threshold=None,
                           binary="maxcluster"):
    """
    Cluster a set of predicted structures using maxcluster.

    To compare predictions against an experimental structure,
    use run_maxcluster_compare() instead.

    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be compared against experiment
    method : {"single", "average", "maximum", "pairs_min", "pairs_abs"}, optional (default: "average")
        Clustering method (single / average / maximum linkage,
        or min / absolute size neighbour pairs
    clustering_threshold : float (optional, default: None)
        Initial clustering threshold (maxcluster -T option)
    rmsd : bool, optional (default: True)
        Use RMSD-based clustering (faster)
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Clustering result table (see parse_maxcluster_clustering
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    # map clustering method names to maxcluster -C option values
    method_map = {
        "single": 1,
        "average": 2,
        "maximum": 3,
        "pairs_min": 4,
        "pairs_abs": 5,
    }

    if method not in method_map:
        raise InvalidParameterError("Method must be one of the following: " +
                                    ", ".join(method_map.keys()))

    cmd = [binary, "-l", list_file, "-C", str(method_map[method])]

    if rmsd:
        cmd += ["-rmsd"]

    if clustering_threshold is not None:
        cmd += ["-T", str(clustering_threshold)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_clustering(stdout)
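
A minimal usage sketch, assuming run_maxcluster_cluster() above is in scope and the maxcluster binary is on the PATH; the model file names are hypothetical:

# cluster a set of hypothetical model files with average linkage
models = ["model_001.pdb", "model_002.pdb", "model_003.pdb"]

clusters = run_maxcluster_cluster(
    models,
    method="average",          # average linkage (maxcluster -C 2)
    rmsd=True,                 # RMSD-based clustering (faster)
    clustering_threshold=3.5,  # initial threshold passed via -T
)
print(clusters.head())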
Example #2
def run_maxcluster_compare(predictions,
                           experiment,
                           normalization_length=None,
                           distance_cutoff=None,
                           binary="maxcluster"):
    """
    Compare a set of predicted structures to an experimental structure
    using maxcluster.
    
    For clustering functionality, use run_maxcluster_cluster() instead.
    
    For a high-level wrapper around this function that removes
    problematic atoms and compares multiple models, please look at 
    evcouplings.fold.protocol.compare_models_maxcluster().
    
    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be compared against experiment
    experiment : str
        Path of experimental structure PDB file. Note that the numbering
        and residues in this file must agree with the predicted structure,
        and that the structure may not contain duplicate atoms (multiple
        models, or alternative locations for the same atom).
    normalization_length : int, optional (default: None)
        Use this length to normalize the Template Modeling (TM)
        score (-N option of maxcluster). If None, will normalize
        by length of experiment.
    distance_cutoff : float, optional (default: None)
        Distance cutoff for MaxSub search (-d option of maxcluster).
        If None, will use maxcluster auto-calibration.
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Comparison result table (see parse_maxcluster_comparison
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    cmd = [binary, "-l", list_file, "-e", experiment]

    # normalization length for TM score calculation
    if normalization_length is not None:
        cmd += ["-N", str(normalization_length)]

    # distance cutoff for MaxSub search
    if distance_cutoff is not None:
        cmd += ["-d", str(distance_cutoff)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_comparison(stdout)
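
A minimal usage sketch, assuming run_maxcluster_compare() above is in scope and maxcluster is installed; file names are hypothetical:

# compare hypothetical models against an experimental structure
comparison = run_maxcluster_compare(
    ["model_001.pdb", "model_002.pdb"],
    experiment="experiment.pdb",
    normalization_length=150,  # normalize TM score by this length (-N)
    distance_cutoff=None,      # let maxcluster auto-calibrate MaxSub (-d)
)
print(comparison.head())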
Example #3
    def _eliminate_altloc(chain):
        # if an atom has multiple alternative locations, keep only
        # the one with the highest occupancy
        chain.coords = chain.coords.loc[chain.coords.groupby(
            ["residue_index", "atom_name"]).occupancy.idxmax()]

        # save the filtered chain to a temporary file
        temp_filename = temp()
        with open(temp_filename, "w") as f:
            chain.to_file(f)
        return temp_filename
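
The selection idiom above can be illustrated on a toy coordinate table; this sketch is self-contained, uses the same column names as the chain's coords table, and the data is made up:

import pandas as pd

# toy coordinate table: residue 1 has two alternative locations
# for its CA atom, residue 2 has only one
coords = pd.DataFrame({
    "residue_index": [1, 1, 2],
    "atom_name": ["CA", "CA", "CA"],
    "alt_loc": ["A", "B", ""],
    "occupancy": [0.4, 0.6, 1.0],
})

# for each (residue_index, atom_name) group, keep the row with
# the highest occupancy - residue 1 keeps altloc "B"
kept = coords.loc[
    coords.groupby(["residue_index", "atom_name"]).occupancy.idxmax()
]
print(kept)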
Example #4
def cns_seq_file(sequence, output_file=None, residues_per_line=16):
    """
    Generate a CNS .seq file for a given protein sequence

    Parameters
    ----------
    sequence : str
        Amino acid sequence in one-letter code
    output_file : str, optional (default: None)
        Save 3-letter code sequence to this file
        (if None, will create temporary file)
    residues_per_line : int, optional (default: 16)
        Print this many residues on each line
        of .seq file

    Returns
    -------
    output_file : str
        Path to file with sequence
        (useful if temporary file was
        generated)

    Raises
    ------
    InvalidParameterError
        If sequence contains invalid symbol
    """
    if output_file is None:
        output_file = temp()

    with open(output_file, "w") as f:
        # split sequence into parts per line
        lines = [
            sequence[i:i + residues_per_line]
            for i in range(0, len(sequence), residues_per_line)
        ]

        # go through lines and transform into 3-letter code
        for line in lines:
            try:
                l3 = " ".join([AA1_to_AA3[aa] for aa in line])
            except KeyError as e:
                raise InvalidParameterError(
                    "Invalid amino acid could not be mapped") from e

            f.write(l3 + "\n")

    return output_file
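
A usage sketch, assuming cns_seq_file() above and its AA1_to_AA3 mapping are in scope; the sequence is made up:

# write a short sequence as a CNS .seq file and inspect the result
seq_file = cns_seq_file("MKVLA", residues_per_line=3)

with open(seq_file) as f:
    print(f.read(), end="")
# expected contents (3-letter codes, 3 residues per line):
# MET LYS VAL
# LEU ALA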
Example #5
def _read_hmmer_table(filename, column_names):
    """
    Parse a HMMER file in (dom)tbl format into
    a pandas DataFrame.

    (Why this is necessary: the file cannot simply be split on
    whitespace with pandas, because the last column contains
    whitespace both in the header and in data rows)

    Parameters
    ----------
    filename : str
        Path of (dom)tbl file
    column_names : list of str
        Columns in the respective format
        (different for tbl and domtbl)

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed (dom)tbl
    """
    res = []
    num_splits = len(column_names) - 1

    with open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue

            fields = line.rstrip().split(maxsplit=num_splits)
            res.append(fields)

    # at this point, all fields in the dataframe are strings, even
    # if they are numeric. As a cheap trick to convert the types,
    # write the table to a CSV file and let pandas guess the types
    # when reading it back, rather than going through
    # convert_objects (deprecated) or to_numeric (more effort)
    tempfile = temp()
    pd.DataFrame(res, columns=column_names).to_csv(tempfile, index=False)

    return pd.read_csv(tempfile)
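
A toy illustration of the maxsplit trick above (the line is made up, not real HMMER output): the final column may itself contain spaces, so the line is split at most len(column_names) - 1 times to keep that column intact.

column_names = ["target", "accession", "description"]
line = "seq1  PF00001.21  7tm receptor (rhodopsin family)\n"

# split on whitespace at most twice; the description survives whole
fields = line.rstrip().split(maxsplit=len(column_names) - 1)
print(fields)
# ['seq1', 'PF00001.21', '7tm receptor (rhodopsin family)']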
Example #6
    def _create_mapping_table(self, sifts_table_file):
        """
        Create a modified SIFTS mapping table (based on the
        file at SIFTS_URL). For some entries, the UniProt
        sequence ranges do not map to a SEQRES sequence range
        of the same length; these PDB IDs are replaced entirely
        by a segment-based mapping extracted from the SIFTS
        REST API.

        Parameters
        ----------
        sifts_table_file : str
            Path where computed table will be stored
        """
        def extract_rows(M, pdb_id):
            res = []

            M = M[pdb_id.lower()]["UniProt"]

            for uniprot_ac, Ms in M.items():
                for x in Ms["mappings"]:
                    res.append({
                        "pdb_id": pdb_id,
                        "pdb_chain": x["chain_id"],
                        "uniprot_ac": uniprot_ac,
                        "resseq_start": x["start"]["residue_number"],
                        "resseq_end": x["end"]["residue_number"],
                        "coord_start": (
                            str(x["start"]["author_residue_number"]) +
                            x["start"]["author_insertion_code"].replace(" ", "")
                        ),
                        "coord_end": (
                            str(x["end"]["author_residue_number"]) +
                            x["end"]["author_insertion_code"].replace(" ", "")
                        ),
                        "uniprot_start": x["unp_start"],
                        "uniprot_end": x["unp_end"],
                    })

            return res

        # download SIFTS table (gzip-compressed csv) to temp file
        temp_download_file = temp()
        get_urllib(SIFTS_URL, temp_download_file)

        # load table and rename columns to internal names, so the
        # code keeps working if SIFTS ever renames their columns
        table = pd.read_csv(temp_download_file,
                            comment="#",
                            compression="gzip").rename(
                                columns={
                                    "PDB": "pdb_id",
                                    "CHAIN": "pdb_chain",
                                    "SP_PRIMARY": "uniprot_ac",
                                    "RES_BEG": "resseq_start",
                                    "RES_END": "resseq_end",
                                    "PDB_BEG": "coord_start",
                                    "PDB_END": "coord_end",
                                    "SP_BEG": "uniprot_start",
                                    "SP_END": "uniprot_end",
                                })

        # TODO: remove the following if new segment-based table proves as robust solution
        """
        # this block disabled for now due to use of new table
        # based on observed UniProt segments
        # - can probably be removed eventually

        # identify problematic PDB IDs
        problematic_ids = table.query(
            "(resseq_end - resseq_start) != (uniprot_end - uniprot_start)"
        ).pdb_id.unique()
        
        # collect new mappings from segment based REST API
        res = []
        for i, pdb_id in enumerate(problematic_ids):
            r = requests.get(
                SIFTS_REST_API.format(pdb_id.lower())
            )
            mapping = json.loads(r.text)

            res += extract_rows(mapping, pdb_id)

        # remove bad PDB IDs from table and add new mapping
        new_table = table.loc[~table.pdb_id.isin(problematic_ids)]

        # also disabled due to use of new table based on observed
        # UniProt segments - can probably be removed eventually 
        
        new_table = new_table.append(
            pd.DataFrame(res).loc[:, table.columns]
        )
        """

        # save for later reuse
        table.to_csv(sifts_table_file, index=False)
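
A sketch of how the resulting table might be queried afterwards; the file name and the PDB ID/chain are hypothetical:

import pandas as pd

# load the mapping table written above and look up all UniProt
# segments mapped to one PDB chain
sifts = pd.read_csv("sifts_mapping.csv", dtype={"pdb_id": str})

hits = sifts.query("pdb_id == '1abc' and pdb_chain == 'A'")
print(hits[["uniprot_ac", "uniprot_start", "uniprot_end"]])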