Python filter_dataframe Examples, graphein.protein.utils.filter_dataframe Python Examples

Example #1

0

Show file

File: distance.py Project: BatoolMM/graphein

def add_disulfide_interactions(G: nx.Graph,
                               rgroup_df: Optional[pd.DataFrame] = None):
    """
    Find all disulfide interactions between CYS residues.

    Criteria: sulfur atom pairs are within 2.2A of each other.

    :param G: networkx protein graph
    :type G: nx.Graph
    :param rgroup_df: pd.DataFrame containing rgroup data, defaults to None, which retrieves the df from the provided nx graph.
    :type rgroup_df: pd.DataFrame, optional
    """
    # Check for existence of at least two Cysteine residues
    residues = [d["residue_name"] for _, d in G.nodes(data=True)]
    if residues.count("CYS") < 2:
        log.debug(
            f"{residues.count('CYS')} CYS residues found. Cannot add disulfide interactions with fewer than two CYS residues."
        )
        return

    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    disulfide_df = filter_dataframe(rgroup_df, "residue_name", DISULFIDE_RESIS,
                                    True)
    disulfide_df = filter_dataframe(disulfide_df, "atom_name", DISULFIDE_ATOMS,
                                    True)
    distmat = compute_distmat(disulfide_df)
    interacting_atoms = get_interacting_atoms(2.2, distmat)
    add_interacting_resis(G, interacting_atoms, disulfide_df, ["disulfide"])

Example #2

0

Show file

File: distance.py Project: a-r-j/graphein

def add_hydrophobic_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Find all hydrophobic interactions.

    Performs searches between the following residues:
    ``[ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO, TYR]`` (:const:`~graphein.protein.resi_atoms.HYDROPHOBIC_RESIS`).

    Criteria: R-group residues are within 5A distance.

    :param G: nx.Graph to add hydrophobic interactions to.
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    hydrophobics_df = filter_dataframe(
        rgroup_df, "residue_name", HYDROPHOBIC_RESIS, True
    )
    hydrophobics_df = filter_dataframe(
        hydrophobics_df, "node_id", list(G.nodes()), True
    )
    distmat = compute_distmat(hydrophobics_df)
    interacting_atoms = get_interacting_atoms(5, distmat)
    add_interacting_resis(
        G, interacting_atoms, hydrophobics_df, ["hydrophobic"]
    )

Example #3

0

Show file

File: distance.py Project: BatoolMM/graphein

def get_ring_atoms(dataframe: pd.DataFrame, aa: str) -> pd.DataFrame:
    """
    Return ring atoms from a dataframe.

    A helper function for add_aromatic_interactions.

    Gets the ring atoms from the particular aromatic amino acid.

    Parameters:
    ===========
    - dataframe: the dataframe containing the atom records.
    - aa: the amino acid of interest, passed in as 3-letter string.

    Returns:
    ========
    - dataframe: a filtered dataframe containing just those atoms from the
                    particular amino acid selected. e.g. equivalent to
                    selecting just the ring atoms from a particular amino
                    acid.
    """
    ring_atom_df = filter_dataframe(dataframe, "residue_name", [aa], True)

    ring_atom_df = filter_dataframe(ring_atom_df, "atom_name",
                                    AA_RING_ATOMS[aa], True)
    return ring_atom_df

Example #4

0

Show file

File: distance.py Project: a-r-j/graphein

def add_ionic_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Find all ionic interactions.

    Criteria: ``[ARG, LYS, HIS, ASP, and GLU]`` (:const:`~graphein.protein.resi_atoms.IONIC_RESIS`) residues are within 6A.
    We also check for opposing charges (:const:`~graphein.protein.resi_atoms.POS_AA`, :const:`~graphein.protein.resi_atoms.NEG_AA`)
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    ionic_df = filter_dataframe(rgroup_df, "residue_name", IONIC_RESIS, True)
    ionic_df = filter_dataframe(rgroup_df, "node_id", list(G.nodes()), True)
    distmat = compute_distmat(ionic_df)
    interacting_atoms = get_interacting_atoms(6, distmat)
    add_interacting_resis(G, interacting_atoms, ionic_df, ["ionic"])
    # Check that the interacting residues are of opposite charges
    for r1, r2 in get_edges_by_bond_type(G, "ionic"):
        condition1 = (
            G.nodes[r1]["residue_name"] in POS_AA
            and G.nodes[r2]["residue_name"] in NEG_AA
        )

        condition2 = (
            G.nodes[r2]["residue_name"] in POS_AA
            and G.nodes[r1]["residue_name"] in NEG_AA
        )

        is_ionic = condition1 or condition2
        if not is_ionic:
            G.edges[r1, r2]["kind"].remove("ionic")
            if len(G.edges[r1, r2]["kind"]) == 0:
                G.remove_edge(r1, r2)

Example #5

0

Show file

File: distance.py Project: BatoolMM/graphein

def add_hydrogen_bond_interactions(G: nx.Graph,
                                   rgroup_df: Optional[pd.DataFrame] = None):
    """Add all hydrogen-bond interactions."""
    # For these atoms, find those that are within 3.5A of one another.
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    HBOND_ATOMS = [
        "ND",  # histidine and asparagine
        "NE",  # glutamate, tryptophan, arginine, histidine
        "NH",  # arginine
        "NZ",  # lysine
        "OD1",
        "OD2",
        "OE",
        "OG",
        "OH",
        "SD",  # cysteine
        "SG",  # methionine
        "N",
        "O",
    ]
    hbond_df = filter_dataframe(rgroup_df, "atom_name", HBOND_ATOMS, True)
    distmat = compute_distmat(hbond_df)
    interacting_atoms = get_interacting_atoms(3.5, distmat)
    add_interacting_resis(G, interacting_atoms, hbond_df, ["hbond"])

    # For these atoms, find those that are within 4.0A of one another.
    HBOND_ATOMS_SULPHUR = ["SD", "SG"]
    hbond_df = filter_dataframe(rgroup_df, "atom_name", HBOND_ATOMS_SULPHUR,
                                True)
    distmat = compute_distmat(hbond_df)
    interacting_atoms = get_interacting_atoms(4.0, distmat)
    add_interacting_resis(G, interacting_atoms, hbond_df, ["hbond"])

Example #6

0

Show file

def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
    """
    This function removes insertions from PDB dataframes.

    :param df: Protein Structure dataframe to remove insertions from.
    :type df: pd.DataFrame
    :param keep: Specifies which insertion to keep. Options are ``"first"`` or ``"last"``.
        Default is ``"first"``
    :type keep: str
    :return: Protein structure dataframe with insertions removed
    :rtype: pd.DataFrame
    """
    # Catches unnamed insertions
    duplicates = df.duplicated(
        subset=["chain_id", "residue_number", "atom_name"], keep=keep)
    df = df[~duplicates]

    # Catches explicit insertions
    df = filter_dataframe(df,
                          by_column="insertion",
                          list_of_values=[""],
                          boolean=True)

    # Remove alt_locs
    df = filter_dataframe(df,
                          by_column="alt_loc",
                          list_of_values=["", "A"],
                          boolean=True)

    return df

Example #7

0

Show file

File: distance.py Project: a-r-j/graphein

def add_cation_pi_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """Add cation-pi interactions."""
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    cation_pi_df = filter_dataframe(
        rgroup_df, "residue_name", CATION_PI_RESIS, True
    )
    cation_pi_df = filter_dataframe(
        cation_pi_df, "node_id", list(G.nodes()), True
    )
    distmat = compute_distmat(cation_pi_df)
    interacting_atoms = get_interacting_atoms(6, distmat)
    interacting_atoms = list(zip(interacting_atoms[0], interacting_atoms[1]))

    for (a1, a2) in interacting_atoms:
        resi1 = cation_pi_df.loc[a1, "node_id"]
        resi2 = cation_pi_df.loc[a2, "node_id"]

        condition1 = resi1 in CATION_RESIS and resi2 in PI_RESIS
        condition2 = resi1 in PI_RESIS and resi2 in CATION_RESIS

        if (condition1 or condition2) and resi1 != resi2:
            if G.has_edge(resi1, resi2):
                G.edges[resi1, resi2]["kind"].add("cation_pi")
            else:
                G.add_edge(resi1, resi2, kind={"cation_pi"})

Example #8

0

Show file

File: distance.py Project: a-r-j/graphein

def add_aromatic_sulphur_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """Find all aromatic-sulphur interactions."""
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    RESIDUES = ["MET", "CYS", "PHE", "TYR", "TRP"]
    SULPHUR_RESIS = ["MET", "CYS"]
    AROMATIC_RESIS = ["PHE", "TYR", "TRP"]

    aromatic_sulphur_df = filter_dataframe(
        rgroup_df, "residue_name", RESIDUES, True
    )
    aromatic_sulphur_df = filter_dataframe(
        aromatic_sulphur_df, "node_id", list(G.nodes()), True
    )
    distmat = compute_distmat(aromatic_sulphur_df)
    interacting_atoms = get_interacting_atoms(5.3, distmat)
    interacting_atoms = list(zip(interacting_atoms[0], interacting_atoms[1]))

    for (a1, a2) in interacting_atoms:
        resi1 = aromatic_sulphur_df.loc[a1, "node_id"]
        resi2 = aromatic_sulphur_df.loc[a2, "node_id"]

        condition1 = resi1 in SULPHUR_RESIS and resi2 in AROMATIC_RESIS
        condition2 = resi1 in AROMATIC_RESIS and resi2 in SULPHUR_RESIS

        if (condition1 or condition2) and resi1 != resi2:
            if G.has_edge(resi1, resi2):
                G.edges[resi1, resi2]["kind"].add("aromatic_sulphur")
            else:
                G.add_edge(resi1, resi2, kind={"aromatic_sulphur"})

Example #9

0

Show file

File: graphs.py Project: BatoolMM/graphein

def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
    """Return the atoms that are in R-groups and not the backbone chain.

    :param pdb_df: DataFrame to compute R group dataframe from
    :type pdb_df: pd.DataFrame
    :returns: Dataframe containing R-groups only (backbone atoms removed)
    :rtype: pd.DataFrame
    """
    return filter_dataframe(pdb_df, "atom_name", BACKBONE_ATOMS, False)

Example #10

0

Show file

File: distance.py Project: a-r-j/graphein

def add_aromatic_interactions(
    G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None
):
    """
    Find all aromatic-aromatic interaction.

    Criteria: phenyl ring centroids separated between 4.5A to 7A.
    Phenyl rings are present on ``PHE, TRP, HIS, TYR`` (:const:`~graphein.protein.resi_atoms.AROMATIC_RESIS`).
    Phenyl ring atoms on these amino acids are defined by the following
    atoms:
    - PHE: CG, CD, CE, CZ
    - TRP: CD, CE, CH, CZ
    - HIS: CG, CD, ND, NE, CE
    - TYR: CG, CD, CE, CZ
    Centroids of these atoms are taken by taking:
        (mean x), (mean y), (mean z)
    for each of the ring atoms.
    Notes for future self/developers:
    - Because of the requirement to pre-compute ring centroids, we do not
        use the functions written above (filter_dataframe, compute_distmat,
        get_interacting_atoms), as they do not return centroid atom
        euclidean coordinates.
    """
    if pdb_df is None:
        pdb_df = G.graph["raw_pdb_df"]
    dfs = []
    for resi in AROMATIC_RESIS:
        resi_rings_df = get_ring_atoms(pdb_df, resi)
        resi_rings_df = filter_dataframe(
            resi_rings_df, "node_id", list(G.nodes()), True
        )
        resi_centroid_df = get_ring_centroids(resi_rings_df)
        dfs.append(resi_centroid_df)
    aromatic_df = (
        pd.concat(dfs).sort_values(by="node_id").reset_index(drop=True)
    )
    distmat = compute_distmat(aromatic_df)
    distmat.set_index(aromatic_df["node_id"], inplace=True)
    distmat.columns = aromatic_df["node_id"]
    distmat = distmat[(distmat >= 4.5) & (distmat <= 7)].fillna(0)
    indices = np.where(distmat > 0)

    interacting_resis = [
        (distmat.index[r], distmat.index[c])
        for r, c in zip(indices[0], indices[1])
    ]
    log.info(f"Found: {len(interacting_resis)} aromatic-aromatic interactions")
    for n1, n2 in interacting_resis:
        assert G.nodes[n1]["residue_name"] in AROMATIC_RESIS
        assert G.nodes[n2]["residue_name"] in AROMATIC_RESIS
        if G.has_edge(n1, n2):
            G.edges[n1, n2]["kind"].add("aromatic")
        else:
            G.add_edge(n1, n2, kind={"aromatic"})

Example #11

0

Show file

File: graphs.py Project: BatoolMM/graphein

def remove_insertions(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function removes insertions from PDB dataframes

    :param df: Protein Structure dataframe to remove insertions from
    :type df: pd.DataFrame
    :return: Protein structure dataframe with insertions removed
    :rtype: pd.DataFrame
    """
    """Remove insertions from structure."""
    return filter_dataframe(df,
                            by_column="alt_loc",
                            list_of_values=["", "A"],
                            boolean=True)

Example #12

0

Show file

File: graphs.py Project: BatoolMM/graphein

def subset_structure_to_atom_type(df: pd.DataFrame,
                                  granularity: str) -> pd.DataFrame:
    """
    Return a subset of atomic dataframe that contains only certain atom names.

    :param df: Protein Structure dataframe to subset
    :type df: pd.DataFrame
    :returns: Subsetted protein structure dataframe
    :rtype: pd.DataFrame
    """
    return filter_dataframe(df,
                            by_column="atom_name",
                            list_of_values=[granularity],
                            boolean=True)

Example #13

0

Show file

File: graphs.py Project: BatoolMM/graphein

def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Remove protons from PDB dataframe.

    :param df: Atomic dataframe.
    :type df: pd.DataFrame
    :returns: Atomic dataframe with all atom_name == "H" removed.
    :rtype: pd.DataFrame
    """
    log.debug(
        "Deprotonating protein. This removes H atoms from the pdb_df dataframe"
    )
    return filter_dataframe(df,
                            by_column="atom_name",
                            list_of_values=["H"],
                            boolean=False)

Example #14

0

Show file

File: distance.py Project: a-r-j/graphein

def add_distance_threshold(
    G: nx.Graph, long_interaction_threshold: int, threshold: float = 5.0
):
    """
    Adds edges to any nodes within a given distance of each other. Long interaction threshold is used
    to specify minimum separation in sequence to add an edge between networkx nodes within the distance threshold

    :param G: Protein Structure graph to add distance edges to
    :type G: nx.Graph
    :param long_interaction_threshold: minimum distance in sequence for two nodes to be connected
    :type long_interaction_threshold: int
    :param threshold: Distance in angstroms, below which two nodes are connected
    :type threshold: float
    :return: Graph with distance-based edges added
    """
    pdb_df = filter_dataframe(
        G.graph["pdb_df"], "node_id", list(G.nodes()), True
    )
    dist_mat = compute_distmat(pdb_df)
    interacting_nodes = get_interacting_atoms(threshold, distmat=dist_mat)
    interacting_nodes = zip(interacting_nodes[0], interacting_nodes[1])

    log.info(f"Found: {len(list(interacting_nodes))} distance edges")
    for a1, a2 in interacting_nodes:
        n1 = G.graph["pdb_df"].loc[a1, "node_id"]
        n2 = G.graph["pdb_df"].loc[a2, "node_id"]
        n1_chain = G.graph["pdb_df"].loc[a1, "chain_id"]
        n2_chain = G.graph["pdb_df"].loc[a2, "chain_id"]
        n1_position = G.graph["pdb_df"].loc[a1, "residue_number"]
        n2_position = G.graph["pdb_df"].loc[a2, "residue_number"]

        condition_1 = n1_chain != n2_chain
        condition_2 = (
            abs(n1_position - n2_position) > long_interaction_threshold
        )

        if condition_1 or condition_2:
            if G.has_edge(n1, n2):
                G.edges[n1, n2]["kind"].add("distance_threshold")
            else:
                G.add_edge(n1, n2, kind={"distance_threshold"})

Example #15

0

Show file

def add_beta_carbon_vector(g: nx.Graph,
                           scale: bool = True,
                           reverse: bool = False):
    """Adds vector from node (typically alpha carbon) to position of beta carbon.

    Glycine does not have a beta carbon, so we set it to ``np.array([0, 0, 0])``.
    We extract the position of the beta carbon from the unprocessed atomic PDB dataframe.
    For this we use the ``raw_pdb_df`` dataframe.
    If scale, we scale the vector to the unit vector. If reverse is True,
    we reverse the vector (``C beta - node``). If reverse is false (default) we compute (``node - C beta``).

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """

    c_beta_coords = filter_dataframe(g.graph["raw_pdb_df"],
                                     "atom_name", ["CB"],
                                     boolean=True)
    c_beta_coords.index = c_beta_coords["node_id"]

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            vec = np.array([0, 0, 0])
        else:
            if reverse:
                vec = d["coords"] - np.array(
                    c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]])
            else:
                vec = (np.array(
                    c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]) -
                       d["coords"])

            if scale:
                vec = vec / np.linalg.norm(vec)
        d["c_beta_vector"] = vec

Example #16

0

Show file

File: graphs.py Project: BatoolMM/graphein

def select_chains(protein_df: pd.DataFrame,
                  chain_selection: str,
                  verbose: bool = False) -> pd.DataFrame:
    """
    Extracts relevant chains from protein_df

    :param protein_df: pandas dataframe of PDB subsetted to relevant atoms (CA, CB)
    :type protein_df: pd.DataFrame
    :param chain_selection: Specifies chains that should be extracted from the larger complexed structure
    :type chain_selection: str
    :param verbose: Print dataframe
    :type verbose: bool
    :return Protein structure dataframe containing only entries in the chain selection
    :rtype: pd.DataFrame
    """
    if chain_selection != "all":
        protein_df = filter_dataframe(
            protein_df,
            by_column="chain_id",
            list_of_values=list(chain_selection),
            boolean=True,
        )

    return protein_df

Example #17

0

Show file

File: distance.py Project: BatoolMM/graphein

def add_hydrophobic_interactions(G: nx.Graph,
                                 rgroup_df: Optional[pd.DataFrame] = None):
    """
    Find all hydrophobic interactions.

    Performs searches between the following residues:
    ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO, TYR

    Criteria: R-group residues are within 5A distance.

    :param G:
    :type G: nx.Graph
    :param rgroup_df:
    :type rgroup_df: pd.DataFrame, optional

    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]
    hydrophobics_df = filter_dataframe(rgroup_df, "residue_name",
                                       HYDROPHOBIC_RESIS, True)
    distmat = compute_distmat(hydrophobics_df)
    interacting_atoms = get_interacting_atoms(5, distmat)
    add_interacting_resis(G, interacting_atoms, hydrophobics_df,
                          ["hydrophobic"])

Example #18

0

Show file

File: distance.py Project: a-r-j/graphein

def add_k_nn_edges(
    G: nx.Graph,
    long_interaction_threshold: int,
    k: int = 5,
    mode: str = "connectivity",
    metric: str = "minkowski",
    p: int = 2,
    include_self: Union[bool, str] = False,
):
    """
    Adds edges to nodes based on K nearest neighbours. Long interaction threshold is used
    to specify minimum separation in sequence to add an edge between networkx nodes within the distance threshold

    :param G: Protein Structure graph to add distance edges to
    :type G: nx.Graph
    :param long_interaction_threshold: minimum distance in sequence for two nodes to be connected
    :type long_interaction_threshold: int
    :param k: Number of neighbors for each sample.
    :type k: int
    :param mode: Type of returned matrix: ``"connectivity"`` will return the connectivity matrix with ones and zeros,
        and ``"distance"`` will return the distances between neighbors according to the given metric.
    :type mode: str
    :param metric: The distance metric used to calculate the k-Neighbors for each sample point.
        The DistanceMetric class gives a list of available metrics.
        The default distance is ``"euclidean"`` (``"minkowski"`` metric with the ``p`` param equal to ``2``).
    :type metric: str
    :param p: Power parameter for the Minkowski metric. When ``p = 1``, this is equivalent to using ``manhattan_distance`` (l1),
        and ``euclidean_distance`` (l2) for ``p = 2``. For arbitrary ``p``, ``minkowski_distance`` (l_p) is used. Default is ``2`` (euclidean).
    :type p: int
    :param include_self: Whether or not to mark each sample as the first nearest neighbor to itself.
        If ``"auto"``, then ``True`` is used for ``mode="connectivity"`` and ``False`` for ``mode="distance"``. Default is ``False``.
    :type include_self: Union[bool, str]
    :return: Graph with knn-based edges added
    :rtype: nx.Graph
    """
    pdb_df = filter_dataframe(
        G.graph["pdb_df"], "node_id", list(G.nodes()), True
    )
    dist_mat = compute_distmat(pdb_df)

    nn = kneighbors_graph(
        X=dist_mat,
        n_neighbors=k,
        mode=mode,
        metric=metric,
        p=p,
        include_self=include_self,
    )

    # Create iterable of node indices
    outgoing = np.repeat(np.array(range(len(G.graph["pdb_df"]))), k)
    incoming = nn.indices
    interacting_nodes = list(zip(outgoing, incoming))
    log.info(f"Found: {len(interacting_nodes)} KNN edges")
    for a1, a2 in interacting_nodes:
        # Get nodes IDs from indices
        n1 = G.graph["pdb_df"].loc[a1, "node_id"]
        n2 = G.graph["pdb_df"].loc[a2, "node_id"]

        # Get chains
        n1_chain = G.graph["pdb_df"].loc[a1, "chain_id"]
        n2_chain = G.graph["pdb_df"].loc[a2, "chain_id"]

        # Get sequence position
        n1_position = G.graph["pdb_df"].loc[a1, "residue_number"]
        n2_position = G.graph["pdb_df"].loc[a2, "residue_number"]

        # Check residues are not on same chain
        condition_1 = n1_chain != n2_chain
        # Check residues are separated by long_interaction_threshold
        condition_2 = (
            abs(n1_position - n2_position) > long_interaction_threshold
        )

        # If not on same chain add edge or
        # If on same chain and separation is sufficient add edge
        if condition_1 or condition_2:
            if G.has_edge(n1, n2):
                G.edges[n1, n2]["kind"].add("k_nn")
            else:
                G.add_edge(n1, n2, kind={"k_nn"})