def add_disulfide_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Add ``"disulfide"`` edges between CYS residues.

    Criteria: sulfur atom pairs are within 2.2A of each other. Candidate atoms
    are the rows whose ``residue_name`` is in ``DISULFIDE_RESIS`` and whose
    ``atom_name`` is in ``DISULFIDE_ATOMS``.

    :param G: networkx protein graph; edges are added in place.
    :type G: nx.Graph
    :param rgroup_df: R-group atom dataframe. Defaults to ``None``, in which
        case ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    # A disulfide bridge needs two cysteines; bail out early otherwise.
    n_cys = sum(
        1 for _, data in G.nodes(data=True) if data["residue_name"] == "CYS"
    )
    if n_cys < 2:
        log.debug(
            f"{n_cys} CYS residues found. "
            "Cannot add disulfide interactions with fewer than two CYS residues."
        )
        return

    source_df = G.graph["rgroup_df"] if rgroup_df is None else rgroup_df

    # Narrow first by residue, then by atom name.
    sulfur_df = filter_dataframe(
        filter_dataframe(source_df, "residue_name", DISULFIDE_RESIS, True),
        "atom_name",
        DISULFIDE_ATOMS,
        True,
    )

    close_pairs = get_interacting_atoms(2.2, compute_distmat(sulfur_df))
    add_interacting_resis(G, close_pairs, sulfur_df, ["disulfide"])
def add_hydrophobic_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Add ``"hydrophobic"`` edges to the graph.

    Searches between the residues listed in
    :const:`~graphein.protein.resi_atoms.HYDROPHOBIC_RESIS`
    (``[ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO, TYR]``).

    Criteria: R-group atoms are within 5A distance.

    :param G: nx.Graph to add hydrophobic interactions to (modified in place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    atoms = G.graph["rgroup_df"] if rgroup_df is None else rgroup_df

    # Keep hydrophobic residues only, then only rows whose node is in G.
    candidates = filter_dataframe(
        filter_dataframe(atoms, "residue_name", HYDROPHOBIC_RESIS, True),
        "node_id",
        list(G.nodes()),
        True,
    )

    close_pairs = get_interacting_atoms(5, compute_distmat(candidates))
    add_interacting_resis(G, close_pairs, candidates, ["hydrophobic"])
def get_ring_atoms(dataframe: pd.DataFrame, aa: str) -> pd.DataFrame:
    """
    Return the ring atoms of one aromatic amino acid type.

    Helper for ``add_aromatic_interactions``. Rows are kept when their
    ``residue_name`` equals ``aa`` and their ``atom_name`` is listed in
    ``AA_RING_ATOMS[aa]``.

    :param dataframe: Dataframe containing the atom records.
    :type dataframe: pd.DataFrame
    :param aa: Amino acid of interest as a 3-letter string; must be a key of
        ``AA_RING_ATOMS``.
    :type aa: str
    :return: Filtered dataframe with only the ring atoms of ``aa``.
    :rtype: pd.DataFrame
    """
    residue_rows = filter_dataframe(dataframe, "residue_name", [aa], True)
    return filter_dataframe(
        residue_rows, "atom_name", AA_RING_ATOMS[aa], True
    )
def add_ionic_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Find all ionic interactions and add them as ``"ionic"`` edges.

    Criteria: ``[ARG, LYS, HIS, ASP, and GLU]``
    (:const:`~graphein.protein.resi_atoms.IONIC_RESIS`) residues are within
    6A. We also check for opposing charges
    (:const:`~graphein.protein.resi_atoms.POS_AA`,
    :const:`~graphein.protein.resi_atoms.NEG_AA`); ``"ionic"`` is removed from
    any edge whose two residues are not one positive and one negative, and the
    edge itself is deleted if no other kind remains on it.

    :param G: Protein graph to add ionic interactions to (modified in place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]

    ionic_df = filter_dataframe(rgroup_df, "residue_name", IONIC_RESIS, True)
    # Chain the node filter onto ionic_df (not rgroup_df); otherwise the
    # residue-name restriction above is silently discarded.
    ionic_df = filter_dataframe(ionic_df, "node_id", list(G.nodes()), True)

    distmat = compute_distmat(ionic_df)
    interacting_atoms = get_interacting_atoms(6, distmat)
    add_interacting_resis(G, interacting_atoms, ionic_df, ["ionic"])

    # Check that the interacting residues are of opposite charges.
    # Snapshot the edge list first: edges may be removed inside the loop.
    for r1, r2 in list(get_edges_by_bond_type(G, "ionic")):
        name1 = G.nodes[r1]["residue_name"]
        name2 = G.nodes[r2]["residue_name"]
        is_ionic = (name1 in POS_AA and name2 in NEG_AA) or (
            name2 in POS_AA and name1 in NEG_AA
        )
        if is_ionic:
            continue

        kinds = G.edges[r1, r2]["kind"]
        kinds.remove("ionic")
        # Drop the edge entirely when "ionic" was its only kind.
        if not kinds:
            G.remove_edge(r1, r2)
def add_hydrogen_bond_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Add all hydrogen-bond interactions as ``"hbond"`` edges.

    Two passes are made over the atom dataframe: a general donor/acceptor
    atom list with a 3.5A cutoff, then the sulphur atoms alone with a 4.0A
    cutoff.

    :param G: Protein graph to add hydrogen bonds to (modified in place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]

    # NOTE(review): entries such as "ND", "NE", "NH" and "OE" are prefixes of
    # the usual PDB atom names (ND1, NE2, NH1, OE1, ...). Whether they match
    # anything depends on how filter_dataframe compares values - confirm.
    HBOND_ATOMS = [
        "ND",  # histidine and asparagine
        "NE",  # glutamine, tryptophan, arginine, histidine
        "NH",  # arginine
        "NZ",  # lysine
        "OD1",
        "OD2",
        "OE",
        "OG",
        "OH",
        "SD",  # methionine
        "SG",  # cysteine
        "N",
        "O",
    ]
    HBOND_ATOMS_SULPHUR = ["SD", "SG"]

    # (atom names, distance cutoff in angstroms), processed in this order.
    passes = ((HBOND_ATOMS, 3.5), (HBOND_ATOMS_SULPHUR, 4.0))
    for atom_names, cutoff in passes:
        hbond_df = filter_dataframe(rgroup_df, "atom_name", atom_names, True)
        close_pairs = get_interacting_atoms(cutoff, compute_distmat(hbond_df))
        add_interacting_resis(G, close_pairs, hbond_df, ["hbond"])
def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
    """
    Remove insertions and alternate locations from a PDB dataframe.

    Three filters are applied in order: rows duplicated on
    ``(chain_id, residue_number, atom_name)`` are dropped, then only rows with
    an empty ``insertion`` value are kept, then only rows whose ``alt_loc`` is
    ``""`` or ``"A"`` are kept.

    :param df: Protein Structure dataframe to remove insertions from.
    :type df: pd.DataFrame
    :param keep: Which of the duplicated rows to keep. Options are
        ``"first"`` or ``"last"``. Default is ``"first"``.
    :type keep: str
    :return: Protein structure dataframe with insertions removed.
    :rtype: pd.DataFrame
    """
    # Unnamed insertions show up as repeated (chain, residue number, atom).
    is_repeat = df.duplicated(
        subset=["chain_id", "residue_number", "atom_name"], keep=keep
    )
    deduplicated = df[~is_repeat]

    # Explicit insertions carry a non-empty insertion code.
    without_insertions = filter_dataframe(
        deduplicated, by_column="insertion", list_of_values=[""], boolean=True
    )

    # Finally discard alternate locations other than the blank/"A" ones.
    return filter_dataframe(
        without_insertions,
        by_column="alt_loc",
        list_of_values=["", "A"],
        boolean=True,
    )
def add_cation_pi_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Add cation-pi interactions as ``"cation_pi"`` edges.

    Criteria: an R-group atom of a residue in ``CATION_RESIS`` is within 6A of
    an R-group atom of a residue in ``PI_RESIS``. Only residues in
    ``CATION_PI_RESIS`` that are nodes of ``G`` are considered.

    :param G: Protein graph to add cation-pi interactions to (modified in
        place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]

    cation_pi_df = filter_dataframe(
        rgroup_df, "residue_name", CATION_PI_RESIS, True
    )
    cation_pi_df = filter_dataframe(
        cation_pi_df, "node_id", list(G.nodes()), True
    )

    distmat = compute_distmat(cation_pi_df)
    interacting_atoms = get_interacting_atoms(6, distmat)

    for a1, a2 in zip(interacting_atoms[0], interacting_atoms[1]):
        resi1 = cation_pi_df.loc[a1, "node_id"]
        resi2 = cation_pi_df.loc[a2, "node_id"]
        # Atoms of the same residue never form an inter-residue edge.
        if resi1 == resi2:
            continue

        # Classify by residue name. The node id identifies a graph node and is
        # not a residue name, so testing it against the residue lists can
        # never match.
        name1 = cation_pi_df.loc[a1, "residue_name"]
        name2 = cation_pi_df.loc[a2, "residue_name"]
        cation_then_pi = name1 in CATION_RESIS and name2 in PI_RESIS
        pi_then_cation = name1 in PI_RESIS and name2 in CATION_RESIS
        if not (cation_then_pi or pi_then_cation):
            continue

        if G.has_edge(resi1, resi2):
            G.edges[resi1, resi2]["kind"].add("cation_pi")
        else:
            G.add_edge(resi1, resi2, kind={"cation_pi"})
def add_aromatic_sulphur_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Find all aromatic-sulphur interactions and add them as
    ``"aromatic_sulphur"`` edges.

    Criteria: an R-group atom of a sulphur residue (``MET``, ``CYS``) is
    within 5.3A of an R-group atom of an aromatic residue (``PHE``, ``TYR``,
    ``TRP``). Only residues that are nodes of ``G`` are considered.

    :param G: Protein graph to add the interactions to (modified in place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]

    RESIDUES = ["MET", "CYS", "PHE", "TYR", "TRP"]
    SULPHUR_RESIS = ["MET", "CYS"]
    AROMATIC_RESIS = ["PHE", "TYR", "TRP"]

    aromatic_sulphur_df = filter_dataframe(
        rgroup_df, "residue_name", RESIDUES, True
    )
    aromatic_sulphur_df = filter_dataframe(
        aromatic_sulphur_df, "node_id", list(G.nodes()), True
    )

    distmat = compute_distmat(aromatic_sulphur_df)
    interacting_atoms = get_interacting_atoms(5.3, distmat)

    for a1, a2 in zip(interacting_atoms[0], interacting_atoms[1]):
        resi1 = aromatic_sulphur_df.loc[a1, "node_id"]
        resi2 = aromatic_sulphur_df.loc[a2, "node_id"]
        # Atoms of the same residue never form an inter-residue edge.
        if resi1 == resi2:
            continue

        # Classify by residue name. The node id identifies a graph node and is
        # not a residue name, so testing it against the residue lists can
        # never match.
        name1 = aromatic_sulphur_df.loc[a1, "residue_name"]
        name2 = aromatic_sulphur_df.loc[a2, "residue_name"]
        sulphur_then_aromatic = (
            name1 in SULPHUR_RESIS and name2 in AROMATIC_RESIS
        )
        aromatic_then_sulphur = (
            name1 in AROMATIC_RESIS and name2 in SULPHUR_RESIS
        )
        if not (sulphur_then_aromatic or aromatic_then_sulphur):
            continue

        if G.has_edge(resi1, resi2):
            G.edges[resi1, resi2]["kind"].add("aromatic_sulphur")
        else:
            G.add_edge(resi1, resi2, kind={"aromatic_sulphur"})
def compute_rgroup_dataframe(pdb_df: pd.DataFrame) -> pd.DataFrame:
    """Return the atoms that belong to R-groups rather than the backbone.

    Rows whose ``atom_name`` appears in ``BACKBONE_ATOMS`` are excluded.

    :param pdb_df: DataFrame to compute R group dataframe from.
    :type pdb_df: pd.DataFrame
    :returns: Dataframe containing R-groups only (backbone atoms removed).
    :rtype: pd.DataFrame
    """
    # boolean=False inverts the filter: keep everything NOT in the list.
    rgroup_df = filter_dataframe(
        pdb_df,
        by_column="atom_name",
        list_of_values=BACKBONE_ATOMS,
        boolean=False,
    )
    return rgroup_df
def add_aromatic_interactions(
    G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None
):
    """
    Find all aromatic-aromatic interactions and add them as ``"aromatic"``
    edges.

    Criteria: ring centroids separated by between 4.5A and 7A (inclusive).
    Rings are present on ``PHE, TRP, HIS, TYR``
    (:const:`~graphein.protein.resi_atoms.AROMATIC_RESIS`).

    Ring atoms on these amino acids are defined by the following atoms:

    - PHE: CG, CD, CE, CZ
    - TRP: CD, CE, CH, CZ
    - HIS: CG, CD, ND, NE, CE
    - TYR: CG, CD, CE, CZ

    For each aromatic residue type the ring atoms are selected with
    ``get_ring_atoms``, restricted to nodes present in ``G``, and reduced to
    one centroid per residue with ``get_ring_centroids``. Distances are then
    computed between centroids rather than between individual atoms.

    :param G: Protein graph to add aromatic interactions to (modified in
        place).
    :type G: nx.Graph
    :param pdb_df: Optional atomic dataframe. When ``None``,
        ``G.graph["raw_pdb_df"]`` is used.
    :type pdb_df: pd.DataFrame, optional
    """
    if pdb_df is None:
        pdb_df = G.graph["raw_pdb_df"]

    graph_nodes = list(G.nodes())
    centroid_frames = [
        get_ring_centroids(
            filter_dataframe(
                get_ring_atoms(pdb_df, resi), "node_id", graph_nodes, True
            )
        )
        for resi in AROMATIC_RESIS
    ]
    aromatic_df = pd.concat(centroid_frames)
    aromatic_df = aromatic_df.sort_values(by="node_id")
    aromatic_df = aromatic_df.reset_index(drop=True)

    # Label the distance matrix rows/columns with node ids for the lookup.
    distmat = compute_distmat(aromatic_df)
    distmat.set_index(aromatic_df["node_id"], inplace=True)
    distmat.columns = aromatic_df["node_id"]

    # Centroid pairs whose separation lies in the closed interval [4.5, 7].
    in_range = (distmat >= 4.5) & (distmat <= 7)
    row_idx, col_idx = np.where(in_range)
    interacting_resis = [
        (distmat.index[r], distmat.index[c]) for r, c in zip(row_idx, col_idx)
    ]
    log.info(f"Found: {len(interacting_resis)} aromatic-aromatic interactions")

    for n1, n2 in interacting_resis:
        # Sanity check: both endpoints must be aromatic residues.
        assert G.nodes[n1]["residue_name"] in AROMATIC_RESIS
        assert G.nodes[n2]["residue_name"] in AROMATIC_RESIS
        if not G.has_edge(n1, n2):
            G.add_edge(n1, n2, kind={"aromatic"})
        else:
            G.edges[n1, n2]["kind"].add("aromatic")
def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
    """
    Remove insertions and alternate locations from a PDB dataframe.

    Three filters are applied in order: rows duplicated on
    ``(chain_id, residue_number, atom_name)`` are dropped, then only rows with
    an empty ``insertion`` value are kept, then only rows whose ``alt_loc`` is
    ``""`` or ``"A"`` are kept.

    :param df: Protein Structure dataframe to remove insertions from.
    :type df: pd.DataFrame
    :param keep: Which of the duplicated rows to keep. Options are
        ``"first"`` or ``"last"``. Default is ``"first"``.
    :type keep: str
    :return: Protein structure dataframe with insertions removed.
    :rtype: pd.DataFrame
    """
    # Unnamed insertions: the same (chain, residue number, atom) repeated.
    duplicates = df.duplicated(
        subset=["chain_id", "residue_number", "atom_name"], keep=keep
    )
    df = df[~duplicates]

    # Explicit insertions: rows carrying a non-empty insertion code.
    df = filter_dataframe(
        df, by_column="insertion", list_of_values=[""], boolean=True
    )

    # Alternate locations: keep only the blank and "A" conformers.
    return filter_dataframe(
        df, by_column="alt_loc", list_of_values=["", "A"], boolean=True
    )
def subset_structure_to_atom_type(
    df: pd.DataFrame, granularity: str
) -> pd.DataFrame:
    """
    Return the subset of an atomic dataframe with a single atom name.

    :param df: Protein Structure dataframe to subset.
    :type df: pd.DataFrame
    :param granularity: The ``atom_name`` value to keep; it is passed to the
        filter as a one-element list.
    :type granularity: str
    :returns: Subsetted protein structure dataframe.
    :rtype: pd.DataFrame
    """
    wanted_atoms = [granularity]
    return filter_dataframe(
        df, by_column="atom_name", list_of_values=wanted_atoms, boolean=True
    )
def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Remove protons from a PDB dataframe.

    Only the value ``"H"`` is passed to the ``atom_name`` filter.

    NOTE(review): whether other hydrogen atom names are also removed depends
    on how ``filter_dataframe`` matches values - confirm.

    :param df: Atomic dataframe.
    :type df: pd.DataFrame
    :returns: Atomic dataframe with all ``atom_name == "H"`` removed.
    :rtype: pd.DataFrame
    """
    log.debug(
        "Deprotonating protein. This removes H atoms from the pdb_df dataframe"
    )
    proton_names = ["H"]
    # boolean=False inverts the filter: keep everything NOT in the list.
    deprotonated = filter_dataframe(
        df, by_column="atom_name", list_of_values=proton_names, boolean=False
    )
    return deprotonated
def add_distance_threshold(
    G: nx.Graph, long_interaction_threshold: int, threshold: float = 5.0
):
    """
    Adds ``"distance_threshold"`` edges between nodes within a given distance
    of each other.

    Long interaction threshold is used to specify minimum separation in
    sequence to add an edge between networkx nodes within the distance
    threshold. Nodes on different chains are connected regardless of their
    residue numbers.

    The graph is modified in place; nothing is returned.

    :param G: Protein Structure graph to add distance edges to.
    :type G: nx.Graph
    :param long_interaction_threshold: Minimum distance in sequence for two
        nodes on the same chain to be connected.
    :type long_interaction_threshold: int
    :param threshold: Distance in angstroms passed to
        ``get_interacting_atoms`` as the cutoff.
    :type threshold: float
    """
    pdb_df = filter_dataframe(
        G.graph["pdb_df"], "node_id", list(G.nodes()), True
    )
    dist_mat = compute_distmat(pdb_df)
    interacting_nodes = get_interacting_atoms(threshold, distmat=dist_mat)

    # zip() is a one-shot iterator: materialise it so that counting the pairs
    # for the log message does not consume them before the loop below runs.
    interacting_nodes = list(zip(interacting_nodes[0], interacting_nodes[1]))
    log.info(f"Found: {len(interacting_nodes)} distance edges")

    # NOTE(review): the pair indices come from a distance matrix built on the
    # filtered frame but are looked up in the unfiltered G.graph["pdb_df"].
    # The two agree only when the filter removes no rows - confirm.
    full_df = G.graph["pdb_df"]
    for a1, a2 in interacting_nodes:
        n1 = full_df.loc[a1, "node_id"]
        n2 = full_df.loc[a2, "node_id"]
        n1_chain = full_df.loc[a1, "chain_id"]
        n2_chain = full_df.loc[a2, "chain_id"]
        n1_position = full_df.loc[a1, "residue_number"]
        n2_position = full_df.loc[a2, "residue_number"]

        # Connect across chains, or within a chain when far enough apart.
        different_chain = n1_chain != n2_chain
        far_in_sequence = (
            abs(n1_position - n2_position) > long_interaction_threshold
        )
        if different_chain or far_in_sequence:
            if G.has_edge(n1, n2):
                G.edges[n1, n2]["kind"].add("distance_threshold")
            else:
                G.add_edge(n1, n2, kind={"distance_threshold"})
def add_beta_carbon_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False
):
    """Adds vector from node (typically alpha carbon) to position of beta carbon.

    Glycine does not have a beta carbon, so we set it to
    ``np.array([0, 0, 0])``. We extract the position of the beta carbon from
    the unprocessed atomic PDB dataframe. For this we use the ``raw_pdb_df``
    dataframe. If scale, we scale the vector to the unit vector. If reverse is
    False (default) we compute (``C beta - node``). If reverse is True, we
    reverse the vector (``node - C beta``).

    Zero-length vectors (glycine, or a node that coincides with its beta
    carbon) are left as they are instead of being divided by a zero norm.

    The result is stored on each node under ``"c_beta_vector"``; nothing is
    returned.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """
    c_beta_coords = filter_dataframe(
        g.graph["raw_pdb_df"], "atom_name", ["CB"], boolean=True
    )
    # Index by node id so each node's CB row can be looked up directly.
    c_beta_coords.index = c_beta_coords["node_id"]

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            vec = np.array([0, 0, 0])
        else:
            c_beta = np.array(
                c_beta_coords.loc[n][["x_coord", "y_coord", "z_coord"]]
            )
            vec = d["coords"] - c_beta if reverse else c_beta - d["coords"]

        if scale:
            norm = np.linalg.norm(vec)
            # Dividing a zero vector by its norm would yield NaNs.
            if norm > 0:
                vec = vec / norm

        d["c_beta_vector"] = vec
def select_chains(
    protein_df: pd.DataFrame, chain_selection: str, verbose: bool = False
) -> pd.DataFrame:
    """
    Extract the selected chains from ``protein_df``.

    :param protein_df: pandas dataframe of PDB subsetted to relevant atoms
        (CA, CB).
    :type protein_df: pd.DataFrame
    :param chain_selection: Chains to extract from the larger complexed
        structure. ``"all"`` returns the dataframe unchanged; otherwise each
        character of the string is used as one ``chain_id`` value.
    :type chain_selection: str
    :param verbose: Currently unused by this function.
    :type verbose: bool
    :return: Protein structure dataframe containing only entries in the chain
        selection.
    :rtype: pd.DataFrame
    """
    # Nothing to do when every chain is requested.
    if chain_selection == "all":
        return protein_df

    wanted_chains = list(chain_selection)
    return filter_dataframe(
        protein_df,
        by_column="chain_id",
        list_of_values=wanted_chains,
        boolean=True,
    )
def add_hydrophobic_interactions(
    G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None
):
    """
    Find all hydrophobic interactions and add them as ``"hydrophobic"`` edges.

    Performs searches between the residues listed in ``HYDROPHOBIC_RESIS``:
    ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO, TYR

    Criteria: R-group atoms are within 5A distance.

    :param G: Protein graph to add hydrophobic interactions to (modified in
        place).
    :type G: nx.Graph
    :param rgroup_df: Optional dataframe of R-group atoms. When ``None``,
        ``G.graph["rgroup_df"]`` is used.
    :type rgroup_df: pd.DataFrame, optional
    """
    if rgroup_df is None:
        rgroup_df = G.graph["rgroup_df"]

    hydrophobics_df = filter_dataframe(
        rgroup_df, "residue_name", HYDROPHOBIC_RESIS, True
    )
    # Keep only atoms whose residue is a node of G, so that no interaction is
    # reported for a residue that is absent from the graph.
    hydrophobics_df = filter_dataframe(
        hydrophobics_df, "node_id", list(G.nodes()), True
    )

    distmat = compute_distmat(hydrophobics_df)
    interacting_atoms = get_interacting_atoms(5, distmat)
    add_interacting_resis(
        G, interacting_atoms, hydrophobics_df, ["hydrophobic"]
    )
def add_k_nn_edges(
    G: nx.Graph,
    long_interaction_threshold: int,
    k: int = 5,
    mode: str = "connectivity",
    metric: str = "minkowski",
    p: int = 2,
    include_self: Union[bool, str] = False,
):
    """
    Adds ``"k_nn"`` edges to nodes based on K nearest neighbours.

    Long interaction threshold is used to specify minimum separation in
    sequence to add an edge between networkx nodes. Nodes on different chains
    are connected regardless of their residue numbers.

    The graph is modified in place; nothing is returned.

    :param G: Protein Structure graph to add distance edges to.
    :type G: nx.Graph
    :param long_interaction_threshold: Minimum distance in sequence for two
        nodes on the same chain to be connected.
    :type long_interaction_threshold: int
    :param k: Number of neighbors for each sample.
    :type k: int
    :param mode: Type of returned matrix: ``"connectivity"`` will return the
        connectivity matrix with ones and zeros, and ``"distance"`` will
        return the distances between neighbors according to the given metric.
    :type mode: str
    :param metric: The distance metric used to calculate the k-Neighbors for
        each sample point. The DistanceMetric class gives a list of available
        metrics. The default distance is ``"euclidean"`` (``"minkowski"``
        metric with the ``p`` param equal to ``2``).
    :type metric: str
    :param p: Power parameter for the Minkowski metric. When ``p = 1``, this
        is equivalent to using ``manhattan_distance`` (l1), and
        ``euclidean_distance`` (l2) for ``p = 2``. For arbitrary ``p``,
        ``minkowski_distance`` (l_p) is used. Default is ``2`` (euclidean).
    :type p: int
    :param include_self: Whether or not to mark each sample as the first
        nearest neighbor to itself. If ``"auto"``, then ``True`` is used for
        ``mode="connectivity"`` and ``False`` for ``mode="distance"``.
        Default is ``False``.
    :type include_self: Union[bool, str]
    """
    pdb_df = filter_dataframe(
        G.graph["pdb_df"], "node_id", list(G.nodes()), True
    )
    dist_mat = compute_distmat(pdb_df)

    # NOTE(review): the pairwise distance matrix is passed as the sample
    # matrix X, so neighbours are ranked by similarity of distance profiles
    # under ``metric`` - confirm this is intended.
    nn = kneighbors_graph(
        X=dist_mat,
        n_neighbors=k,
        mode=mode,
        metric=metric,
        p=p,
        include_self=include_self,
    )

    # Row i of the sparse graph stores the k neighbours of sample i, so the
    # source index of every stored entry is each row number repeated k times.
    # Size it from the graph itself so it always matches ``nn.indices``.
    outgoing = np.repeat(np.arange(nn.shape[0]), k)
    incoming = nn.indices
    interacting_nodes = list(zip(outgoing, incoming))
    log.info(f"Found: {len(interacting_nodes)} KNN edges")

    # The indices are row positions in the frame the distance matrix was built
    # from, so resolve them positionally against that filtered frame
    # (assumes compute_distmat preserves row order).
    node_ids = pdb_df["node_id"].to_numpy()
    chain_ids = pdb_df["chain_id"].to_numpy()
    positions = pdb_df["residue_number"].to_numpy()

    for a1, a2 in interacting_nodes:
        n1, n2 = node_ids[a1], node_ids[a2]

        # Check residues are not on same chain
        different_chain = chain_ids[a1] != chain_ids[a2]
        # Check residues are separated by long_interaction_threshold
        far_in_sequence = (
            abs(positions[a1] - positions[a2]) > long_interaction_threshold
        )

        # If not on same chain add edge or
        # If on same chain and separation is sufficient add edge
        if different_chain or far_in_sequence:
            if G.has_edge(n1, n2):
                G.edges[n1, n2]["kind"].add("k_nn")
            else:
                G.add_edge(n1, n2, kind={"k_nn"})