def test_extract_subgraph_from_bond_type():
    """Check subgraph extraction by bond type, both direct and inverted."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    # NOTE(review): this config is built but never passed to construct_graph
    # (the keyword is commented out on the call below) - confirm intent.
    config = ProteinGraphConfig(
        edge_construction_functions=[add_peptide_bonds, add_ionic_interactions]
    )
    G = construct_graph(pdb_path=str(pdb_file))  # , config=config)
    BOND_TYPES = ["ionic"]

    # Direct extraction: matching edges and their endpoints must be kept.
    kept = extract_subgraph_by_bond_type(G, BOND_TYPES, filter_dataframe=True)
    for u, v, d in G.edges(data=True):
        # NOTE(review): the loops over the subgraph iterate d["kind"] as a
        # collection of bond names; if it is one here too, this membership
        # test against a list of strings never matches - TODO confirm.
        if d["kind"] not in BOND_TYPES:
            continue
        assert u in kept.nodes()
        assert v in kept.nodes()
        assert (u, v) in kept.edges()

    # Every bond recorded on a retained edge must be a requested type.
    for _, _, d in kept.edges(data=True):
        assert all(bond in BOND_TYPES for bond in list(d["kind"]))

    # Inverted extraction: matching edges must be dropped.
    dropped = extract_subgraph_by_bond_type(
        G, BOND_TYPES, filter_dataframe=True, inverse=True
    )
    for u, v, d in G.edges(data=True):
        if d["kind"] not in BOND_TYPES:
            continue
        assert (u, v) not in dropped.edges()

    # No requested bond type may survive on the remaining edges.
    for _, _, d in dropped.edges(data=True):
        assert all(bond not in BOND_TYPES for bond in list(d["kind"]))
def test_sequence_features():
    # Sequence featurisers on a residue graph: BioVec embedding and
    # molecular weight are active; the ESM embeddings are disabled.
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    config = ProteinGraphConfig(
        graph_metadata_functions=[
            # esm_sequence_embedding,
            # esm_residue_embedding,
            biovec_sequence_embedding,
            molecular_weight,
        ]
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    # Node-level check for sequence-based features (disabled).
    # Todo this can probably be improved: it only checked the existence and
    # shape of the esm_embedding on each node.
    # for n, d in G.nodes(data=True):
    #     assert "esm_embedding" in d
    #     assert len(d["esm_embedding"]) == 1280

    # Graph-level check: one entry per chain for each sequence feature.
    graph_attrs = G.graph
    for chain in graph_attrs["chain_ids"]:
        assert f"sequence_{chain}" in graph_attrs
        # assert f"esm_embedding_{chain}" in G.graph
        assert f"biovec_embedding_{chain}" in graph_attrs
        assert f"molecular_weight_{chain}" in graph_attrs
def test_node_features():
    # Todo this test requires attention: it builds the graph but makes no
    # assertions about the resulting node features.
    # Intended coverage for a residue graph: amino acid features, ESM
    # embedding, DSSP features, aaindex features.
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    config = ProteinGraphConfig(
        node_metadata_functions=[
            # Todo we need to refactor the node data assignment flow
            expasy_protein_scale,
            meiler_embedding,
            # Disabled featurisers:
            # rsa,
            # asa,
            # phi,
            # psi,
            # secondary_structure,
            # partial(aaindex1, accession="FAUJ880111"),
        ]
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    # Existence checks are all disabled for now.
    # Todo these functions return pd.Series, rather than adding to the node.
    for _node, _attrs in G.nodes(data=True):
        # assert "meiler_embedding" in d
        # assert expasy_protein_scale in d
        # assert "rsa" in d
        # assert "asa" in d
        # assert "phi" in d
        # assert "psi" in d
        # assert "secondary_structure" in d
        continue
def test_insertion_handling():
    """Node count must match both the chain sequences and the coordinates."""
    config = ProteinGraphConfig(
        granularity="CA",
        keep_hets=False,
        insertions=False,
        verbose=False,
        node_metadata_functions=[meiler_embedding, expasy_protein_scale],
        edge_construction_functions=[
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_ionic_interactions,
            add_aromatic_sulphur_interactions,
            add_hydrophobic_interactions,
            add_cation_pi_interactions,
        ],
    )

    # This is a nasty PDB with a lot of insertions and altlocs
    g = construct_graph(config=config, pdb_code="6OGE")

    sequence_keys = (
        "sequence_A",
        "sequence_B",
        "sequence_C",
        "sequence_D",
        "sequence_E",
    )
    total_residues = sum(len(g.graph[key]) for key in sequence_keys)
    assert total_residues == len(g)
    assert g.graph["coords"].shape[0] == len(g)
def test_distance_edges():
    """Example-based test that distance-based edge construction works.

    Uses the 4hhb PDB file as the example case and only checks that a graph
    object is returned.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    knn_edges = partial(add_k_nn_edges, k=5, long_interaction_threshold=10)
    threshold_edges = partial(
        add_distance_threshold,
        threshold=12,
        long_interaction_threshold=10,
    )
    config = ProteinGraphConfig(
        edge_construction_functions=[
            knn_edges,
            add_hydrophobic_interactions,
            # Todo removed for now as ring centroids require precomputing
            # NOTE(review): the entry below is still active despite the Todo
            # above - confirm which is intended.
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_delaunay_triangulation,
            add_cation_pi_interactions,
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            threshold_edges,
        ]
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)
    assert G is not None
def test_amino_acid_one_hot_example():
    """Example-based test on 4hhb for `amino_acid_onehot`."""
    # Default return type (np array): exactly one position is set per node.
    array_config = ProteinGraphConfig(
        node_metadata_functions=[amino_acid_one_hot]
    )
    g = construct_graph(pdb_code="4hhb", config=array_config)
    for _, attrs in g.nodes(data=True):
        assert sum(attrs["amino_acid_one_hot"]) == 1

    # pd.Series return type: still one-hot, and the label of the set position
    # is the one-letter code of the node's residue.
    one_hot_as_series = partial(amino_acid_one_hot, return_array=False)
    series_config = ProteinGraphConfig(
        node_metadata_functions=[one_hot_as_series]
    )
    g = construct_graph(pdb_code="4hhb", config=series_config)
    for _, attrs in g.nodes(data=True):
        encoding = attrs["amino_acid_one_hot"]
        assert sum(encoding) == 1
        assert encoding.idxmax() == RESI_THREE_TO_1[attrs["residue_name"]]
def test_add_sidechain_vector():
    """Check the ``sidechain_vector`` node attribute on the 1lds graph.

    Scaled case: every node carries a 3-vector that is all zeros for glycine
    and has unit norm otherwise. Unscaled case: ``coords + sidechain_vector``
    must equal the mean position of the node's rows in ``rgroup_df``.
    """
    config = ProteinGraphConfig(
        edge_construction_functions=[partial(add_sidechain_vector, scale=True)],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    for _, d in g.nodes(data=True):
        # Check that the node has the correct attributes
        assert "sidechain_vector" in d.keys()
        # Check the vector is of the correct dimensionality
        assert d["sidechain_vector"].shape == (3,)

        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(
                d["sidechain_vector"], np.array([0.0, 0.0, 0.0])
            )
        else:
            # Check scaled vector has norm close 1
            np.testing.assert_almost_equal(
                np.linalg.norm(d["sidechain_vector"]), 1.0
            )

    # Test unscaled vector
    config = ProteinGraphConfig(
        edge_construction_functions=[partial(add_sidechain_vector, scale=False)],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    # Compute the per-node mean position once instead of once per node.
    # Only the coordinate columns are averaged: a bare ``.mean()`` over every
    # column raises on pandas >= 2.0 when the frame holds text columns
    # (assumes rgroup_df carries such columns - TODO confirm).
    sidechain_centroids = (
        g.graph["rgroup_df"]
        .groupby("node_id")[["x_coord", "y_coord", "z_coord"]]
        .mean()
    )

    for n, d in g.nodes(data=True):
        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(
                d["sidechain_vector"], np.array([0.0, 0.0, 0.0])
            )
        else:
            # Check the vector is pointing in the correct direction
            sc_true = np.array(sidechain_centroids.loc[n])
            np.testing.assert_almost_equal(
                sc_true, d["coords"] + d["sidechain_vector"]
            )
def test_add_beta_carbon_vector():
    """Check the ``c_beta_vector`` node attribute on the 1lds graph.

    Scaled case: every node carries a 3-vector that is all zeros for glycine
    and has unit norm otherwise. Unscaled case: ``coords + c_beta_vector``
    must equal the coordinates of the node's CB atom in the raw PDB frame.
    """
    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=True)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    # Taken from the first graph and reused for the unscaled check below;
    # both graphs are built from the same PDB code.
    raw_pdb = g.graph["raw_pdb_df"]
    # Loop-invariant: keep only the CB rows once.
    cb_atoms = raw_pdb[raw_pdb["atom_name"] == "CB"]

    for _, d in g.nodes(data=True):
        # Check that the node has the correct attributes
        assert "c_beta_vector" in d.keys()
        # Check the vector is of the correct dimensionality
        assert d["c_beta_vector"].shape == (3,)

        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(
                d["c_beta_vector"], np.array([0.0, 0.0, 0.0])
            )
        else:
            # Check scaled vector has norm close 1
            np.testing.assert_almost_equal(
                np.linalg.norm(d["c_beta_vector"]), 1.0
            )

    # Test unscaled vector
    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=False)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    for n, d in g.nodes(data=True):
        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(
                d["c_beta_vector"], np.array([0.0, 0.0, 0.0])
            )
        else:
            # Check the vector is pointing in the correct direction.
            # A single .loc selection with a mask built on the frame being
            # indexed avoids chained df[mask1][mask2] indexing, where the
            # second mask has to be reindexed onto the filtered frame.
            cb_true = np.array(
                cb_atoms.loc[
                    cb_atoms["node_id"] == n,
                    ["x_coord", "y_coord", "z_coord"],
                ]
            ).T.squeeze()
            np.testing.assert_almost_equal(
                cb_true, d["coords"] + d["c_beta_vector"]
            )
def test_surface_subgraph():
    """Tests surface subgraph extraction.

    Builds the 4hhb graph with the ``rsa`` graph metadata function, extracts
    the subgraph at an RSA threshold of 0.2, and checks that the subgraph
    holds exactly the nodes whose ``rsa`` is at or above the threshold.
    """
    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[rsa], dssp_config=DSSPConfig()
    )
    G = construct_graph(pdb_path=str(file_path), config=config)

    RSA_THRESHOLD: float = 0.2
    s_g = extract_surface_subgraph(G, RSA_THRESHOLD, filter_dataframe=True)

    # Every retained node must meet the threshold.
    for n, d in s_g.nodes(data=True):
        assert d["rsa"] >= RSA_THRESHOLD

    # Every node meeting the threshold must have been retained.
    for n, d in G.nodes(data=True):
        if d["rsa"] >= RSA_THRESHOLD:
            # Use a real message: ``assert cond, print(...)`` evaluates to a
            # None message and only prints as a side effect.
            assert (
                n in s_g.nodes()
            ), f"node {n} missing from surface subgraph: {d}"
def test_secondary_structure_subgraph():
    """Tests secondary structure subgraph extraction on 4hhb."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[secondary_structure],
        dssp_config=DSSPConfig(),
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    SS_ELEMENTS: List[str] = ["H"]
    subgraph = extract_subgraph_from_secondary_structure(
        G, SS_ELEMENTS, filter_dataframe=True
    )

    # Every retained node has one of the requested "ss" values.
    for _, attrs in subgraph.nodes(data=True):
        assert attrs["ss"] in SS_ELEMENTS

    # Every node of the full graph with a requested "ss" value is retained.
    for node, attrs in G.nodes(data=True):
        if attrs["ss"] not in SS_ELEMENTS:
            continue
        assert node in subgraph.nodes()
def test_edges_do_not_add_nodes_for_chain_subset():
    """Single-chain graphs of 2vvi must have the expected node counts."""
    config = ProteinGraphConfig(
        edge_construction_functions=[
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_cation_pi_interactions,
        ],
    )

    # Expected node count per selected chain, checked in this order.
    expected_sizes = (("A", 217), ("B", 219), ("C", 222), ("D", 219))
    for chain, n_nodes in expected_sizes:
        g = construct_graph(
            config=config, pdb_code="2vvi", chain_selection=chain
        )
        assert len(g) == n_nodes
def test_successful_pickle():
    """Tests subgraphs can be successfully pickled and unpickled.

    Extracts the ALA/SER/MET residue subgraph of 4hhb, pickles it to a file,
    loads it back, and checks the loaded graph is isomorphic to the original.
    """
    # Local import keeps this block self-contained.
    import tempfile

    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[secondary_structure],
        dssp_config=DSSPConfig(),
    )
    G = construct_graph(pdb_path=str(file_path), config=config)

    s_g = extract_subgraph_from_residue_types(
        G,
        residue_types=["ALA", "SER", "MET"],
        update_coords=True,
        filter_dataframe=True,
        recompute_distmat=True,
    )

    # Use a throwaway directory rather than a fixed /tmp path: it works on
    # platforms without /tmp, cannot collide between runs, and is cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pickle_path = Path(tmp_dir) / "test_graph.p"
        with open(pickle_path, "wb") as f:
            pickle.dump(s_g, f)

        with open(pickle_path, "rb") as f:
            loaded_graph = pickle.load(f)

    assert nx.is_isomorphic(s_g, loaded_graph)
def construct_graph(
    config: Optional[ProteinGraphConfig] = None,
    pdb_path: Optional[str] = None,
    pdb_code: Optional[str] = None,
    chain_selection: str = "all",
    df_processing_funcs: Optional[List[Callable]] = None,
    edge_construction_funcs: Optional[List[Callable]] = None,
    edge_annotation_funcs: Optional[List[Callable]] = None,
    node_annotation_funcs: Optional[List[Callable]] = None,
    graph_annotation_funcs: Optional[List[Callable]] = None,
) -> nx.Graph:
    """
    Constructs protein structure graph from a pdb_code or pdb_path.

    Users can provide a ProteinGraphConfig object. However, config parameters
    can be overridden by passing arguments directly to the function: any
    function list that is passed (i.e. is not ``None``) replaces the
    corresponding field on the config. The overrides are written onto the
    config object itself, so a config passed in by the caller is modified.

    :param config: ProteinGraphConfig object. If None, defaults to config in
        graphein.protein.config
    :type config: graphein.protein.config.ProteinGraphConfig, optional
    :param pdb_path: Path to pdb_file to build graph from
    :type pdb_path: str, optional
    :param pdb_code: 4-character PDB accession pdb_code to build graph from.
        If omitted while ``pdb_path`` is given, it is derived from the path.
    :type pdb_code: str, optional
    :param chain_selection: String of polypeptide chains to include in graph.
        E.g "ABDF" or "all"
    :type chain_selection: str, optional
    :param df_processing_funcs: List of dataframe processing functions
    :type df_processing_funcs: List[Callable], optional
    :param edge_construction_funcs: List of edge construction functions
    :type edge_construction_funcs: List[Callable], optional
    :param edge_annotation_funcs: List of edge annotation functions
    :type edge_annotation_funcs: List[Callable], optional
    :param node_annotation_funcs: List of node annotation functions
    :type node_annotation_funcs: List[Callable], optional
    :param graph_annotation_funcs: List of graph annotation functions
    :type graph_annotation_funcs: List[Callable], optional
    :return: Protein Structure Graph
    :rtype: nx.Graph
    """
    # If no config is provided, use default
    if config is None:
        config = ProteinGraphConfig()

    # Get name from pdb_file if no pdb_code is provided
    if pdb_path and (pdb_code is None):
        pdb_code = get_protein_name_from_filename(pdb_path)

    # If function lists are passed directly, they overwrite the config
    # fields. Previously an argument was only used when the config field was
    # None, so it was silently ignored whenever the config already held a
    # value for that field.
    overrides = {
        "protein_df_processing_functions": df_processing_funcs,
        "edge_construction_functions": edge_construction_funcs,
        "node_metadata_functions": node_annotation_funcs,
        "graph_metadata_functions": graph_annotation_funcs,
        "edge_metadata_functions": edge_annotation_funcs,
    }
    for field_name, funcs in overrides.items():
        if funcs is not None:
            setattr(config, field_name, funcs)

    raw_df = read_pdb_to_dataframe(
        pdb_path,
        pdb_code,
        verbose=config.verbose,
        granularity=config.granularity,
    )
    protein_df = process_dataframe(
        raw_df,
        chain_selection=chain_selection,
        granularity=config.granularity,
    )

    # Initialise graph with metadata
    g = initialise_graph_with_metadata(
        protein_df=protein_df,
        raw_pdb_df=raw_df.df["ATOM"],
        pdb_id=pdb_code,
        granularity=config.granularity,
    )
    # Add nodes to graph
    g = add_nodes_to_graph(g)

    # Add config to graph
    g.graph["config"] = config

    # Annotate additional node metadata
    if config.node_metadata_functions is not None:
        g = annotate_node_metadata(g, config.node_metadata_functions)

    # Compute graph edges
    g = compute_edges(
        g,
        funcs=config.edge_construction_functions,
        get_contacts_config=None,
    )

    # Annotate additional graph metadata
    if config.graph_metadata_functions is not None:
        g = annotate_graph_metadata(g, config.graph_metadata_functions)

    # Annotate additional edge metadata
    if config.edge_metadata_functions is not None:
        g = annotate_edge_metadata(g, config.edge_metadata_functions)

    return g
if __name__ == "__main__": from functools import partial from graphein.protein.edges.distance import add_k_nn_edges from graphein.protein.features.sequence.sequence import molecular_weight configs = { "granularity": "CA", "keep_hets": False, "insertions": False, "verbose": False, "get_contacts_config": GetContactsConfig(), "dssp_config": DSSPConfig(), "graph_metadata_functions": [molecular_weight], } config = ProteinGraphConfig(**configs) config.edge_construction_functions = [ partial(add_k_nn_edges, k=3, long_interaction_threshold=0) ] # Test High-level API g = construct_graph( config=config, pdb_path="../examples/pdbs/3eiy.pdb", ) """ # Test Low-level API raw_df = read_pdb_to_dataframe( pdb_path="../../examples/pdbs/3eiy.pdb", verbose=config.verbose, )
def parse_protein_graph_config(config_dict):
    """Build a ProteinGraphConfig from ``config_dict``, print it, return it.

    :param config_dict: Mapping unpacked as keyword arguments to
        ProteinGraphConfig.
    :return: The constructed config object.
    """
    parsed_config = ProteinGraphConfig(**config_dict)
    print(parsed_config)
    return parsed_config
def test_protein_graph_config():
    """Test the protein graph config yaml parser."""
    # Reference config built from the in-code dictionary.
    expected = ProteinGraphConfig(**protein_graph_config)
    # Config parsed from the YAML fixture; must compare equal to the above.
    yml_path = DATA_PATH / "test_protein_graph_config.yml"
    parsed = parse_config(yml_path)
    assert expected == parsed
"insertions": False, "verbose": False, "pdb_dir": "../examples/pdbs/", "get_contacts_config": GetContactsConfig( contacts_dir="../examples/contacts/", pdb_dir="../examples/contacts/", ), "dssp_config": DSSPConfig(), } config = ProteinGraphConfig(**configs) config.edge_construction_functions = [ salt_bridge, hydrogen_bond, van_der_waals, pi_cation, pi_stacking, hydrophobic, t_stacking, ] # Test High-level API # Iterate over rows to produce Graph, pickle graph and label for row in tqdm(range(len(df))): example = df.iloc[row]
from graphein.protein.graphs import construct_graph # Test Point cloud plotting # v, f, a = create_mesh(pdb_code="3eiy") # m = convert_verts_and_face_to_mesh(v, f) # plot_pointcloud(m, "Test") # TEST PROTEIN STRUCTURE GRAPH PLOTTING configs = { "granularity": "atom", "keep_hets": False, "deprotonate": True, "insertions": False, "verbose": False, } config = ProteinGraphConfig(**configs) config.edge_construction_functions = [ add_atomic_edges, add_ring_status, add_bond_order, ] config.node_metadata_functions = [meiler_embedding, expasy_protein_scale] g = construct_graph( config=config, pdb_path="../examples/pdbs/3eiy.pdb", pdb_code="3eiy" ) p = plotly_protein_structure_graph( g, 30,
for i, (_, _, feat_dict) in enumerate(G.edges(data=True)): for key, value in feat_dict.items(): data[str(key)] = (list(value) if i == 0 else data[str(key)] + list(value)) # Add graph-level features for feat_name in G.graph: data[str(feat_name)] = [G.graph[feat_name]] data["edge_index"] = edge_index.view(2, -1) data = Data.from_dict(data) data.num_nodes = G.number_of_nodes() return data if __name__ == "__main__": from graphein.protein.config import ProteinGraphConfig from graphein.protein.graphs import construct_graph g = construct_graph(pdb_code="3eiy", config=ProteinGraphConfig()) assert type(g) is nx.Graph # print(SUPPORTED_FORMATS) convertor = GraphFormatConvertor(src_format="nx", dst_format="pyg", verbose="gnn") pyg = convertor(g) assert type(pyg) is torch_geometric.data.Data
def parse_protein_graph_config(config_dict):
    """Build a ProteinGraphConfig from ``config_dict``, print it, return it.

    :param config_dict: Mapping unpacked as keyword arguments to
        ProteinGraphConfig.
    :return: The constructed config object.
    """
    # Imported inside the function, as in the original.
    from graphein.protein.config import ProteinGraphConfig

    parsed_config = ProteinGraphConfig(**config_dict)
    print(parsed_config)
    return parsed_config