コード例 #1
0
ファイル: test_subgraphs.py プロジェクト: a-r-j/graphein
def test_extract_subgraph_from_bond_type():
    """Check bond-type subgraph extraction in normal and inverse mode.

    A graph is built from the bundled 4hhb structure. The ``"ionic"``
    subgraph is extracted and its edges are checked, then the extraction is
    repeated with ``inverse=True`` and the complementary checks are run.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    # NOTE(review): this config is constructed but never handed to
    # ``construct_graph`` (the argument is commented out) -- confirm intent.
    config = ProteinGraphConfig(
        edge_construction_functions=[add_peptide_bonds, add_ionic_interactions]
    )
    G = construct_graph(pdb_path=str(pdb_file))  # , config=config)

    bond_types = ["ionic"]

    # --- Forward extraction: keep the requested bond types ---
    subgraph = extract_subgraph_by_bond_type(
        G, bond_types, filter_dataframe=True
    )

    # NOTE(review): the loops over the subgraph iterate ``kind`` as a
    # collection, so ``kind in bond_types`` may never be true here -- verify.
    for src, dst, attrs in G.edges(data=True):
        if attrs["kind"] not in bond_types:
            continue
        assert src in subgraph.nodes()
        assert dst in subgraph.nodes()
        assert (src, dst) in subgraph.edges()

    for _, _, attrs in subgraph.edges(data=True):
        for bond in attrs["kind"]:
            assert bond in bond_types

    # --- Inverse extraction: drop the requested bond types ---
    subgraph = extract_subgraph_by_bond_type(
        G, bond_types, filter_dataframe=True, inverse=True
    )

    for src, dst, attrs in G.edges(data=True):
        if attrs["kind"] not in bond_types:
            continue
        assert (src, dst) not in subgraph.edges()

    for _, _, attrs in subgraph.edges(data=True):
        for bond in attrs["kind"]:
            assert bond not in bond_types
コード例 #2
0
def test_sequence_features():
    """Check that sequence featurisers add graph-level features per chain.

    Only the BioVec embedding and molecular weight featurisers are enabled;
    the ESM featurisers (and the node-level checks for them) are disabled.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    featuriser_kwargs = {
        "graph_metadata_functions": [
            # esm_sequence_embedding,
            # esm_residue_embedding,
            biovec_sequence_embedding,
            molecular_weight,
        ]
    }
    G = construct_graph(
        pdb_path=str(pdb_file),
        config=ProteinGraphConfig(**featuriser_kwargs),
    )

    # Node-level checks are disabled along with the ESM featurisers.
    # TODO: this can probably be improved; it only checked the existence and
    # shape of the per-node ESM embedding.
    # for n, d in G.nodes(data=True):
    #     assert "esm_embedding" in d
    #     assert len(d["esm_embedding"]) == 1280

    # Every chain must expose its sequence plus one entry per featuriser.
    for chain in G.graph["chain_ids"]:
        expected_keys = (
            f"sequence_{chain}",
            # f"esm_embedding_{chain}",
            f"biovec_embedding_{chain}",
            f"molecular_weight_{chain}",
        )
        for key in expected_keys:
            assert key in G.graph
コード例 #3
0
def test_node_features():
    """Build a 4hhb graph with node featurisers enabled.

    TODO: this test requires attention. All per-node assertions are
    currently disabled, so it only checks that graph construction with the
    selected featurisers does not raise.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    featuriser_kwargs = {
        "node_metadata_functions": [
            # TODO: the node data assignment flow needs refactoring.
            expasy_protein_scale,
            meiler_embedding,
            # rsa,
            # asa,
            # phi,
            # psi,
            # secondary_structure,
            # partial(aaindex1, accession="FAUJ880111"),
        ]
    }
    G = construct_graph(
        pdb_path=str(pdb_file),
        config=ProteinGraphConfig(**featuriser_kwargs),
    )

    # Placeholder loop: the feature-existence checks below are disabled.
    for _node, _attrs in G.nodes(data=True):
        # TODO: these functions return pd.Series rather than adding to the node.
        # assert "meiler_embedding" in d
        # assert expasy_protein_scale in d
        # assert "rsa" in d
        # assert "asa" in d
        # assert "phi" in d
        # assert "psi" in d
        # assert "secondary_structure" in d
        pass
コード例 #4
0
def test_insertion_handling():
    """Check node counts agree with sequence lengths and coords for 6OGE.

    The graph is built with ``insertions`` disabled; the summed lengths of the
    five chain sequences and the number of coordinate rows must both equal
    the number of nodes.
    """
    edge_functions = [
        add_peptide_bonds,
        add_hydrogen_bond_interactions,
        add_ionic_interactions,
        add_aromatic_sulphur_interactions,
        add_hydrophobic_interactions,
        add_cation_pi_interactions,
    ]
    configs = {
        "granularity": "CA",
        "keep_hets": False,
        "insertions": False,
        "verbose": False,
        "node_metadata_functions": [meiler_embedding, expasy_protein_scale],
        "edge_construction_functions": edge_functions,
    }

    # This is a nasty PDB with a lot of insertions and altlocs
    g = construct_graph(config=ProteinGraphConfig(**configs), pdb_code="6OGE")

    sequence_keys = (
        "sequence_A",
        "sequence_B",
        "sequence_C",
        "sequence_D",
        "sequence_E",
    )
    total_residues = sum(len(g.graph[key]) for key in sequence_keys)

    assert total_residues == len(g)
    assert g.graph["coords"].shape[0] == len(g)
コード例 #5
0
def test_distance_edges():
    """Example-based smoke test for the edge construction functions.

    Builds a graph from the bundled 4hhb PDB file with the full list of edge
    functions enabled and checks that a graph object is returned.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    # Parameterised edge functions are bound up front for readability.
    knn_edges = partial(add_k_nn_edges, k=5, long_interaction_threshold=10)
    threshold_edges = partial(
        add_distance_threshold,
        threshold=12,
        long_interaction_threshold=10,
    )

    edge_functions = {
        "edge_construction_functions": [
            knn_edges,
            add_hydrophobic_interactions,
            # NOTE(review): an older comment said this was "removed for now as
            # ring centroids require precomputing", yet it is still enabled --
            # confirm which is intended.
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_delaunay_triangulation,
            add_cation_pi_interactions,
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            threshold_edges,
        ]
    }
    G = construct_graph(
        pdb_path=str(pdb_file),
        config=ProteinGraphConfig(**edge_functions),
    )
    assert G is not None
コード例 #6
0
ファイル: test_amino_acid.py プロジェクト: a-r-j/graphein
def test_amino_acid_one_hot_example():
    """Example-based test on 4hhb for ``amino_acid_one_hot``.

    Runs the featuriser with its default settings and again with
    ``return_array=False``; each node's encoding must sum to one, and in the
    second case the position of the maximum must map to the node's residue.
    """
    # Default output (labelled the "np array" case by the original author).
    array_config = ProteinGraphConfig(
        node_metadata_functions=[amino_acid_one_hot]
    )
    g = construct_graph(pdb_code="4hhb", config=array_config)

    for _, attrs in g.nodes(data=True):
        assert sum(attrs["amino_acid_one_hot"]) == 1

    # ``return_array=False`` output (labelled the "pd.Series" case).
    series_featuriser = partial(amino_acid_one_hot, return_array=False)
    series_config = ProteinGraphConfig(
        node_metadata_functions=[series_featuriser]
    )
    g = construct_graph(pdb_code="4hhb", config=series_config)

    for _, attrs in g.nodes(data=True):
        assert sum(attrs["amino_acid_one_hot"]) == 1
        expected_code = RESI_THREE_TO_1[attrs["residue_name"]]
        assert attrs["amino_acid_one_hot"].idxmax() == expected_code
コード例 #7
0
ファイル: test_geometry.py プロジェクト: a-r-j/graphein
def test_add_sidechain_vector():
    """Check ``add_sidechain_vector`` with and without scaling on 1lds.

    Scaled pass: every node carries a ``sidechain_vector`` of shape ``(3,)``
    that is all zeros for glycine and has norm close to 1 otherwise.
    Unscaled pass: glycine is still zero, and for every other residue
    ``coords + sidechain_vector`` matches the per-node mean of the
    ``rgroup_df`` coordinates.
    """
    zero_vector = np.array([0.0, 0.0, 0.0])
    coord_columns = ["x_coord", "y_coord", "z_coord"]

    # --- Scaled vectors ---
    scaled_config = ProteinGraphConfig(
        edge_construction_functions=[partial(add_sidechain_vector, scale=True)],
    )
    g = construct_graph(pdb_code="1lds", config=scaled_config)

    for _, attrs in g.nodes(data=True):
        # Attribute must exist and be three-dimensional.
        assert "sidechain_vector" in attrs.keys()
        assert attrs["sidechain_vector"].shape == (3,)

        if attrs["residue_name"] == "GLY":
            # Glycine vectors are expected to be zero.
            np.testing.assert_equal(attrs["sidechain_vector"], zero_vector)
            continue

        # Scaled vectors are expected to have norm close to 1.
        norm = np.linalg.norm(attrs["sidechain_vector"])
        np.testing.assert_almost_equal(norm, 1.0)

    # --- Unscaled vectors ---
    unscaled_config = ProteinGraphConfig(
        edge_construction_functions=[partial(add_sidechain_vector, scale=False)],
    )
    g = construct_graph(pdb_code="1lds", config=unscaled_config)

    for node_id, attrs in g.nodes(data=True):
        if attrs["residue_name"] == "GLY":
            np.testing.assert_equal(attrs["sidechain_vector"], zero_vector)
            continue

        # The vector added to the node coordinates should land on the mean
        # position of the node's rows in ``rgroup_df``.
        rgroup_means = g.graph["rgroup_df"].groupby("node_id").mean()
        sc_true = np.array(rgroup_means.loc[node_id][coord_columns])
        np.testing.assert_almost_equal(
            sc_true, attrs["coords"] + attrs["sidechain_vector"]
        )
コード例 #8
0
ファイル: test_geometry.py プロジェクト: a-r-j/graphein
def test_add_beta_carbon_vector():
    """Check ``add_beta_carbon_vector`` with and without scaling on 1lds.

    Scaled pass: every node carries a ``c_beta_vector`` of shape ``(3,)``
    that is all zeros for glycine and has norm close to 1 otherwise.
    Unscaled pass: glycine is still zero, and for every other residue
    ``coords + c_beta_vector`` matches the CB atom coordinates found in the
    raw PDB dataframe.
    """
    zero_vector = np.array([0.0, 0.0, 0.0])
    coord_columns = ["x_coord", "y_coord", "z_coord"]

    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=True)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    # Kept from the first graph and reused for the unscaled checks below.
    raw_pdb = g.graph["raw_pdb_df"]
    for n, d in g.nodes(data=True):
        # Check that the node has the correct attributes
        assert "c_beta_vector" in d.keys()
        # Check the vector is of the correct dimensionality
        assert d["c_beta_vector"].shape == (3,)

        if d["residue_name"] == "GLY":
            # Check glycines are zero
            np.testing.assert_equal(d["c_beta_vector"], zero_vector)
        else:
            # Check scaled vector has norm close to 1
            np.testing.assert_almost_equal(
                np.linalg.norm(d["c_beta_vector"]), 1.0
            )

    # Test unscaled vector
    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=False)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    # Select the CB rows once. The per-node mask below is then built from the
    # same frame it indexes, so pandas never has to reindex a full-length
    # boolean mask against an already-filtered frame (which emits a warning).
    cb_atoms = raw_pdb[raw_pdb["atom_name"] == "CB"]

    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            # Check glycines are zero
            np.testing.assert_equal(d["c_beta_vector"], zero_vector)
        else:
            # Check the vector is pointing in the correct direction
            cb_true = np.array(
                cb_atoms.loc[cb_atoms["node_id"] == n, coord_columns]
            ).T.squeeze()
            np.testing.assert_almost_equal(
                cb_true, d["coords"] + d["c_beta_vector"]
            )
コード例 #9
0
ファイル: test_subgraphs.py プロジェクト: a-r-j/graphein
def test_surface_subgraph():
    """Tests surface subgraph extraction.

    Every node in the extracted subgraph must have ``rsa`` at or above the
    threshold, and every node of the full graph meeting the threshold must be
    present in the subgraph.
    """
    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[rsa], dssp_config=DSSPConfig()
    )
    G = construct_graph(pdb_path=str(file_path), config=config)

    RSA_THRESHOLD: float = 0.2
    s_g = extract_surface_subgraph(G, RSA_THRESHOLD, filter_dataframe=True)

    # No node below the threshold may remain in the subgraph.
    for n, d in s_g.nodes(data=True):
        assert d["rsa"] >= RSA_THRESHOLD

    # No qualifying node may be missing from the subgraph.
    for n, d in G.nodes(data=True):
        if d["rsa"] >= RSA_THRESHOLD:
            # Use a real message: ``print(...)`` returns None, so it would
            # leave the AssertionError without any diagnostic text.
            assert n in s_g.nodes(), f"{n} {d}"
コード例 #10
0
ファイル: test_subgraphs.py プロジェクト: a-r-j/graphein
def test_secondary_structure_subgraph():
    """Check subgraph extraction by secondary-structure element.

    Every node kept in the subgraph must have an ``ss`` value in the
    requested set, and every node of the full graph with such a value must
    appear in the subgraph.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(
        pdb_path=str(pdb_file),
        config=ProteinGraphConfig(
            graph_metadata_functions=[secondary_structure],
            dssp_config=DSSPConfig(),
        ),
    )

    wanted_elements: List[str] = ["H"]
    subgraph = extract_subgraph_from_secondary_structure(
        G, wanted_elements, filter_dataframe=True
    )

    # Nothing outside the requested elements may remain.
    for _, attrs in subgraph.nodes(data=True):
        assert attrs["ss"] in wanted_elements

    # Nothing matching the requested elements may be missing.
    for node_id, attrs in G.nodes(data=True):
        if attrs["ss"] not in wanted_elements:
            continue
        assert node_id in subgraph.nodes()
コード例 #11
0
def test_edges_do_not_add_nodes_for_chain_subset():
    """Check per-chain graphs of 2vvi have the expected number of nodes.

    A graph is built for each single-chain selection with several edge
    functions enabled; the node count must equal the expected value for that
    chain.
    """
    new_funcs = {
        "edge_construction_functions": [
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_cation_pi_interactions,
        ],
    }
    config = ProteinGraphConfig(**new_funcs)

    # (chain selection, expected node count), checked in this order.
    expected_sizes = (
        ("A", 217),
        ("B", 219),
        ("C", 222),
        ("D", 219),
    )
    for chain, n_nodes in expected_sizes:
        g = construct_graph(
            config=config, pdb_code="2vvi", chain_selection=chain
        )
        assert len(g) == n_nodes
コード例 #12
0
ファイル: test_subgraphs.py プロジェクト: a-r-j/graphein
def test_successful_pickle():
    """Tests subgraphs can be successfully pickled and unpickled.

    A residue-type subgraph of 4hhb is written to a pickle file, read back,
    and compared with the original via ``nx.is_isomorphic``.
    """
    # Local import so this block does not rely on a module-level import.
    import tempfile

    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[secondary_structure],
        dssp_config=DSSPConfig(),
    )
    G = construct_graph(pdb_path=str(file_path), config=config)
    s_g = extract_subgraph_from_residue_types(
        G,
        residue_types=["ALA", "SER", "MET"],
        update_coords=True,
        filter_dataframe=True,
        recompute_distmat=True,
    )

    # Use a private temporary directory instead of a fixed "/tmp" path: it is
    # portable, cannot collide with concurrent test runs, and is removed
    # automatically when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pickle_path = Path(tmp_dir) / "test_graph.p"

        with open(pickle_path, "wb") as f:
            pickle.dump(s_g, f)

        # Safe to unpickle: the file was written by this test just above.
        with open(pickle_path, "rb") as f:
            loaded_graph = pickle.load(f)

    assert nx.is_isomorphic(s_g, loaded_graph)
コード例 #13
0
ファイル: graphs.py プロジェクト: BatoolMM/graphein
def construct_graph(
    config: Optional[ProteinGraphConfig] = None,
    pdb_path: Optional[str] = None,
    pdb_code: Optional[str] = None,
    chain_selection: str = "all",
    df_processing_funcs: Optional[List[Callable]] = None,
    edge_construction_funcs: Optional[List[Callable]] = None,
    edge_annotation_funcs: Optional[List[Callable]] = None,
    node_annotation_funcs: Optional[List[Callable]] = None,
    graph_annotation_funcs: Optional[List[Callable]] = None,
) -> nx.Graph:
    """
    Constructs protein structure graph from a pdb_code or pdb_path. Users can provide a ProteinGraphConfig object.

    Config parameters can be overridden by passing the corresponding function
    lists directly to this function: any list argument that is not ``None``
    replaces the matching field on ``config``. The override is written onto
    the ``config`` object itself, which is also stored in ``g.graph["config"]``.

    :param config: ProteinGraphConfig object. If None, a default ``ProteinGraphConfig()`` is created
    :type config: graphein.protein.config.ProteinGraphConfig, optional
    :param pdb_path: Path to pdb_file to build graph from
    :type pdb_path: str, optional
    :param pdb_code: 4-character PDB accession pdb_code to build graph from.
        If omitted while ``pdb_path`` is given, it is derived from the filename
    :type pdb_code: str, optional
    :param chain_selection: String of polypeptide chains to include in graph. E.g "ABDF" or "all"
    :type chain_selection: str, optional
    :param df_processing_funcs: List of dataframe processing functions
    :type df_processing_funcs: List[Callable], optional
    :param edge_construction_funcs: List of edge construction functions
    :type edge_construction_funcs: List[Callable], optional
    :param edge_annotation_funcs: List of edge annotation functions
    :type edge_annotation_funcs: List[Callable], optional
    :param node_annotation_funcs: List of node annotation functions
    :type node_annotation_funcs: List[Callable], optional
    :param graph_annotation_funcs: List of graph annotation functions
    :type graph_annotation_funcs: List[Callable], optional
    :return: Protein Structure Graph
    :rtype: nx.Graph
    """

    # If no config is provided, use default
    if config is None:
        config = ProteinGraphConfig()

    # Get name from pdb_file if no pdb_code is provided
    if pdb_path and (pdb_code is None):
        pdb_code = get_protein_name_from_filename(pdb_path)

    # Explicitly passed function lists take precedence over the config.
    # Previously they were only used when the config field was None, which
    # silently ignored the caller's override whenever the config had a value.
    if df_processing_funcs is not None:
        config.protein_df_processing_functions = df_processing_funcs
    if edge_construction_funcs is not None:
        config.edge_construction_functions = edge_construction_funcs
    if node_annotation_funcs is not None:
        config.node_metadata_functions = node_annotation_funcs
    if graph_annotation_funcs is not None:
        config.graph_metadata_functions = graph_annotation_funcs
    if edge_annotation_funcs is not None:
        config.edge_metadata_functions = edge_annotation_funcs

    raw_df = read_pdb_to_dataframe(
        pdb_path,
        pdb_code,
        verbose=config.verbose,
        granularity=config.granularity,
    )
    protein_df = process_dataframe(
        raw_df,
        chain_selection=chain_selection,
        granularity=config.granularity,
    )

    # Initialise graph with metadata
    g = initialise_graph_with_metadata(
        protein_df=protein_df,
        raw_pdb_df=raw_df.df["ATOM"],
        pdb_id=pdb_code,
        granularity=config.granularity,
    )
    # Add nodes to graph
    g = add_nodes_to_graph(g)

    # Add config to graph
    g.graph["config"] = config

    # Annotate additional node metadata
    if config.node_metadata_functions is not None:
        g = annotate_node_metadata(g, config.node_metadata_functions)

    # Compute graph edges
    g = compute_edges(
        g,
        funcs=config.edge_construction_functions,
        get_contacts_config=None,
    )

    # Annotate additional graph metadata
    if config.graph_metadata_functions is not None:
        g = annotate_graph_metadata(g, config.graph_metadata_functions)

    # Annotate additional edge metadata
    if config.edge_metadata_functions is not None:
        g = annotate_edge_metadata(g, config.edge_metadata_functions)

    return g
コード例 #14
0
ファイル: graphs.py プロジェクト: BatoolMM/graphein
if __name__ == "__main__":
    from functools import partial

    from graphein.protein.edges.distance import add_k_nn_edges
    from graphein.protein.features.sequence.sequence import molecular_weight

    configs = {
        "granularity": "CA",
        "keep_hets": False,
        "insertions": False,
        "verbose": False,
        "get_contacts_config": GetContactsConfig(),
        "dssp_config": DSSPConfig(),
        "graph_metadata_functions": [molecular_weight],
    }
    config = ProteinGraphConfig(**configs)
    config.edge_construction_functions = [
        partial(add_k_nn_edges, k=3, long_interaction_threshold=0)
    ]
    # Test High-level API
    g = construct_graph(
        config=config,
        pdb_path="../examples/pdbs/3eiy.pdb",
    )
    """
    # Test Low-level API
    raw_df = read_pdb_to_dataframe(
        pdb_path="../../examples/pdbs/3eiy.pdb",
        verbose=config.verbose,
    )
コード例 #15
0
def parse_protein_graph_config(config_dict):
    """Build a ``ProteinGraphConfig`` from ``config_dict``, print it and return it.

    :param config_dict: Mapping unpacked as keyword arguments to ``ProteinGraphConfig``
    :return: The constructed config object
    """
    parsed_config = ProteinGraphConfig(**config_dict)
    # The parsed config is echoed to stdout before being returned.
    print(parsed_config)
    return parsed_config
コード例 #16
0
def test_protein_graph_config():
    """Check the YAML config parser reproduces the reference protein graph config."""
    # Reference config built directly from the in-code settings.
    expected_config = ProteinGraphConfig(**protein_graph_config)
    # Same settings, loaded through the YAML parser.
    parsed_config = parse_config(DATA_PATH / "test_protein_graph_config.yml")
    assert expected_config == parsed_config
コード例 #17
0
        "insertions":
        False,
        "verbose":
        False,
        "pdb_dir":
        "../examples/pdbs/",
        "get_contacts_config":
        GetContactsConfig(
            contacts_dir="../examples/contacts/",
            pdb_dir="../examples/contacts/",
        ),
        "dssp_config":
        DSSPConfig(),
    }

    config = ProteinGraphConfig(**configs)

    config.edge_construction_functions = [
        salt_bridge,
        hydrogen_bond,
        van_der_waals,
        pi_cation,
        pi_stacking,
        hydrophobic,
        t_stacking,
    ]
    # Test High-level API

    # Iterate over rows to produce Graph, pickle graph and label
    for row in tqdm(range(len(df))):
        example = df.iloc[row]
コード例 #18
0
    from graphein.protein.graphs import construct_graph

    # Test Point cloud plotting
    # v, f, a = create_mesh(pdb_code="3eiy")
    # m = convert_verts_and_face_to_mesh(v, f)
    # plot_pointcloud(m, "Test")
    # TEST PROTEIN STRUCTURE GRAPH PLOTTING
    configs = {
        "granularity": "atom",
        "keep_hets": False,
        "deprotonate": True,
        "insertions": False,
        "verbose": False,
    }

    config = ProteinGraphConfig(**configs)
    config.edge_construction_functions = [
        add_atomic_edges,
        add_ring_status,
        add_bond_order,
    ]

    config.node_metadata_functions = [meiler_embedding, expasy_protein_scale]

    g = construct_graph(
        config=config, pdb_path="../examples/pdbs/3eiy.pdb", pdb_code="3eiy"
    )

    p = plotly_protein_structure_graph(
        g,
        30,
コード例 #19
0
ファイル: conversion.py プロジェクト: a-r-j/graphein
    for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
        for key, value in feat_dict.items():
            data[str(key)] = (list(value) if i == 0 else data[str(key)] +
                              list(value))

    # Add graph-level features
    for feat_name in G.graph:
        data[str(feat_name)] = [G.graph[feat_name]]

    data["edge_index"] = edge_index.view(2, -1)
    data = Data.from_dict(data)
    data.num_nodes = G.number_of_nodes()

    return data


if __name__ == "__main__":
    # Manual smoke test: build a graph for 3eiy with the default config and
    # convert it from networkx to a PyTorch Geometric ``Data`` object.
    from graphein.protein.config import ProteinGraphConfig
    from graphein.protein.graphs import construct_graph

    default_config = ProteinGraphConfig()
    g = construct_graph(pdb_code="3eiy", config=default_config)
    assert type(g) is nx.Graph

    # print(SUPPORTED_FORMATS)

    # NOTE(review): ``verbose`` is given the string "gnn" here -- confirm this
    # is a value the convertor accepts.
    convertor = GraphFormatConvertor(
        src_format="nx",
        dst_format="pyg",
        verbose="gnn",
    )
    pyg = convertor(g)
    assert type(pyg) is torch_geometric.data.Data
コード例 #20
0
ファイル: utils.py プロジェクト: BatoolMM/graphein
def parse_protein_graph_config(config_dict):
    """Build a ``ProteinGraphConfig`` from ``config_dict``, print it and return it.

    :param config_dict: Mapping unpacked as keyword arguments to ``ProteinGraphConfig``
    :return: The constructed config object
    """
    # Imported inside the function, as in the original implementation.
    from graphein.protein.config import ProteinGraphConfig

    parsed_config = ProteinGraphConfig(**config_dict)
    # The parsed config is echoed to stdout before being returned.
    print(parsed_config)
    return parsed_config