def from_reaction_template(cls, template_smarts):
    """Build a single graph from a reaction template given as SMARTS.

    Every reactant and every product of the template is converted to a
    networkx graph and merged, in that order, with ``nx.disjoint_union``.
    The merged graph is then wrapped through ``_from_networkx(cls, ...)``.

    NOTE(review): the first parameter is ``cls``, so this is presumably
    decorated as a classmethod outside the visible snippet -- confirm.

    Parameters
    ----------
    template_smarts:
        Reaction template string passed to ``ReactionTemplate``.

    Returns
    -------
    The object produced by ``_from_networkx(cls, reaction)``.

    Raises
    ------
    RuntimeError
        If the largest ``ReactingCenter`` node value is ``<= 0`` (reported
        as "no reacting atoms in reactants") or the smallest one is
        ``>= 0`` (reported as "no reacting atoms in products").
    """
    template = ReactionTemplate(template_smarts)

    def _config_for(role):
        # Same conversion settings for both sides; only the role differs.
        return rdkit_config(reaction_center=template.ReactingAtomsMN,
                            reactant_or_product=role,
                            IsSanitized=False,
                            set_morgan_identifier=False)

    def _as_networkx(mol, config):
        return Graph.from_rdkit(mol, config).to_networkx()

    # Reactants: the first one seeds the union, the rest are appended.
    reactant_config = _config_for('reactant')
    reaction = _as_networkx(template.reactants[0], reactant_config)
    for mol in template.reactants[1:]:
        reaction = nx.disjoint_union(reaction,
                                     _as_networkx(mol, reactant_config))

    # Products are appended after all reactants.
    product_config = _config_for('product')
    for mol in template.products:
        reaction = nx.disjoint_union(reaction,
                                     _as_networkx(mol, product_config))

    graph = _from_networkx(cls, reaction)

    if graph.nodes.to_pandas()['ReactingCenter'].max() <= 0:
        raise RuntimeError(f'No reacting atoms are found in reactants: '
                           f'{template_smarts}')
    if graph.nodes.to_pandas()['ReactingCenter'].min() >= 0:
        raise RuntimeError(f'No reacting atoms are found in products: '
                           f'{template_smarts}')
    return graph
def test_molecular_kernel():
    """Exercise Tang2019MolecularKernel on H2, O2 and CH4.

    Checks that the graph-level and node-level similarity matrices have
    the expected shapes and a unit diagonal after normalization, and that
    a start probability of zero on carbon (element == 6) zeroes out the
    corresponding rows and columns of the nodal matrix.
    """
    molecules = [molecule(formula) for formula in ('H2', 'O2', 'CH4')]
    graphs = [Graph.from_ase(mol) for mol in molecules]
    kernel = Tang2019MolecularKernel(starting_probability='uniform')

    # Graph-level similarity, normalized as D^-1/2 R D^-1/2.
    R = kernel(graphs)
    scale = np.diag(np.diag(R)**-0.5)
    K = scale.dot(R).dot(scale)
    assert R.shape == (3, 3)
    for self_similarity in np.diag(K):
        assert self_similarity == pytest.approx(1, 1e-6)

    # Node-level similarity: one row/column per atom over all molecules.
    R_nodal = kernel(graphs, nodal=True)
    scale_nodal = np.diag(np.diag(R_nodal)**-0.5)
    K_nodal = scale_nodal.dot(R_nodal).dot(scale_nodal)
    natoms = np.sum([len(mol) for mol in molecules])
    assert R_nodal.shape == (natoms, natoms)
    for self_similarity in np.diag(K_nodal):
        assert self_similarity == pytest.approx(1, 1e-6)

    # Random walks may not start on carbon atoms.
    kernel_nocarbon = Tang2019MolecularKernel(
        starting_probability=lambda n: 0.0 if n[1]['element'] == 6 else 1.0)
    R_nocarbon_nodal = kernel_nocarbon(graphs, nodal=True)

    # k is the running atom index across all molecules.
    every_atom = (atom for mol in molecules for atom in mol)
    for k, atom in enumerate(every_atom):
        if atom.symbol == 'C':
            assert R_nocarbon_nodal[k, :].sum() == 0
            assert R_nocarbon_nodal[:, k].sum() == 0
def test_mlgk_on_permuted_graph():
    """The kernel value must not change when a graph's nodes are permuted.

    Ten random permutations of benzene are each compared against the
    original graph; the cross similarity has to match the self similarity.
    """
    g = Graph.from_ase(molecule('C6H6'))
    n_trials = 10
    for _ in range(n_trials):
        shuffled_order = np.random.permutation(len(g.nodes))
        h = g.permute(shuffled_order)

        node_kernel = TensorProduct(element=KroneckerDelta(0.5))
        edge_kernel = TensorProduct(length=SquareExponential(0.1))
        kernel = MarginalizedGraphKernel(node_kernel, edge_kernel)

        cross_similarity = kernel([g], [h]).item()
        self_similarity = kernel([g]).item()
        assert cross_similarity == pytest.approx(self_similarity)
def test_marginalized_graph_kernel_2nd_launch(benchmark, batch):
    """Benchmark a graph-level kernel evaluation on a batch of graphs.

    ``benchmark`` is the pytest-benchmark fixture; ``batch`` is forwarded
    to ``make_graphs`` together with the constant 48.

    NOTE(review): the name says "2nd launch" but ``warmup_rounds`` is 0,
    so the first timed round includes whatever a first call costs --
    confirm this is intended.
    """
    graphs = [
        Graph.from_networkx(nx_graph, weight='weight')
        for nx_graph in make_graphs(batch, 48)
    ]
    kernel = MarginalizedGraphKernel(
        TensorProduct(label=KroneckerDelta(0.5)),
        TensorProduct(label=KroneckerDelta(0.5)),
    )

    # Timed call is kernel(graphs, nodal=False); its result is discarded.
    benchmark.pedantic(kernel,
                       args=(graphs,),
                       kwargs={'nodal': False},
                       iterations=3,
                       rounds=3,
                       warmup_rounds=0)
def test_mlgk_typecheck():
    """Mixing graphs with different attribute layouts must raise TypeError.

    An unlabeled, a labeled and a weighted graph are paired in every
    ordered combination of two distinct graphs; each pair must be rejected.
    """
    mlgk = MarginalizedGraphKernel(Constant(1.0), Constant(1.0), q=0.5)

    G = [
        Graph.from_networkx(unlabeled_graph1),
        Graph.from_networkx(labeled_graph1),
        Graph.from_networkx(weighted_graph1, weight='w'),
    ]

    # Same six ordered pairs, in the same order, as the explicit checks.
    mismatched_pairs = [(0, 1), (0, 2), (1, 2), (1, 0), (2, 0), (2, 1)]
    for first, second in mismatched_pairs:
        with pytest.raises(TypeError):
            mlgk([G[first], G[second]])
def test_octile_graph_weighted():
    """Check the OctileGraph built from a small weighted, labeled graph.

    Verifies the node count, that the three pointer attributes are
    non-zero and cannot be assigned, and that the node/edge struct types
    expose exactly the attribute columns of the source graph (edge
    bookkeeping columns '!i', '!j' and '!w' excluded).
    """
    assert OctileGraph.dtype.isalignedstruct

    node_table = {
        '!i': [0, 1, 2],
        'charge': [1, -1, 2],
        'conjugate': [False, True, True],
        'hybridization': [2, 3, 1]
    }
    edge_table = {
        '!i': [0, 0],
        '!j': [1, 2],
        'length': [0.5, 1.0],
        '!w': [1.0, 2.0]
    }
    dfg = Graph(nodes=node_table, edges=edge_table, title='H2O')

    og = OctileGraph(dfg)
    assert og.n_node == len(dfg.nodes)

    pointer_fields = ('p_octile', 'p_degree', 'p_node')
    for field in pointer_fields:
        assert getattr(og, field) != 0
    # The pointer attributes are expected to be read-only.
    for field in pointer_fields:
        with pytest.raises(AttributeError):
            setattr(og, field, np.uintp(0))

    # Node struct: only known columns, and all three labels present.
    assert og.node_t.isalignedstruct
    for name in og.node_t.names:
        assert name in dfg.nodes.columns
    for label in ('charge', 'conjugate', 'hybridization'):
        assert label in og.node_t.names

    # Edge struct: exactly a weight plus a nested label record.
    assert og.edge_t.isalignedstruct
    assert len(og.edge_t.names) == 2
    assert 'weight' in og.edge_t.names
    assert 'label' in og.edge_t.names
    for name in og.edge_t['label'].names:
        assert name in dfg.edges.columns
    for name in dfg.edges.columns:
        if name not in ('!i', '!j', '!w'):
            assert name in og.edge_t['label'].names
def test_molecular_kernel_custom_pstart():
    """A start probability of zero on carbon must zero its nodal entries.

    The start probability is given as a pair: a Python callable operating
    on the node table and a string expression. Every row and column of the
    nodal similarity matrix that belongs to a carbon atom must sum to 0.
    """
    molecules = [molecule(formula) for formula in ('H2', 'O2', 'CH4')]
    graphs = [Graph.from_ase(mol) for mol in molecules]

    kernel_nocarbon = Tang2019MolecularKernel(
        starting_probability=(
            lambda ns: np.where(ns.element == 6, 0.0, 1.0),
            'n.element == 6 ? 0.f : 1.f',
        ))
    R_nocarbon_nodal = kernel_nocarbon(graphs, nodal=True)

    # Positions in this flat list are atom indices across all molecules.
    symbols = [atom.symbol for mol in molecules for atom in mol]
    for k, symbol in enumerate(symbols):
        if symbol == 'C':
            assert R_nocarbon_nodal[k, :].sum() == 0
            assert R_nocarbon_nodal[:, k].sum() == 0
def test_octile_graph_weighted():
    """Check the OctileGraph built from a small weighted, labeled graph.

    Verifies node count, padding to a multiple of 8, the octile count,
    that the three pointer attributes are non-zero and cannot be assigned,
    and that node/edge struct types mirror the source graph's attribute
    columns (edge bookkeeping columns '!ij' and '!w' excluded).
    """
    assert OctileGraph.dtype.isalignedstruct

    node_table = {
        'index': [0, 1, 2],
        'columns': ['charge', 'conjugate', 'hybridization'],
        'data': [[1, False, 2], [-1, True, 3], [2, True, 1]]
    }
    edge_table = {
        'index': [0, 1],
        'columns': ['!ij', 'length', '!w'],
        'data': [[(0, 1), 0.5, 1.0], [(0, 2), 1.0, 2.0]]
    }
    dfg = Graph(nodes=node_table, edges=edge_table, title='H2O')

    og = OctileGraph(dfg)
    assert og.n_node == len(dfg.nodes)
    # Padded size covers all nodes and is a whole number of 8-wide tiles.
    assert og.padded_size >= og.n_node
    assert og.padded_size % 8 == 0
    assert og.n_octile == (og.padded_size // 8)**2

    pointer_fields = ('p_octile', 'p_degree', 'p_node')
    for field in pointer_fields:
        assert getattr(og, field) != 0
    # The pointer attributes are expected to be read-only.
    for field in pointer_fields:
        with pytest.raises(AttributeError):
            setattr(og, field, np.uintp(0))

    # Node struct fields and node columns must match in both directions.
    assert og.node_type.isalignedstruct
    for name in og.node_type.names:
        assert name in dfg.nodes.columns
    for name in dfg.nodes.columns:
        assert name in og.node_type.names

    # Edge struct: exactly a weight plus a nested label record.
    assert og.edge_type.isalignedstruct
    assert len(og.edge_type.names) == 2
    assert 'weight' in og.edge_type.names
    assert 'label' in og.edge_type.names
    for name in og.edge_type['label'].names:
        assert name in dfg.edges.columns
    label_columns = dfg.edges.drop(['!ij', '!w'], axis=1).columns
    for name in label_columns:
        assert name in og.edge_type['label'].names
def test_mlgk_self_loops():
    """Compare the kernel with the reference MLGK on dense random graphs.

    Each graph comes from a symmetric random matrix whose diagonal is
    generally non-zero, i.e. the graphs contain self loops. The RNG is
    seeded, so the ten cases are reproducible.
    """
    kedge = Constant(1.0)
    knode = Constant(1.0)
    q = 0.1
    mlgk = MarginalizedGraphKernel(knode, kedge, q=q)

    np.random.seed(2)
    for _ in range(10):
        size = np.random.randint(4, 20)
        noise = np.random.randn(size, size)
        adjacency = noise + noise.T  # symmetrize
        graph = Graph.from_networkx(nx.from_numpy_array(adjacency),
                                    weight='weight')

        computed = mlgk([graph]).item()
        reference = MLGK(graph, knode, kedge, q, q, nodal=False)
        assert computed == pytest.approx(reference, 5e-4)
def test_mlgk_fixed_hyperparameters():
    """Fixing a base-kernel hyperparameter must drop it from theta.

    A kernel with all hyperparameters variable is compared with three
    variants in which the edge, the node, or both base kernels have a
    fixed hyperparameter. Theta, bounds and the gradient's last axis must
    shrink by the number of fixed hyperparameters, while kernel values and
    the remaining gradient slices stay the same.
    """
    g = nx.Graph()
    for node, feature in enumerate([0, 1, 0]):
        g.add_node(node, feature=feature)
    for (u, v), attribute in [((0, 1), 1.0), ((0, 2), 2.0)]:
        g.add_edge(u, v, attribute=attribute)
    G = [Graph.from_networkx(g)]

    knodeV = TensorProduct(feature=KroneckerDelta(0.5))
    knodeF = TensorProduct(feature=KroneckerDelta(0.5, h_bounds='fixed'))
    kedgeV = TensorProduct(attribute=SquareExponential(1.0))
    kedgeF = TensorProduct(
        attribute=SquareExponential(1.0, length_scale_bounds='fixed'))

    full = MarginalizedGraphKernel(knodeV, kedgeV)
    # (kernel, number of hyperparameters fixed relative to `full`)
    restricted = [
        (MarginalizedGraphKernel(knodeV, kedgeF), 1),
        (MarginalizedGraphKernel(knodeF, kedgeV), 1),
        (MarginalizedGraphKernel(knodeF, kedgeF), 2),
    ]

    for kernel, n_fixed in restricted:
        assert len(full.theta) == len(kernel.theta) + n_fixed
    for kernel, n_fixed in restricted:
        assert len(full.bounds) == len(kernel.bounds) + n_fixed

    R_full, dR_full = full(G, eval_gradient=True)
    evaluated = [kernel(G, eval_gradient=True) for kernel, _ in restricted]

    for R, _ in evaluated:
        assert R_full == pytest.approx(R)
    for (_, dR), (_, n_fixed) in zip(evaluated, restricted):
        assert dR_full.shape[2] == dR.shape[2] + n_fixed
    # The active mask of each variant selects its slices of the full gradient.
    for (_, dR), (kernel, _) in zip(evaluated, restricted):
        assert dR_full[:, :, kernel.active_theta_mask] == pytest.approx(dR)
def test_mlgk_large():
    """Compare the kernel with the reference MLGK on a 24-node random graph.

    The graph is drawn from an unseeded random 0/1 matrix; only entries
    below the diagonal are used, so there are no self loops.
    """
    n = 24
    g = nx.Graph()
    coin_flips = np.random.randint(0, 2, (n, n))
    for i, row in enumerate(coin_flips):
        g.add_node(i, type=0)
        # Connect node i to every earlier node j whose flip came up 1.
        g.add_edges_from(
            ((i, j) for j, pred in enumerate(row[:i]) if pred),
            weight=1)

    dfg = Graph.from_networkx(g, weight='weight')

    q = 0.5
    node_kernel = TensorProduct(type=KroneckerDelta(1.0))
    edge_kernel = Constant(1.0)
    mlgk = MarginalizedGraphKernel(node_kernel, edge_kernel, q=q)

    dot = mlgk([dfg])
    gold = MLGK(dfg, node_kernel, edge_kernel, q, q)

    assert dot.shape == (1, 1)
    assert dot.item() == pytest.approx(gold)
def test_mlgk_dtype():
    """The kernel's outputs must carry the dtype it was constructed with.

    A random 8-node graph is evaluated with the kernel configured for the
    builtin ``float``, ``np.float32`` and ``np.float64``; both the full
    matrix and the diagonal must report that dtype.
    """
    g = nx.Graph()
    n = 8
    # Unseeded random graph: node i links to earlier nodes whose flip is 1.
    for i, row in enumerate(np.random.randint(0, 2, (n, n))):
        g.add_node(i, type=0)
        for j, pred in enumerate(row[:i]):
            if pred:
                g.add_edge(i, j, weight=1)

    dfg = Graph.from_networkx(g, weight='weight')

    q = 0.5
    node_kernel = TensorProduct(type=KroneckerDelta(1.0))
    edge_kernel = Constant(1.0)

    # The builtin `float` replaces the `np.float` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24; both denote the same
    # type, so the compared dtype is unchanged.
    for dtype in [float, np.float32, np.float64]:
        mlgk = MarginalizedGraphKernel(node_kernel,
                                       edge_kernel,
                                       q=q,
                                       dtype=dtype)
        assert mlgk([dfg]).dtype == dtype
        assert mlgk.diag([dfg]).dtype == dtype
def test_molecular_kernel():
    """Exercise the default Tang2019MolecularKernel on H2, O2 and CH4.

    Both the graph-level and the node-level similarity matrices must have
    the expected shape and a unit diagonal after normalization.
    """
    molecules = [molecule(formula) for formula in ('H2', 'O2', 'CH4')]
    graphs = [Graph.from_ase(mol) for mol in molecules]
    kernel = Tang2019MolecularKernel()

    # Graph-level similarity, normalized as D^-1/2 R D^-1/2.
    R = kernel(graphs)
    scale = np.diag(np.diag(R)**-0.5)
    K = scale.dot(R).dot(scale)
    assert R.shape == (3, 3)
    for self_similarity in np.diag(K):
        assert self_similarity == pytest.approx(1, 1e-6)

    # Node-level similarity: one row/column per atom over all molecules.
    R_nodal = kernel(graphs, nodal=True)
    scale_nodal = np.diag(np.diag(R_nodal)**-0.5)
    K_nodal = scale_nodal.dot(R_nodal).dot(scale_nodal)
    natoms = np.sum([len(mol) for mol in molecules])
    assert R_nodal.shape == (natoms, natoms)
    for self_similarity in np.diag(K_nodal):
        assert self_similarity == pytest.approx(1, 1e-6)
vario_graph1.add_node('O1', rings=(5, 6)) vario_graph1.add_node('H1', rings=(3, )) vario_graph1.add_node('H2', rings=(2, 3, 4)) vario_graph1.add_edge('O1', 'H1', spectrum=(3, 4), w=1.0) vario_graph1.add_edge('O1', 'H2', spectrum=(3, 5), w=2.0) vario_graph2 = nx.Graph(title='H2') vario_graph2.add_node('H1', rings=(3, 4)) vario_graph2.add_node('H2', rings=(3, )) vario_graph2.add_edge('H1', 'H2', spectrum=(2, 4), w=3.0) case_dict = { 'unlabeled': { 'graphs': Graph.unify_datatype([ Graph.from_networkx(unlabeled_graph1), Graph.from_networkx(unlabeled_graph2) ]), 'knode': Constant(1.0), 'kedge': Constant(1.0), 'q': [0.01, 0.05, 0.1, 0.5] }, 'labeled': { 'graphs': Graph.unify_datatype([ Graph.from_networkx(labeled_graph1), Graph.from_networkx(labeled_graph2) ]), 'knode': TensorProduct(hybridization=KroneckerDelta(0.3),
g2.add_node(2) g2.add_edge(0, 1) g2.add_edge(1, 2) # 0 --- 1 # \ / # 2 g3 = nx.Graph() g3.add_node(0) g3.add_node(1) g3.add_node(2) g3.add_edge(0, 1) g3.add_edge(0, 2) g3.add_edge(1, 2) # define trivial node and edge kernelets knode = Constant(1.0) kedge = Constant(1.0) # compose the marginalized graph kernel and compute pairwise similarity mlgk = MarginalizedGraphKernel(knode, kedge, q=0.05) R = mlgk([Graph.from_networkx(g) for g in [g1, g2, g3]]) # normalize the similarity matrix d = np.diag(R)**-0.5 K = np.diag(d).dot(R).dot(np.diag(d)) # all entries should be approximately 1 plus round-off error print(K)
import numpy as np
import pandas as pd
from ase.build import molecule, bulk
from graphdot import Graph
from graphdot.kernel.molecular import Tang2019MolecularKernel

# Example: pairwise similarity of three small molecules and two NaCl
# rock-salt crystals that differ only in lattice constant.

# build sample molecules
small_title = ['H2O', 'HCl', 'NaCl']
bulk_title = ['NaCl-bulk', 'NaCl-bulk2']
# Kept under its own name so the imported `bulk` builder is not rebound
# to a list and stays callable after this statement.
bulk_crystals = [
    bulk('NaCl', 'rocksalt', a=5.64),
    bulk('NaCl', 'rocksalt', a=5.66),
]
molecules = [molecule(name) for name in small_title] + bulk_crystals

# convert to molecular graphs
graphs = [Graph.from_ase(m) for m in molecules]

# use pre-defined molecular kernel
kernel = Tang2019MolecularKernel(edge_length_scale=0.1)

R = kernel(graphs)

# normalize the similarity matrix
d = np.diag(R)**-0.5
K = np.diag(d).dot(R).dot(np.diag(d))

# note the difference between the NaCl variants
title = small_title + bulk_title
print(pd.DataFrame(K, columns=title, index=title))
# Two small graphs whose node 'category' and edge 'spectra' attributes are
# variable-length collections, compared element-wise via convolution.
g1 = nx.Graph()
g1.add_nodes_from([
    (0, {'category': (1, 2), 'symbol': 1}),
    (1, {'category': (2, ), 'symbol': 2}),
])
g1.add_edges_from([
    (0, 1, {'w': 1.0, 'spectra': [0.5, 0.2]}),
])

g2 = nx.Graph()
g2.add_nodes_from([
    (0, {'category': (1, 3), 'symbol': 1}),
    (1, {'category': (2, 3, 5), 'symbol': 2}),
    (2, {'category': (1, ), 'symbol': 1}),
])
g2.add_edges_from([
    (0, 1, {'w': 2.0, 'spectra': [0.1, 0.9, 1.5]}),
    (0, 2, {'w': 0.5, 'spectra': [0.4]}),
    (1, 2, {'w': 0.5, 'spectra': [0.3, 0.6]}),
])

# Node and edge base kernels follow the R-convolution framework.
# Reference: Haussler, David. Convolution kernels on discrete structures. 1999.
knode = TensorProduct(
    symbol=KroneckerDelta(0.5),
    category=Convolution(KroneckerDelta(0.5)),
)
kedge = TensorProduct(
    spectra=Convolution(SquareExponential(0.3)),
)

# Compose the marginalized graph kernel and compute pairwise similarity;
# the edge attribute 'w' is used as the edge weight.
mlgk = MarginalizedGraphKernel(knode, kedge, q=0.05)

R = mlgk([Graph.from_networkx(g, weight='w') for g in [g1, g2]])

# Normalize the similarity matrix so that its diagonal becomes 1.
d = np.diag(R)**-0.5
K = np.diag(d).dot(R).dot(np.diag(d))

print(K)
from graphdot.kernel.marginalized.starting_probability import Uniform from graphdot.microkernel import (Additive, Convolution as kConv, Constant as kC, KroneckerDelta as kDelta, SquareExponential as kSE) from graphdot.model.gaussian_process import LowRankApproximateGPR smiles = [ 'CC', 'CCC', 'CCCC', 'CCCCC', 'CCCCCC', 'CCCCCCC', 'CCCCCCCC', 'CCCCCCCCC', 'CCCCCCCCCC', 'CCCCCCCCCCC', 'CCCCCCCCCCCC' ] energy = [ -719.05, -1014.16, -1309.27, -1604.29, -1899.33, -2194.35, -2489.38, -2784.41, -3079.44, -3374.47, -3669.50 ] graphs = list(map(lambda smi: Graph.from_rdkit(MolFromSmiles(smi)), smiles)) train_X = graphs[::2] train_y = energy[::2] test_X = graphs[1::2] test_y = energy[1::2] core = train_X[::2] kernel = MarginalizedGraphKernel( node_kernel=Additive( aromatic=kC(0.5, (0.1, 1.0)) * kDelta(0.5, (0.1, 0.9)), atomic_number=kC(0.5, (0.1, 1.0)) * kDelta(0.8, (0.1, 0.9)), charge=kC(0.5, (0.1, 1.0)) * kSE(1.0), chiral=kC(0.5, (0.1, 1.0)) * kDelta(0.5, (0.1, 0.9)), hcount=kC(0.5, (0.1, 1.0)) * kSE(1.0), hybridization=kC(0.5, (0.1, 1.0)) * kDelta(0.5, (0.1, 0.9)), ring_list=kC(0.5, (0.01, 1.0)) * kConv(kDelta(0.5,
weighted_graph1 = nx.Graph(title='H2O') weighted_graph1.add_node('O1', hybridization=Hybrid.SP2, charge=1) weighted_graph1.add_node('H1', hybridization=Hybrid.SP3, charge=-1) weighted_graph1.add_node('H2', hybridization=Hybrid.SP, charge=2) weighted_graph1.add_edge('O1', 'H1', order=1, length=0.5, w=1.0) weighted_graph1.add_edge('O1', 'H2', order=2, length=1.0, w=2.0) weighted_graph2 = nx.Graph(title='H2') weighted_graph2.add_node('H1', hybridization=Hybrid.SP, charge=1) weighted_graph2.add_node('H2', hybridization=Hybrid.SP, charge=1) weighted_graph2.add_edge('H1', 'H2', order=2, length=1.0, w=3.0) case_dict = { 'unlabeled': { 'graphs': [ Graph.from_networkx(unlabeled_graph1), Graph.from_networkx(unlabeled_graph2) ], 'knode': Constant(1.0), 'kedge': Constant(1.0), 'q': [0.01, 0.05, 0.1, 0.5] }, 'labeled': { 'graphs': [ Graph.from_networkx(labeled_graph1), Graph.from_networkx(labeled_graph2) ], 'knode': TensorProduct(hybridization=KroneckerDelta(0.3, 1.0),
from graphdot.kernel.marginalized.basekernel import KroneckerDelta # build sample molecules smiles_list = [ 'CC', # ethane 'CCO', # acetic acid 'CCN', # ethylamine 'C=C', # ethene 'CC=C', # propene 'CC=CC', # 2-n-butene ] # convert to molecular graphs # nodes(atoms) has 'aromatic', 'charge', 'element', 'hcount' attributes # edges(bonds) has the 'order' attribute graphs = [Graph.from_smiles(smi) for smi in smiles_list] # define node and edge kernelets knode = TensorProduct(aromatic=KroneckerDelta(0.8, 1.0), charge=SquareExponential(1.0), element=KroneckerDelta(0.5, 1.0), hcount=SquareExponential(1.0)) kedge = TensorProduct(order=KroneckerDelta(0.5, 1.0)) # compose the marginalized graph kernel and compute pairwise similarity kernel = MarginalizedGraphKernel(knode, kedge, q=0.05) R = kernel(graphs) # normalize the similarity matrix and then print
#!/usr/bin/env python # -*- coding: utf-8 -*- import numpy as np import pytest from ase.build import molecule from graphdot import Graph from graphdot.metric.maximin import MaxiMin from graphdot.microkernel import ( KroneckerDelta, SquareExponential, TensorProduct, ) from graphdot.kernel.marginalized.starting_probability import Uniform G = [Graph.from_ase(molecule(f)) for f in ['CH3SCH3', 'CH3OCH3']] H = [Graph.from_ase(molecule(f)) for f in ['CH4', 'NH3', 'H2O']] def test_maximin_basic(): metric = MaxiMin(node_kernel=TensorProduct(element=KroneckerDelta(0.5)), edge_kernel=TensorProduct(length=SquareExponential(0.1)), q=0.01) distance = metric(G) assert distance.shape == (len(G), len(G)) assert np.allclose(distance.diagonal(), 0, atol=1e-3) assert np.all(distance >= 0) assert np.allclose(distance, distance.T, rtol=1e-14, atol=1e-14) distance = metric(G, G) assert distance.shape == (len(G), len(G)) assert np.allclose(distance.diagonal(), 0, atol=1e-3)