def test_single_carbon(self):
    """Check the ConvMol produced for a lone carbon atom (SMILES 'C')."""
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smiles) for smiles in ['C']]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # A single carbon means exactly one atom.
    assert conv_mol.get_num_atoms() == 1

    # With no bonds, the degree-0 entry is expected to have one row and no
    # neighbour columns, and the entries for degrees 1..6 to have no rows.
    adjacency_by_degree = conv_mol.get_deg_adjacency_lists()
    expected = [np.zeros([1, 0], dtype=np.int32)]
    expected += [
        np.zeros([0, degree], dtype=np.int32) for degree in range(1, 7)
    ]
    for degree, wanted in enumerate(expected):
        assert np.array_equal(adjacency_by_degree[degree], wanted)
  def test_alkane(self):
    """Check the ConvMol produced for a three-carbon alkane (SMILES 'CCC')."""
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smiles) for smiles in ['CCC']]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # Three carbons in the alkane.
    assert conv_mol.get_num_atoms() == 3

    adjacency_by_degree = conv_mol.get_deg_adjacency_lists()
    # Expected adjacency array for each degree, indexed by degree 0..6.
    expected = [
        np.zeros([0, 0], dtype=np.int32),
        # The two outer carbons are each connected to the central carbon.
        np.array([[2], [2]], dtype=np.int32),
        # The central carbon is connected to the two outer carbons.
        np.array([[0, 1]], dtype=np.int32),
        np.zeros([0, 3], dtype=np.int32),
        np.zeros([0, 4], dtype=np.int32),
        np.zeros([0, 5], dtype=np.int32),
        np.zeros([0, 6], dtype=np.int32),
    ]
    for degree, wanted in enumerate(expected):
      assert np.array_equal(adjacency_by_degree[degree], wanted)
Exemple #3
0
    def test_graph_gather(self):
        """Test that GraphGather can be invoked.

        Featurizes two molecules, agglomerates them into one ConvMol,
        feeds the resulting tensors through a GraphGather layer and
        checks the shape of the evaluated output.
        """
        batch_size = 2
        n_features = 75
        # 'CCC' and 'C' together contain 4 atoms.
        raw_smiles = ['CCC', 'C']
        mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
        featurizer = ConvMolFeaturizer()
        mols = featurizer.featurize(mols)
        multi_mol = ConvMol.agglomerate_mols(mols)
        atom_features = multi_mol.get_atom_features()
        degree_slice = multi_mol.deg_slice
        membership = multi_mol.membership
        # The degree-0 entry is skipped; only degrees >= 1 are passed on.
        deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

        # NOTE(review): newer TensorFlow releases may deprecate
        # test_session() in favour of session()/cached_session() --
        # confirm the supported TF version before switching.
        with self.test_session() as sess:
            atom_features = tf.convert_to_tensor(atom_features,
                                                 dtype=tf.float32)
            degree_slice = tf.convert_to_tensor(degree_slice, dtype=tf.int32)
            membership = tf.convert_to_tensor(membership, dtype=tf.int32)
            deg_adjs_tf = [
                tf.convert_to_tensor(deg_adj, dtype=tf.int32)
                for deg_adj in deg_adjs
            ]
            args = [atom_features, degree_slice, membership] + deg_adjs_tf
            out_tensor = GraphGather(batch_size)(*args)
            sess.run(tf.global_variables_initializer())
            out_tensor = out_tensor.eval()
            # TODO(rbharath): Why is it 2*n_features instead of n_features?
            assert out_tensor.shape == (batch_size, 2 * n_features)
  def test_carbon_nitrogen(self):
    """Check the ConvMol produced for a carbon/nitrogen molecule."""
    # In 'C[N+](C)(C)C' the central atom is the charged nitrogen (degree 4);
    # the four carbons each have degree 1 and bond only to that nitrogen.
    rdkit_mols = [
        rdkit.Chem.MolFromSmiles(smiles) for smiles in ['C[N+](C)(C)C']
    ]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # Five atoms in the compound.
    assert conv_mol.get_num_atoms() == 5

    # Adjacency lists grouped by degree, indexed by degree 0..6.
    adjacency_by_degree = conv_mol.get_deg_adjacency_lists()
    expected = [
        np.zeros([0, 0], dtype=np.int32),
        # Each of the four outer atoms is connected to the central atom.
        np.array([[4], [4], [4], [4]], dtype=np.int32),
        np.zeros([0, 2], dtype=np.int32),
        np.zeros([0, 3], dtype=np.int32),
        # The central atom is connected to all four outer atoms.
        np.array([[0, 1, 2, 3]], dtype=np.int32),
        np.zeros([0, 5], dtype=np.int32),
        np.zeros([0, 6], dtype=np.int32),
    ]
    for degree, wanted in enumerate(expected):
      assert np.array_equal(adjacency_by_degree[degree], wanted)
  def test_carbon_nitrogen(self):
    """Check degree-grouped adjacency for a molecule with one nitrogen."""
    # 'C[N+](C)(C)C': a central charged nitrogen of degree 4 bonded to four
    # carbons, each of degree 1.
    smiles_strings = ['C[N+](C)(C)C']
    parsed = [rdkit.Chem.MolFromSmiles(text) for text in smiles_strings]
    conv_mol = ConvMolFeaturizer().featurize(parsed)[0]

    # Five atoms in the compound.
    assert conv_mol.get_num_atoms() == 5

    by_degree = conv_mol.get_deg_adjacency_lists()

    def empty(n_cols):
      # Adjacency array for a degree that no atom has.
      return np.zeros([0, n_cols], dtype=np.int32)

    expected = [
        empty(0),
        # The four outer atoms all point at the central atom.
        np.array([[4], [4], [4], [4]], dtype=np.int32),
        empty(2),
        empty(3),
        # The central atom points at every outer atom.
        np.array([[0, 1, 2, 3]], dtype=np.int32),
        empty(5),
        empty(6),
    ]
    for degree, wanted in enumerate(expected):
      assert np.array_equal(by_degree[degree], wanted)
Exemple #6
0
    def test_alkane(self):
        """Check the ConvMol produced for a simple alkane (SMILES 'CCC')."""
        import rdkit.Chem
        smiles_strings = ['CCC']
        parsed = [rdkit.Chem.MolFromSmiles(text) for text in smiles_strings]
        conv_mol = ConvMolFeaturizer().featurize(parsed)[0]

        # Three carbons in the alkane.
        assert conv_mol.get_num_atoms() == 3

        by_degree = conv_mol.get_deg_adjacency_lists()
        # Expected adjacency array per degree, indexed by degree 0..6.
        expected = [
            np.zeros([0, 0], dtype=np.int32),
            # The two outer carbons are connected to the central carbon.
            np.array([[2], [2]], dtype=np.int32),
            # The central carbon is connected to the outer two.
            np.array([[0, 1]], dtype=np.int32),
        ]
        expected.extend(
            np.zeros([0, degree], dtype=np.int32) for degree in range(3, 7))
        for degree, wanted in enumerate(expected):
            assert np.array_equal(by_degree[degree], wanted)
Exemple #7
0
    def test_single_carbon(self):
        """Test that single carbon atom is featurized properly."""
        raw_smiles = ['C']
        # Import the submodule explicitly: a bare `import rdkit` does not
        # by itself guarantee that the `rdkit.Chem` attribute is available.
        import rdkit.Chem
        mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
        featurizer = ConvMolFeaturizer()
        mol_list = featurizer.featurize(mols)
        mol = mol_list[0]

        # Only one carbon
        assert mol.get_num_atoms() == 1

        # No bonds: the degree-0 entry is expected to have one row with no
        # neighbour columns, and the entries for degrees 1..6 to be empty.
        deg_adj_lists = mol.get_deg_adjacency_lists()
        assert np.array_equal(deg_adj_lists[0], np.zeros([1, 0],
                                                         dtype=np.int32))
        for degree in range(1, 7):
            assert np.array_equal(deg_adj_lists[degree],
                                  np.zeros([0, degree], dtype=np.int32))
Exemple #8
0
  def test_graph_gather(self):
    """Test that GraphGather can be invoked.

    Featurizes two molecules, agglomerates them into one ConvMol, feeds the
    resulting tensors through a GraphGather layer and checks the shape of
    the evaluated output.
    """
    batch_size = 2
    n_features = 75
    # 'CCC' and 'C' together contain 4 atoms.
    raw_smiles = ['CCC', 'C']
    mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
    featurizer = ConvMolFeaturizer()
    mols = featurizer.featurize(mols)
    multi_mol = ConvMol.agglomerate_mols(mols)
    atom_features = multi_mol.get_atom_features()
    degree_slice = multi_mol.deg_slice
    membership = multi_mol.membership
    # The degree-0 entry is skipped; only degrees >= 1 are passed on.
    deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
      atom_features = tf.convert_to_tensor(atom_features, dtype=tf.float32)
      degree_slice = tf.convert_to_tensor(degree_slice, dtype=tf.int32)
      membership = tf.convert_to_tensor(membership, dtype=tf.int32)
      deg_adjs_tf = [
          tf.convert_to_tensor(deg_adj, dtype=tf.int32) for deg_adj in deg_adjs
      ]
      args = [atom_features, degree_slice, membership] + deg_adjs_tf
      out_tensor = GraphGather(batch_size)(*args)
      sess.run(tf.global_variables_initializer())
      out_tensor = out_tensor.eval()
      # TODO(rbharath): Why is it 2*n_features instead of n_features?
      assert out_tensor.shape == (batch_size, 2 * n_features)
 def test_per_atom_fragmentation(self):
   """Check the output size when per_atom_fragmentation=True.

   For every input molecule, the featurizer output is expected to contain
   as many entries as the molecule has heavy atoms.
   """
   import rdkit.Chem
   smiles_strings = ['CC(CO)Cc1ccccc1', 'CC']
   parsed = [rdkit.Chem.MolFromSmiles(text) for text in smiles_strings]
   fragment_featurizer = ConvMolFeaturizer(per_atom_fragmentation=True)
   per_mol_features = fragment_featurizer.featurize(parsed)
   for fragments, rdkit_mol in zip(per_mol_features, parsed):
     assert len(fragments) == rdkit_mol.GetNumHeavyAtoms()
Exemple #10
0
  def predict_on_smiles(self, smiles, transformers=None, untransform=False):
    """Generates predictions on a numpy array of smile strings

    The input is split into consecutive chunks of at most `self.batch_size`
    entries; each chunk is passed to `self.predict_on_smiles_batch` and the
    per-chunk results are concatenated along axis 0.

    # Arguments:
      smiles: sized, sliceable sequence of SMILES strings.
      transformers: transformers forwarded to `predict_on_smiles_batch` and,
        when `untransform` is set, to `undo_transforms`. Defaults to an
        empty list.
      untransform: if True, the result is passed through `undo_transforms`.

    # Returns:
      y_: numpy ndarray reshaped to (-1, n_tasks), where n_tasks is
        `len(self.outputs)`. For empty input an array of shape (0, n_tasks)
        is returned.
    """
    # Avoid a shared mutable default argument.
    transformers = [] if transformers is None else transformers
    n_samples = len(smiles)
    n_tasks = len(self.outputs)

    # With no input there are no batches, and np.concatenate would raise on
    # an empty list, so return an empty prediction array directly.
    if n_samples == 0:
      return np.zeros((0, n_tasks))

    featurizer = ConvMolFeaturizer()
    y_ = [
        self.predict_on_smiles_batch(smiles[start:start + self.batch_size],
                                     featurizer, transformers)
        for start in range(0, n_samples, self.batch_size)
    ]
    # Trim to n_samples rows before reshaping, in case a batch returned
    # more rows than it was given.
    y_ = np.concatenate(y_, axis=0)[:n_samples]
    y_ = y_.reshape(-1, n_tasks)

    if untransform:
      y_ = undo_transforms(y_, transformers)

    return y_
Exemple #11
0
"""

import warnings
warnings.filterwarnings('ignore')

import deepchem as dc
#from deepchem.models.tensorgraph.models.graph_models import MPNNTensorGraph
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel
#from deepchem.feat import WeaveFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.graph_features import WeaveFeaturizer
from deepchem.data.data_loader import CSVLoader

import pandas as pd
import numpy as np

# One featurizer instance is shared by both loaders below.
featurizer = ConvMolFeaturizer()
# Alternative featurizer, currently disabled:
#featurizer = WeaveFeaturizer(graph_distance=True, explicit_H=False)
# Both loaders are configured identically: a single task 'LogD7.4' and
# smiles_field='smiles' (presumably CSV column names -- verify against the
# demo data files).
train_loader = CSVLoader(tasks=['LogD7.4'],
                         smiles_field='smiles',
                         featurizer=featurizer)
test_loader = CSVLoader(tasks=['LogD7.4'],
                        smiles_field='smiles',
                        featurizer=featurizer)

# Featurize the training and testing CSV files; the paths are relative, so
# the script presumably has to be run from its own directory.
X_train = train_loader.featurize('../demo_data/reg/training_set.csv')
X_test = test_loader.featurize('../demo_data/reg/testing_set.csv')

# Build a single-task regression graph-convolution model, fit it on the
# training data and print its predictions for the test data.
model = GraphConvModel(n_tasks=1, mode='regression')
model.fit(X_train)
print(model.predict(X_test))