def test_coulomb_matrix(self):
    """
    Check the output shape of CoulombMatrix for each molecule fixture.

    The featurizer is built with ``self.num_atoms`` and applied to the
    ``mol_with_no_conf``, ``mol_with_one_conf`` and ``mol_with_multi_conf``
    fixtures, one single-molecule batch at a time.
    """
    featurizer = CoulombMatrix(self.num_atoms)
    square = (self.num_atoms, self.num_atoms)

    # (fixture molecule, expected shape of the featurized batch)
    expectations = (
        (self.mol_with_no_conf, (1,) + square),
        (self.mol_with_one_conf, (1,) + square),
        (self.mol_with_multi_conf, (1, self.num_confs) + square),
    )
    for mol, expected_shape in expectations:
        features = featurizer([mol])
        assert features.shape == expected_shape
def test_upper_tri_coulomb_matrix(self):
    """
    Check the output shape of CoulombMatrix with ``upper_tri=True``.

    The expected feature length is the number of entries in the upper
    triangle (diagonal included) of a ``num_atoms`` x ``num_atoms`` matrix.
    """
    featurizer = CoulombMatrix(self.num_atoms, upper_tri=True)
    row_indices = np.triu_indices(self.num_atoms)[0]
    n_features = row_indices.size

    # (fixture molecule, expected shape of the featurized batch)
    expectations = (
        (self.mol_with_no_conf, (1, n_features)),
        (self.mol_with_one_conf, (1, n_features)),
        (self.mol_with_multi_conf, (1, self.num_confs, n_features)),
    )
    for mol, expected_shape in expectations:
        features = featurizer([mol])
        assert features.shape == expected_shape
def test_coulomb_matrix_padding(self):
    """
    Check the output shape of CoulombMatrix when padding is required.

    ``max_atoms`` is set to twice ``self.num_atoms``, and every output
    matrix is expected to be ``max_atoms`` x ``max_atoms``.
    """
    max_atoms = self.num_atoms * 2
    featurizer = CoulombMatrix(max_atoms=max_atoms)
    padded = (max_atoms, max_atoms)

    # (fixture molecule, expected shape of the featurized batch)
    expectations = (
        (self.mol_with_no_conf, (1,) + padded),
        (self.mol_with_one_conf, (1,) + padded),
        (self.mol_with_multi_conf, (1, self.num_confs) + padded),
    )
    for mol, expected_shape in expectations:
        features = featurizer([mol])
        assert features.shape == expected_shape
def test_coulomb_matrix_hydrogens(self):
    """
    Check the output shape of CoulombMatrix with ``remove_hydrogens=False``.

    The featurizer uses ``upper_tri=True``, so the expected feature length
    is the upper-triangle entry count for ``self.num_atoms`` atoms.
    """
    featurizer = CoulombMatrix(
        max_atoms=self.num_atoms,
        remove_hydrogens=False,
        upper_tri=True,
    )
    row_indices = np.triu_indices(self.num_atoms)[0]
    n_features = row_indices.size

    # (fixture molecule, expected shape of the featurized batch)
    expectations = (
        (self.mol_with_no_conf, (1, n_features)),
        (self.mol_with_one_conf, (1, n_features)),
        (self.mol_with_multi_conf, (1, self.num_confs, n_features)),
    )
    for mol, expected_shape in expectations:
        features = featurizer([mol])
        assert features.shape == expected_shape
def test_coulomb_matrix_no_hydrogens(self):
    """
    Check the output shape of CoulombMatrix with ``remove_hydrogens=True``.

    ``max_atoms`` is taken from ``self.mol_with_no_conf.GetNumAtoms()``,
    which the test first asserts is smaller than ``self.num_atoms``.
    """
    heavy_atom_count = self.mol_with_no_conf.GetNumAtoms()
    # Precondition: the reference fixture must have fewer atoms than the
    # count the other fixtures are sized against.
    assert heavy_atom_count < self.num_atoms

    featurizer = CoulombMatrix(
        max_atoms=heavy_atom_count,
        remove_hydrogens=True,
        upper_tri=True,
    )
    row_indices = np.triu_indices(heavy_atom_count)[0]
    n_features = row_indices.size

    # (fixture molecule, expected shape of the featurized batch)
    expectations = (
        (self.mol_with_no_conf, (1, n_features)),
        (self.mol_with_one_conf, (1, n_features)),
        (self.mol_with_multi_conf, (1, self.num_confs, n_features)),
    )
    for mol, expected_shape in expectations:
        features = featurizer([mol])
        assert features.shape == expected_shape
def generate_coulomb_matrices(self):
    """
    Compute Coulomb-matrix features for every row of ``self.data``.

    For each value in the ``"SMILE"`` column a molecule is parsed, explicit
    hydrogens are added, one conformer is embedded and UFF-optimized, and
    the molecule is passed to a ``CoulombMatrix`` and a ``CoulombMatrixEig``
    featurizer (both built with ``self.max_atoms``).

    The results are inner-joined back onto ``self.data`` on ``"SMILE"`` as
    the columns ``"COULOMB"``, ``"COULOMB_ARRAY"`` and ``"COULOMB_EIGEN"``;
    rows are then de-duplicated on ``"SMILE"`` (first occurrence kept).
    Every ``"ID"`` that is no longer present after the join is logged as a
    warning.

    Molecules whose SMILES cannot be parsed, or for which no conformer can
    be embedded, are logged and left out of the result.

    Exits the process with status 1 when ``__deepchem_imported__`` is falsy.

    NOTE(review): hydrogens are now actually added before embedding (the
    return value of ``Chem.AddHs`` used to be discarded), so each molecule
    carries more atoms than before -- confirm ``self.max_atoms`` is sized
    for hydrogen-inclusive atom counts.
    """
    if not __deepchem_imported__:
        self.log.error(
            "deepchem was not imported or available in this execution environment"
        )
        # Report failure to the caller; a bare sys.exit() would exit with 0.
        sys.exit(1)

    self.log.info("Generating Coulomb Matrices, may take a few moments ...")

    matrix_featurizer = CoulombMatrix(self.max_atoms, randomize=False, n_samples=1)
    eigen_featurizer = CoulombMatrixEig(self.max_atoms)

    matrices = []
    smiles = []
    arrays = []
    eigenarrays = []
    num_confs = 1

    for smile in self.data["SMILE"]:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals an unparsable SMILES by returning None.
            self.log.warning(
                "Coulomb Matrix - unable to parse smile: %s", smile)
            continue

        # AddHs returns a new molecule rather than modifying its argument.
        mol = Chem.AddHs(mol)

        conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs)
        if len(conf_ids) != num_confs:
            # Retry once, tolerating smoothing failures.
            conf_ids = AllChem.EmbedMultipleConfs(
                mol, numConfs=num_confs, ignoreSmoothingFailures=True)
        if len(conf_ids) != num_confs:
            self.log.warning(
                "Coulomb Matrix - unable to generate %d conformer(s) for smile: %s",
                num_confs, smile)
            continue

        for conf_id in conf_ids:
            AllChem.UFFOptimizeMolecule(mol, confId=conf_id)

        matrix = matrix_featurizer.coulomb_matrix(mol)
        eigenvalues = eigen_featurizer.featurize([mol])

        matrices.append(matrix)
        arrays.append(matrix[0].flatten())
        smiles.append(smile)
        eigenarrays.append(eigenvalues[0].flatten())

    coulomb_frame = pd.DataFrame({
        "SMILE": smiles,
        "COULOMB": matrices,
        "COULOMB_ARRAY": arrays,
        "COULOMB_EIGEN": eigenarrays,
    })

    before_ids = set(self.data["ID"])
    self.data = pd.merge(self.data, coulomb_frame, how="inner", on=["SMILE"])
    self.data = self.data.drop_duplicates(subset=["SMILE"], keep="first")
    after_ids = set(self.data["ID"])

    # Report every ID that did not survive featurization and the join.
    for missing in before_ids - after_ids:
        self.log.warning(
            "Dropped molecule ID: %s after join with Coulomb Matrix data",
            missing)