def coulomb_matrix(self, mol):
    """
    Generate Coulomb matrices for each conformer of the given molecule.

    For atomic numbers ``Z`` and interatomic distances ``d`` the entries are
    ``0.5 * Z_i ** 2.4`` on the diagonal and ``Z_i * Z_j / d_ij`` elsewhere.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray
        One entry per conformer (or one per randomized sample of every
        conformer when ``self.randomize`` is set), each passed through
        ``pad_array`` with ``self.max_atoms``.
    """
    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)
    n_atoms = mol.GetNumAtoms()
    z = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    rval = []
    for conf in mol.GetConformers():
        d = self.get_interatomic_distances(conf)
        m = np.zeros((n_atoms, n_atoms))
        # `range` replaces the Python 2-only `xrange`, which raises NameError
        # on Python 3.
        for i in range(n_atoms):
            m[i, i] = 0.5 * z[i] ** 2.4
            # Only the upper triangle is computed; the value is mirrored so
            # the matrix is symmetric by construction.
            for j in range(i + 1, n_atoms):
                m[i, j] = (z[i] * z[j]) / d[i, j]
                m[j, i] = m[i, j]
        if self.randomize:
            for random_m in self.randomize_coulomb_matrix(m):
                rval.append(pad_array(random_m, self.max_atoms))
        else:
            rval.append(pad_array(m, self.max_atoms))
    return np.asarray(rval)
def featurize_mol(self, coords, mol, max_num_atoms):
    """
    Build the padded coordinates, neighbor list and atomic numbers of a molecule.

    Parameters
    ----------
    coords : array-like
        Atomic coordinates handed to ``compute_neighbor_list`` and then padded
        to ``(max_num_atoms, 3)``.
    mol : RDKit Mol
        Molecule whose atomic numbers are read through ``self.get_Z_matrix``.
    max_num_atoms : int
        Size the coordinates and atomic numbers are padded to.

    Returns
    -------
    tuple
        ``(padded_coords, neighbor_list, atomic_numbers)``.
    """
    # The neighbor list is computed from the unpadded coordinates.
    nbr_list = compute_neighbor_list(
        coords, self.neighbor_cutoff, self.max_num_neighbors, None)
    atomic_numbers = pad_array(
        self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
    padded_coords = pad_array(coords, (max_num_atoms, 3))
    return padded_coords, nbr_list, atomic_numbers
def coulomb_matrix(self, mol):
    """
    Generate Coulomb matrices for each conformer of the given molecule.

    Diagonal entries are ``0.5 * Z_i ** 2.4``; off-diagonal entries are
    ``Z_i * Z_j / d_ij`` where ``d`` comes from
    ``self.get_interatomic_distances``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray
        Stack of matrices, each passed through ``pad_array`` with
        ``self.max_atoms``: one per conformer, or one per randomized sample
        of each conformer when ``self.randomize`` is set.
    """
    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)
    n_atoms = mol.GetNumAtoms()
    z = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    rval = []
    for conf in mol.GetConformers():
        d = self.get_interatomic_distances(conf)
        m = np.zeros((n_atoms, n_atoms))
        # Python 3 has no `xrange`; `range` is the lazy equivalent.
        for i in range(n_atoms):
            for j in range(i, n_atoms):
                if i == j:
                    m[i, j] = 0.5 * z[i]**2.4
                else:
                    # Fill both halves from the upper-triangle distance.
                    m[i, j] = (z[i] * z[j]) / d[i, j]
                    m[j, i] = m[i, j]
        if self.randomize:
            for random_m in self.randomize_coulomb_matrix(m):
                random_m = pad_array(random_m, self.max_atoms)
                rval.append(random_m)
        else:
            m = pad_array(m, self.max_atoms)
            rval.append(m)
    rval = np.asarray(rval)
    return rval
def coulomb_matrix(self, mol):
    """
    Generate Coulomb matrices for each conformer of the given molecule.

    Diagonal entries are ``0.5 * Z_i ** 2.4``; off-diagonal entries are
    ``Z_i * Z_j / d_ij`` where ``d`` comes from
    ``self.get_interatomic_distances``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray
        Stack of matrices, each passed through ``pad_array`` with
        ``self.max_atoms``: one per conformer, or one per randomized sample
        of each conformer when ``self.randomize`` is set.
    """
    from rdkit import Chem
    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)
    n_atoms = mol.GetNumAtoms()
    z = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
    # Conformer-independent terms are computed once instead of per conformer.
    charge_products = np.outer(z, z)
    self_terms = 0.5 * z**2.4
    diag = np.arange(n_atoms)
    rval = []
    for conf in mol.GetConformers():
        # Work on a float copy so the caller's distance matrix is untouched.
        d = np.array(self.get_interatomic_distances(conf), dtype=float)
        # The self-distance on the diagonal is presumably zero; dividing by
        # it would emit a spurious divide-by-zero RuntimeWarning. The
        # diagonal of `m` is overwritten below, so any non-zero placeholder
        # is safe here. Genuinely coincident atoms (off-diagonal zeros) still
        # warn as before.
        d[diag, diag] = 1.0
        m = charge_products / d
        m[diag, diag] = self_terms
        if self.randomize:
            for random_m in self.randomize_coulomb_matrix(m):
                rval.append(pad_array(random_m, self.max_atoms))
        else:
            rval.append(pad_array(m, self.max_atoms))
    return np.asarray(rval)
def featurize_mol(self, coords, mol, max_num_atoms):
    """
    Build the padded coordinates, neighbor list and atomic numbers of a molecule.

    Logs the atom count of ``mol`` before featurizing.

    Parameters
    ----------
    coords : array-like
        Atomic coordinates handed to ``compute_neighbor_list`` and then padded
        to ``(max_num_atoms, 3)``.
    mol : RDKit Mol
        Molecule whose atomic numbers are read through ``self.get_Z_matrix``.
    max_num_atoms : int
        Size the coordinates and atomic numbers are padded to.

    Returns
    -------
    tuple
        ``(padded_coords, neighbor_list, atomic_numbers)``.
    """
    n_atoms = len(mol.GetAtoms())
    logging.info("Featurizing molecule of size: %d", n_atoms)
    # The neighbor list is computed from the unpadded coordinates.
    nbr_list = compute_neighbor_list(
        coords, self.neighbor_cutoff, self.max_num_neighbors, None)
    atomic_numbers = pad_array(
        self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
    padded_coords = pad_array(coords, (max_num_atoms, 3))
    return padded_coords, nbr_list, atomic_numbers
def unpad_randomize_and_flatten(self, cm):
    """
    Produce one noise-sorted, flattened sample of a padded Coulomb matrix.

    1. Strip the zero padding: the atom count is the number of leading
       non-zero entries in the first row (capped at ``len(cm)``).
    2. Permute rows and columns by the order of the row norms after adding
       normal noise drawn from ``np.random.RandomState(self.seed)``.
    3. Pad back to ``len(cm)`` and keep only the upper-triangular entries.

    Parameters
    ----------
    cm : np.ndarray
        Square, zero-padded Coulomb matrix.

    Returns
    -------
    np.ndarray
        1D array of the upper-triangular entries of the permuted matrix.
    """
    padded_size = len(cm)
    first_row = cm[0]

    # Count leading non-zero entries of the first row.
    n_real = 0
    limit = min(len(first_row), padded_size)
    while n_real < limit and first_row[n_real] != 0.:
        n_real += 1

    unpadded = cm[0:n_real, 0:n_real]
    norms = np.asarray([np.linalg.norm(r) for r in unpadded], dtype=float)

    # A fresh generator is built from self.seed on every call.
    noise = np.random.RandomState(self.seed).normal(size=norms.size)
    order = np.argsort(norms + e_placeholder(noise))

    shuffled_rows = unpadded[order]
    shuffled = shuffled_rows[:, order]
    repadded = pad_array(shuffled, padded_size)
    return repadded[np.triu_indices_from(repadded)]


def e_placeholder(noise):
    """Return the noise vector unchanged (kept separate for readability)."""
    return noise
def unpad_randomize_and_flatten(self, cm):
    """
    Produce one noise-sorted, flattened sample of a padded Coulomb matrix.

    1. Strip the zero padding: the atom count is the number of leading
       non-zero entries in the first row (capped at ``len(cm)``).
    2. Permute rows and columns by the order of the row norms after adding
       normal noise drawn from ``np.random.RandomState(self.seed)``.
    3. Pad back to ``len(cm)`` and keep only the upper-triangular entries.

    Parameters
    ----------
    cm : np.ndarray
        Square, zero-padded Coulomb matrix.

    Returns
    -------
    np.ndarray
        1D array of the upper-triangular entries of the permuted matrix.
    """
    size = len(cm)

    # Walk the first row until the first zero (or until `size` entries).
    n_atoms = 0
    for entry in cm[0]:
        if n_atoms == size or not entry != 0.:
            break
        n_atoms += 1

    core = cm[0:n_atoms, 0:n_atoms]
    norms = np.asarray([np.linalg.norm(line) for line in core], dtype=float)

    # A fresh generator is built from self.seed on every call.
    generator = np.random.RandomState(self.seed)
    jitter = generator.normal(size=norms.size)
    perm = np.argsort(norms + jitter)

    permuted = core[perm][:, perm]
    permuted = pad_array(permuted, size)
    upper = np.triu_indices_from(permuted)
    return permuted[upper]
def get_Z_matrix(self, mol, max_atoms):
    """
    Return the atomic numbers of ``mol`` padded to ``max_atoms``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule whose atoms are read with ``GetAtoms``.
    max_atoms : int
        Maximum number of atoms allowed; also the padding size.

    Returns
    -------
    Result of ``pad_array`` applied to the 1D array of atomic numbers.

    Raises
    ------
    ValueError
        If the molecule has more than ``max_atoms`` atoms.
    """
    too_large = len(mol.GetAtoms()) > max_atoms
    if too_large:
        raise ValueError(
            f"A molecule (#atoms = {len(mol.GetAtoms())}) is larger than permitted by max_atoms. "
            "Increase max_atoms and try again.")
    atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
    return pad_array(atomic_numbers, max_atoms)
def _featurize(self, struct):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
        Json-serializable dictionary representation of
        pymatgen.core.structure
        https://pymatgen.org/pymatgen.core.structure.html

    Returns
    -------
    features: np.ndarray
        2D sine Coulomb matrix with shape (max_atoms, max_atoms), or 1D
        matrix eigenvalues with shape (max_atoms,).
    """
    from pymatgen import Structure
    from matminer.featurizers.structure import SineCoulombMatrix as SCM

    s = Structure.from_dict(struct)

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(s)

    if self.flatten:
        eigs, _ = np.linalg.eig(sine_mat)
        # NOTE(review): matminer's featurize appears to return the matrix
        # wrapped in a list, which makes `eig` return shape (1, N) -- confirm.
        # Flattening makes the zero-padded copy below correct for both the
        # (N,) and the (1, N) case.
        eigs = np.ravel(eigs)
        zeros = np.zeros((self.max_atoms,))
        # Raises ValueError (broadcast failure) if there are more
        # eigenvalues than max_atoms.
        zeros[:eigs.size] = eigs
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
def _featurize(self, struct: "pymatgen.Structure"):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : pymatgen.Structure
        A periodic crystal composed of a lattice and a sequence of atomic
        sites with 3D coordinates and elements.

    Returns
    -------
    features: np.ndarray
        2D sine Coulomb matrix with shape (max_atoms, max_atoms), or 1D
        matrix eigenvalues with shape (max_atoms,).

    Raises
    ------
    ValueError
        If matminer is not installed.
    """
    try:
        from matminer.featurizers.structure import SineCoulombMatrix as SCM
    except ModuleNotFoundError:
        raise ValueError("This class requires matminer to be installed.")

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(struct)

    if self.flatten:
        eigs, _ = np.linalg.eig(sine_mat)
        # NOTE(review): matminer's featurize appears to return the matrix
        # wrapped in a list, which makes `eig` return shape (1, N) -- confirm.
        # Flatten so the eigenvalues can be copied into a 1D buffer.
        eigs = np.ravel(eigs)
        # The buffer is 1D, as documented. The previous (1, max_atoms) buffer
        # was sliced along its single row and could only be assigned when
        # N == max_atoms.
        zeros = np.zeros(self.max_atoms)
        # Raises ValueError (broadcast failure) if there are more
        # eigenvalues than max_atoms.
        zeros[:eigs.size] = eigs
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
def _featurize(self, mol):
    """
    Calculate eigenvalues of Coulomb matrix for molecules.

    For every matrix returned by ``self.coulomb_matrix`` the eigenvalues are
    sorted by absolute value in descending order and passed through
    ``pad_array`` with ``self.max_atoms``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray
        One eigenvalue vector per Coulomb matrix.
    """
    spectra = []
    for matrix in self.coulomb_matrix(mol):
        eigenvalues, _ = np.linalg.eig(matrix)
        # Ascending argsort on |w|, reversed to get descending magnitude.
        descending = np.argsort(np.abs(eigenvalues))[::-1]
        spectra.append(pad_array(eigenvalues[descending], self.max_atoms))
    return np.asarray(spectra)
def _featurize(self, mol):
    """Compute neighbor list.

    Atoms are binned into cells; each atom is compared only against atoms in
    its neighboring cells and those closer than ``self.neighbor_cutoff`` are
    kept, sorted by distance and truncated to ``self.max_num_neighbors``.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.mol
        Molecule

    Returns
    -------
    tuple
        ``(coords, neighbor_list, Z)`` where ``coords`` is padded to
        ``(self.max_num_atoms, 3)``, ``neighbor_list`` maps each atom index
        to a list of neighbor atom indices ordered by increasing distance,
        and ``Z`` holds the atomic numbers padded to ``self.max_num_atoms``.
    """
    N = mol.GetNumAtoms()
    coords = get_coords(mol)

    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)

    # Associate each atom with cell it belongs to. O(N)
    cell_to_atoms, atom_to_cell = put_atoms_in_cells(
        coords, x_bins, y_bins, z_bins)

    # Associate each cell with its neighbor cells. Assumes periodic boundary
    # conditions, so does wraparound. O(constant)
    N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
    neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

    # For each atom, loop through all atoms in its cell and neighboring cells.
    # Accept as neighbors only those within threshold. This computation should
    # be O(Nm), where m is the number of atoms within a set of
    # neighboring-cells.
    periodic = self.boxsize is not None
    neighbor_list = {}
    for atom in range(N):
        # A set of (index, distance) pairs so an atom reached through more
        # than one neighbor cell is only recorded once.
        candidates = set()
        for neighbor_cell in neighbor_cell_map[atom_to_cell[atom]]:
            for neighbor_atom in cell_to_atoms[neighbor_cell]:
                if neighbor_atom == atom:
                    continue
                dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
                if periodic:
                    # NOTE(review): the wrap is applied to the scalar
                    # distance, not to the displacement components, and can
                    # yield negative values -- confirm this is intended.
                    dist = dist - self.boxsize * np.round(dist / self.boxsize)
                if dist < self.neighbor_cutoff:
                    candidates.add((neighbor_atom, dist))
        # Sort neighbors by distance and pick up to max_num_neighbors
        closest_neighbors = sorted(candidates, key=lambda elt: elt[1])
        neighbor_list[atom] = [
            nbr for (nbr, _) in closest_neighbors[:self.max_num_neighbors]
        ]

    Z = pad_array(
        np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]),
        self.max_num_atoms)
    coords = pad_array(coords, (self.max_num_atoms, 3))
    return (coords, neighbor_list, Z)
def _featurize(self, mol):
    """Compute neighbor list.

    Atoms are binned into cells; each atom is compared only against atoms in
    its neighboring cells and those closer than ``self.neighbor_cutoff`` are
    kept, sorted by distance and truncated to ``self.max_num_neighbors``.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.mol
        Molecule

    Returns
    -------
    tuple
        ``(coords, neighbor_list, Z)`` where ``coords`` is padded to
        ``(self.max_num_atoms, 3)``, ``neighbor_list`` maps each atom index
        to a list of neighbor atom indices ordered by increasing distance,
        and ``Z`` holds the atomic numbers padded to ``self.max_num_atoms``.
    """
    n_atoms = mol.GetNumAtoms()
    coords = get_coords(mol)

    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)

    # Map atoms to cells and cells to atoms. O(N)
    cell_to_atoms, atom_to_cell = put_atoms_in_cells(coords, x_bins, y_bins,
                                                     z_bins)

    # Map every cell to its neighboring cells (wraps around, i.e. assumes
    # periodic boundary conditions). O(constant)
    neighbor_cell_map = compute_neighbor_cell_map(
        len(x_bins), len(y_bins), len(z_bins))

    # Only atoms in neighboring cells are candidates, so the search is O(Nm)
    # with m the number of atoms in a set of neighboring cells.
    use_box = self.boxsize is not None
    neighbor_list = {}
    for center in range(n_atoms):
        within_cutoff = set()
        for cell in neighbor_cell_map[atom_to_cell[center]]:
            for other in cell_to_atoms[cell]:
                if other == center:
                    continue
                separation = np.linalg.norm(coords[center] - coords[other])
                if use_box:
                    separation = separation - self.boxsize * np.round(
                        separation / self.boxsize)
                if separation < self.neighbor_cutoff:
                    within_cutoff.add((other, separation))
        # Order by distance, drop the distances, keep the closest ones.
        ranked = sorted(list(within_cutoff), key=lambda pair: pair[1])
        indices = [index for (index, _) in ranked]
        neighbor_list[center] = indices[:self.max_num_neighbors]

    atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
    Z = pad_array(atomic_numbers, self.max_num_atoms)
    coords = pad_array(coords, (self.max_num_atoms, 3))
    return (coords, neighbor_list, Z)
def get_Z_matrix(self, mol, max_atoms):
    """
    Return the atomic numbers of ``mol`` padded to ``max_atoms``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule whose atoms are read with ``GetAtoms``.
    max_atoms : int
        Padding size handed to ``pad_array``.

    Returns
    -------
    Result of ``pad_array`` applied to the 1D array of atomic numbers.
    """
    atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
    return pad_array(atomic_numbers, max_atoms)