def __cal_coul_mat(self, mol):
    """
    Build the (zero-padded) Coulomb matrix of a single molecule.

    Diagonal entries are ``0.5 * Z_i ** 2.4`` and off-diagonal entries are
    ``Z_i * Z_j * self.const / |R_i - R_j|``, where ``Z`` is taken from
    ``mol.xyz.atomic_numbers`` and ``R`` from ``mol.xyz.geometry``.

    Parameters
    ----------
    mol : chemml.chem.Molecule
        Molecule that carries xyz information (``mol.xyz`` is not None).

    Returns
    -------
    numpy.ndarray
        Square array cropped to ``self.max_n_atoms_`` rows and columns;
        molecules with fewer atoms are filled up with zeros.

    Raises
    ------
    ValueError
        If `mol` is not a Molecule instance or has no xyz information.
    """
    if not isinstance(mol, Molecule):
        raise ValueError("The molecule must be a chemml.chem.Molecule object.")
    if mol.xyz is None:
        raise ValueError("The molecule must be a chemml.chem.Molecule object with xyz information.")

    # One row per atom: column 0 is the atomic number, the rest the coordinates.
    table = np.append(mol.xyz.atomic_numbers, mol.xyz.geometry, axis=1)
    n_atoms = len(table)
    size = self.max_n_atoms_

    rows = []
    for i in range(n_atoms):
        # Lower triangle: reuse the already computed symmetric entries.
        row = [rows[k][i] for k in range(i)]
        # Diagonal term.
        row.append(0.5 * table[i, 0]**2.4)
        # Upper triangle: pairwise repulsion terms.
        row.extend(
            (table[i, 0] * table[j, 0] * self.const) / np.linalg.norm(table[i, 1:] - table[j, 1:])
            for j in range(i + 1, n_atoms)
        )
        # Column padding up to the requested matrix size.
        row.extend(0.0 for _ in range(n_atoms, size))
        rows.append(row)

    # Row padding; padaxis is presumably (array, new_size, axis, pad_value) -- confirm.
    if size > n_atoms:
        rows = padaxis(np.array(rows), size, 0, 0)

    # Crop in case the molecule has more atoms than max_n_atoms_.
    return np.array(rows)[:size, :size]
def concat_mol_tensors(mol_tensors_list, match_degree=True, match_max_atoms=False):
    """Concatenates a list of molecule tensors along the batch axis

    # Arguments:
        mol_tensors_list: list or tuple of molecule tensors (e.g. a list of
            `(atoms, bonds, edges)`-triplets)
        match_degree: bool, if True, the degrees of all tensors must match;
            if False, smaller degrees are padded up to the largest one.
        match_max_atoms: bool, similar to match_degree but for max_atoms

    # Returns:
        a single `(atoms, bonds, edges)` molecule tensor

    # Raises:
        AssertionError: if the input is not a list/tuple, if the shapes inside
            one triplet disagree, or if a dimension that is required to match
            differs between triplets.
    """
    assert isinstance(mol_tensors_list, (tuple, list)), 'Provide a list or tuple of molecule tensors to concatenate'

    # Seed both targets from the first triplet: axis 1 of its atoms tensor
    # and axis 2 of its bonds tensor.
    target_atoms = mol_tensors_list[0][0].shape[1]
    target_degree = mol_tensors_list[0][1].shape[2]

    # Pass 1: validate every triplet and settle on the common dimensions.
    for atoms, bonds, edges in mol_tensors_list:
        assert bonds.shape[0] == edges.shape[0] == atoms.shape[0], "batchsize doesn't match within tensor"
        assert bonds.shape[1] == edges.shape[1] == atoms.shape[1], "max_atoms doesn't match within tensor"
        assert bonds.shape[2] == edges.shape[2], "degree doesn't match within tensor"

        if not match_max_atoms:
            target_atoms = max(target_atoms, atoms.shape[1])
        else:
            assert target_atoms == atoms.shape[1], '`max_atoms` of molecule tensors does not match, set `match_max_atoms` to False to adjust'

        if not match_degree:
            target_degree = max(target_degree, bonds.shape[2])
        else:
            assert target_degree == bonds.shape[2], '`degree` of molecule tensors does not match, set `match_degree` to False to adjust'

    # Pass 2: bring every triplet to the common size (edges are padded with -1).
    padded = []
    for atoms, bonds, edges in mol_tensors_list:
        padded.append((
            padaxis(atoms, target_atoms, axis=1),
            padaxis(padaxis(bonds, target_atoms, axis=1), target_degree, axis=2),
            padaxis(padaxis(edges, target_atoms, axis=1, pad_value=-1),
                    target_degree, axis=2, pad_value=-1),
        ))

    # Regroup by tensor kind and stack each group along the batch-size axis.
    return tuple(np.concatenate(parts, axis=0) for parts in zip(*padded))
def tensorise_molecules_singlecore(molecules, max_degree=5, max_atoms=None):
    """
    Takes a list of molecules and provides tensor representation of atom and bond features.

    Parameters
    ----------
    molecules : chemml.chem.Molecule object or array
        If list, it must be a list of chemml.chem.Molecule objects, otherwise we raise a ValueError.
        In addition, all the molecule objects must provide the SMILES representation.
        We try to create the SMILES representation if it's not available.

    max_degree : int, optional (default=5)
        The maximum number of neighbours per atom that each molecule can have
        (to which all molecules will be padded), use 'None' for auto

    max_atoms : int, optional (default=None)
        The maximum number of atoms per molecule (to which all molecules will be padded),
        use 'None' for auto

    Notes
    -----
    It is not recommended to set max_degree to `None`/auto when
    using `NeuralGraph` layers. Max_degree determines the number of
    trainable parameters and is essentially a hyperparameter.
    While models can be rebuilt using different `max_atoms`, they cannot
    be rebuilt for different values of `max_degree`, as the architecture
    will be different.

    For organic molecules `max_degree=5` is a good value (Duvenaud et. al, 2015)

    Returns
    -------
    atoms : array
        An atom feature array of shape (molecules, max_atoms, atom_features)

    bonds : array
        A bond feature array of shape (molecules, max_atoms, max_degree, bond_features)

    edges : array
        A connectivity array of shape (molecules, max_atoms, max_degree) holding
        neighbour atom indices; unused slots are filled with -1

    Raises
    ------
    ValueError
        If `molecules` is neither a Molecule nor a list/array, or if the SMILES
        representation of a molecule can not be generated.
    AssertionError
        If a molecule exceeds an explicitly given `max_atoms` or `max_degree`.
    """
    # TODO: Arguments for sparse vector encoding

    # Normalise the input to an array of molecule objects.
    if isinstance(molecules, (list, np.ndarray)):
        molecules = np.array(molecules)
    elif isinstance(molecules, Molecule):
        molecules = np.array([molecules])
    else:
        msg = "The input molecules must be a chemml.chem.Molecule object or a list of objects."
        raise ValueError(msg)

    # import sizes
    n = len(molecules)
    n_atom_features = num_atom_features()
    n_bond_features = num_bond_features()

    # Preallocate the atom and bond tensors with 0's and the edge tensor with
    # -1 (because 0 is a valid atom index). If max_degree or max_atoms is set
    # to None (auto), initialise that dim as small as possible (1).
    atom_tensor = np.zeros((n, max_atoms or 1, n_atom_features))
    bond_tensor = np.zeros((n, max_atoms or 1, max_degree or 1, n_bond_features))
    edge_tensor = -np.ones((n, max_atoms or 1, max_degree or 1), dtype=int)

    for mol_ix, mol in enumerate(molecules):

        # load mol, atoms and bonds
        if mol.rdkit_molecule is None:
            try:
                mol.to_smiles()
            except Exception as err:
                # Catch Exception only, so KeyboardInterrupt/SystemExit still
                # propagate, and chain the cause for easier debugging.
                msg = "The SMILES representation of the molecule %s can not be generated." % str(mol)
                raise ValueError(msg) from err
        atoms = mol.rdkit_molecule.GetAtoms()
        bonds = mol.rdkit_molecule.GetBonds()

        # If max_atoms is exceeded, resize if max_atoms=None (auto), else raise
        if len(atoms) > atom_tensor.shape[1]:
            assert max_atoms is None, 'too many atoms ({0}) in molecule: {1}'.format(len(atoms), str(mol))
            atom_tensor = padaxis(atom_tensor, len(atoms), axis=1)
            bond_tensor = padaxis(bond_tensor, len(atoms), axis=1)
            edge_tensor = padaxis(edge_tensor, len(atoms), axis=1, pad_value=-1)

        # Maps the RDKit atom index to the position used in the tensors.
        rdkit_ix_lookup = {}

        for atom_ix, atom in enumerate(atoms):
            # write atom features
            atom_tensor[mol_ix, atom_ix, :n_atom_features] = atom_features(atom)

            # store entry in idx
            rdkit_ix_lookup[atom.GetIdx()] = atom_ix

        # preallocate array with neighbour lists (indexed by atom)
        connectivity_mat = [[] for _ in atoms]

        for bond in bonds:
            # lookup atom ids
            a1_ix = rdkit_ix_lookup[bond.GetBeginAtom().GetIdx()]
            a2_ix = rdkit_ix_lookup[bond.GetEndAtom().GetIdx()]

            # lookup how many neighbours are encoded yet
            a1_neigh = len(connectivity_mat[a1_ix])
            a2_neigh = len(connectivity_mat[a2_ix])

            # If max_degree is exceeded, resize if max_degree=None (auto), else raise
            new_degree = max(a1_neigh, a2_neigh) + 1
            if new_degree > bond_tensor.shape[2]:
                assert max_degree is None, 'too many neighours ({0}) in molecule: {1}'.format(new_degree, mol)
                bond_tensor = padaxis(bond_tensor, new_degree, axis=2)
                edge_tensor = padaxis(edge_tensor, new_degree, axis=2, pad_value=-1)

            # store bond features once per endpoint, in that atom's next free slot
            bond_feature = np.array(bond_features(bond), dtype=int)
            bond_tensor[mol_ix, a1_ix, a1_neigh, :] = bond_feature
            bond_tensor[mol_ix, a2_ix, a2_neigh, :] = bond_feature

            # add to connectivity matrix
            connectivity_mat[a1_ix].append(a2_ix)
            connectivity_mat[a2_ix].append(a1_ix)

        # store connectivity matrix
        for a1_ix, neighbours in enumerate(connectivity_mat):
            degree = len(neighbours)
            edge_tensor[mol_ix, a1_ix, :degree] = neighbours

    return atom_tensor, bond_tensor, edge_tensor