def graph_from_smiles(smiles):
    """Build a MolGraph from a single SMILES string.

    One 'atom' node is created per atom (carrying its features and its RDKit
    index), one 'bond' node per bond (given both end-atom nodes as
    neighbors), and a single 'molecule' node that receives every atom node as
    a neighbor.

    Args:
        smiles: value handed unchanged to MolFromSmiles.

    Returns:
        The populated MolGraph.

    Raises:
        ValueError: if MolFromSmiles returns a falsy value.
    """
    graph = MolGraph()
    # Default parsing is used. An alternative path (sanitize=False followed by
    # a hand-picked set of sanitization flags) was sketched here previously
    # but is not active.
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # RDKit atom index -> graph node created for that atom.
    node_of = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        node_of[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_of[bond.GetBeginAtom().GetIdx()]
        end_node = node_of[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        # Only the begin atom is explicitly handed the end atom here.
        begin_node.add_neighbors((end_node,))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
def graph_from_smiles(smiles):
    """Build a MolGraph from a SMILES string wrapped in an array-like input.

    The SMILES text is taken from ``smiles[0]`` when `smiles` is exactly a
    ``numpy.ndarray``; for any other type it is read from
    ``smiles._data[0][0]``.

    Args:
        smiles: a numpy array whose first element is the SMILES string, or an
            object exposing ``_data`` (presumably a wrapper around a 2-D
            array -- confirm against callers).

    Returns:
        The populated MolGraph: one 'atom' node per atom, one 'bond' node per
        bond, and one 'molecule' node that receives all atom nodes.

    Raises:
        ValueError: if MolFromSmiles returns a falsy value.
    """
    graph = MolGraph()

    # Exact type match on purpose: subclasses of ndarray take the `_data` path.
    if type(smiles) is np.ndarray:
        str_smiles = smiles[0]
    else:
        str_smiles = smiles._data[0][0]

    mol = MolFromSmiles(str_smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", str_smiles)

    # RDKit atom index -> graph node created for that atom.
    atom_nodes = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        atom_nodes[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        first = atom_nodes[bond.GetBeginAtom().GetIdx()]
        second = atom_nodes[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((first, second))
        first.add_neighbors((second,))

    whole_molecule = graph.new_node('molecule')
    whole_molecule.add_neighbors(graph.nodes['atom'])
    return graph
def graph_from_smiles(smiles):
    """Turn a SMILES string into a MolGraph of atom, bond and molecule nodes.

    Args:
        smiles: value handed unchanged to MolFromSmiles.

    Returns:
        A MolGraph holding one "atom" node per atom, one "bond" node per bond
        (with both end-atom nodes as neighbors) and one "molecule" node that
        receives every atom node as a neighbor.

    Raises:
        ValueError: if MolFromSmiles returns a falsy value.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # Create all atom nodes up front, keyed by their RDKit index.
    atom_nodes = {
        atom.GetIdx(): graph.new_node(
            "atom", features=atom_features(atom), rdkit_ix=atom.GetIdx()
        )
        for atom in mol.GetAtoms()
    }

    for bond in mol.GetBonds():
        head = atom_nodes[bond.GetBeginAtom().GetIdx()]
        tail = atom_nodes[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node("bond", features=bond_features(bond))
        bond_node.add_neighbors((head, tail))
        head.add_neighbors((tail,))

    root = graph.new_node("molecule")
    root.add_neighbors(graph.nodes["atom"])
    return graph
def graph_from_amino_acids(sequence):
    """Build a MolGraph from a sequence string parsed by MolFromSequence.

    Args:
        sequence: value handed unchanged to MolFromSequence.

    Returns:
        A MolGraph with one 'atom' node per atom, one 'bond' node per bond
        (given both end-atom nodes as neighbors) and one 'molecule' node that
        receives all atom nodes as neighbors.

    Raises:
        ValueError: if MolFromSequence returns a falsy value.
    """
    graph = MolGraph()
    mol = MolFromSequence(sequence)
    if not mol:
        raise ValueError("Could not parse input string:", sequence)

    # RDKit atom index -> graph node created for that atom.
    lookup = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        lookup[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        src, dst = (
            lookup[end.GetIdx()]
            for end in (bond.GetBeginAtom(), bond.GetEndAtom())
        )
        connector = graph.new_node('bond', features=bond_features(bond))
        connector.add_neighbors((src, dst))
        src.add_neighbors((dst,))

    top = graph.new_node('molecule')
    top.add_neighbors(graph.nodes['atom'])
    return graph
def convertDataToGraph(self, data):
    """Convert rows of molecule data into an array of [target, graph] pairs.

    For every row, ``row['mol']`` supplies the atoms and ``row['pce']`` is
    converted with ``float()``. Each atom's feature row is its atom features
    concatenated with the element-wise sum of the features of all its bonds.

    Args:
        data: iterable of mappings with 'mol' and 'pce' entries.

    Returns:
        ``np.array`` built from a list of
        ``[pce, {'graphList': [...], 'atomsFeatures': ndarray}]`` entries,
        where each 'graphList' item is ``{'idx': atom index,
        'neighbor': ndarray of neighbor atom indices}``.
    """
    samples = []
    for row in data:
        atoms = row['mol'].GetAtoms()
        adjacency = []
        # assumes self.featureSize == len(atom features) + 6 -- TODO confirm
        feature_matrix = np.zeros((len(atoms), self.featureSize))

        for atom in atoms:
            atom_idx = atom.GetIdx()
            atom_part = atom_features(atom)
            bond_part = np.zeros(6)
            neighbor_ids = []

            for bond in atom.GetBonds():
                # The neighbor is whichever bond end is not the current atom.
                begin = bond.GetBeginAtom()
                other = bond.GetEndAtom() if begin.GetIdx() == atom_idx else begin
                neighbor_ids.append(other.GetIdx())
                bond_part += bond_features(bond)

            feature_matrix[atom_idx] = np.concatenate(
                (atom_part, bond_part), axis=0)
            adjacency.append(
                {'idx': atom_idx, 'neighbor': np.array(neighbor_ids)})

        samples.append([
            float(row['pce']),
            {'graphList': adjacency, 'atomsFeatures': feature_matrix},
        ])

    return np.array(samples)
def tensorize_smiles_job(smiles, max_degree=5, max_atoms=None):
    '''Takes a list of smiles and turns the graphs into tensor representation.

    # Arguments:
        smiles: a list (or other sized iterable) of smiles representations
        max_degree: the maximum number of neighbours per atom that each
            molecule can have (to which all molecules will be padded), use
            `None` for auto
        max_atoms: the maximum number of atoms per molecule (to which all
            molecules will be padded), use `None` for auto

    **NOTE**: It is not recommended to set max_degree to `None`/auto when
        using `NeuralGraph` layers. Max_degree determines the number of
        trainable parameters and is essentially a hyperparameter.
        While models can be rebuilt using different `max_atoms`, they cannot
        be rebuilt for different values of `max_degree`, as the architecture
        will be different.

        For organic molecules `max_degree=5` is a good value
        (Duvenaud et. al, 2015)

    # Returns:
        atoms: np.array, An atom feature np.array of size
            `(molecules, max_atoms, atom_features)`
        bonds: np.array, A bond feature np.array of size
            `(molecules, max_atoms, max_neighbours, bond_features)`
        edges: np.array, A connectivity array of size
            `(molecules, max_atoms, max_neighbours)`, padded with -1

    # Raises:
        AssertionError: if a smiles string cannot be parsed, or if a molecule
            exceeds an explicitly given `max_atoms` / `max_degree`.

    TODO:
        * Arguments for sparse vector encoding
    '''

    # import sizes
    n = len(smiles)
    n_atom_features = features.num_atom_features()
    n_bond_features = features.num_bond_features()

    # preallocate atom tensor with 0's and bond tensor with -1 (because of 0 index)
    # If max_degree or max_atoms is set to None (auto), initialise dim as small
    #   as possible (1)
    atom_tensor = np.zeros((n, max_atoms or 1, n_atom_features), dtype=np.float32)
    bond_tensor = np.zeros(
        (n, max_atoms or 1, max_degree or 1, n_bond_features), dtype=np.float32)
    # NOTE(review): int8 can only hold neighbour indices up to 127; molecules
    # with more atoms would need a wider dtype. Left unchanged to keep the
    # returned dtype stable for existing callers.
    edge_tensor = -np.ones((n, max_atoms or 1, max_degree or 1), dtype=np.int8)

    for mol_ix, s in enumerate(smiles):

        # load mol, atoms and bonds
        # Capture stderr only while parsing so the message can be attached to
        # the error, then always restore the real stream; otherwise every
        # later write to sys.stderr in the process would be swallowed.
        saved_stderr = sys.stderr
        sio = sys.stderr = StringIO()
        try:
            mol = Chem.MolFromSmiles(s)
        finally:
            sys.stderr = saved_stderr

        # Explicit raise (same exception type as the former `assert`) so the
        # check still runs under `python -O`.
        if mol is None:
            raise AssertionError(
                'Could not parse smiles {}, error: {}'.format(s, sio.getvalue()))

        atoms = mol.GetAtoms()
        bonds = mol.GetBonds()

        # If max_atoms is exceeded, resize if max_atoms=None (auto), else raise
        if len(atoms) > atom_tensor.shape[1]:
            if max_atoms is not None:
                raise AssertionError(
                    'too many atoms ({0}) in molecule: {1}'.format(len(atoms), s))
            atom_tensor = padaxis(atom_tensor, len(atoms), axis=1)
            bond_tensor = padaxis(bond_tensor, len(atoms), axis=1)
            edge_tensor = padaxis(edge_tensor, len(atoms), axis=1, pad_value=-1)

        rdkit_ix_lookup = {}

        for atom_ix, atom in enumerate(atoms):
            # write atom features
            atom_tensor[mol_ix, atom_ix, :n_atom_features] = features.atom_features(
                atom)

            # store entry in idx
            rdkit_ix_lookup[atom.GetIdx()] = atom_ix

        # preallocate array with neighbour lists (indexed by atom)
        connectivity_mat = [[] for _ in atoms]

        for bond in bonds:
            # lookup atom ids
            a1_ix = rdkit_ix_lookup[bond.GetBeginAtom().GetIdx()]
            a2_ix = rdkit_ix_lookup[bond.GetEndAtom().GetIdx()]

            # lookup how many neighbours are encoded yet
            a1_neigh = len(connectivity_mat[a1_ix])
            a2_neigh = len(connectivity_mat[a2_ix])

            # If max_degree is exceeded, resize if max_degree=None (auto), else raise
            new_degree = max(a1_neigh, a2_neigh) + 1
            if new_degree > bond_tensor.shape[2]:
                if max_degree is not None:
                    raise AssertionError(
                        'too many neighours ({0}) in molecule: {1}'.format(
                            new_degree, s))
                bond_tensor = padaxis(bond_tensor, new_degree, axis=2)
                edge_tensor = padaxis(edge_tensor, new_degree, axis=2, pad_value=-1)

            # store bond features on both end atoms, in each atom's next free slot
            bond_feats = np.array(features.bond_features(bond), dtype=int)
            bond_tensor[mol_ix, a1_ix, a1_neigh, :] = bond_feats
            bond_tensor[mol_ix, a2_ix, a2_neigh, :] = bond_feats

            # add to connectivity matrix
            connectivity_mat[a1_ix].append(a2_ix)
            connectivity_mat[a2_ix].append(a1_ix)

        # store connectivity matrix
        for a1_ix, neighbours in enumerate(connectivity_mat):
            degree = len(neighbours)
            edge_tensor[mol_ix, a1_ix, :degree] = neighbours

    return atom_tensor, bond_tensor, edge_tensor