def _featurize(self, datapoint: RDKitMol, **kwargs) -> "MATEncoding":
    """
    Featurize the molecule.

    Parameters
    ----------
    datapoint: RDKitMol
        RDKit mol object.
    **kwargs
        Only the deprecated ``mol`` keyword is inspected; when present its
        value replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    MATEncoding
        A MATEncoding dataclass instance consisting of processed node_features,
        adjacency_matrix and distance_matrix.
    """
    if 'mol' in kwargs:
        import warnings

        datapoint = kwargs.get("mol")
        # Warn instead of raising: the value was just copied into ``datapoint``
        # so the call is clearly meant to proceed with the deprecated keyword.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    from rdkit import Chem

    datapoint = self.construct_mol(datapoint)

    node_features = self.construct_node_features_matrix(datapoint)
    adjacency_matrix = Chem.GetAdjacencyMatrix(datapoint)
    distance_matrix = Chem.GetDistanceMatrix(datapoint)

    node_features, adjacency_matrix, distance_matrix = self._add_dummy_node(
        node_features, adjacency_matrix, distance_matrix)

    # All three matrices go through the same padding helper.
    node_features = self._pad_sequence(node_features)
    adjacency_matrix = self._pad_sequence(adjacency_matrix)
    distance_matrix = self._pad_sequence(distance_matrix)

    return MATEncoding(node_features, adjacency_matrix, distance_matrix)
def test_mat_encoder_layer():
    """Test invoking MATEncoderLayer."""
    from rdkit import Chem

    torch.manual_seed(0)

    # Inputs: a 2x2 feature tensor, an all-ones mask and the graph matrices
    # of ethane ("CC").
    features = torch.Tensor([[1., 2.], [5., 6.]])
    all_ones_mask = torch.Tensor([[1., 1.], [1., 1.]])
    ethane = Chem.MolFromSmiles("CC")
    adjacency = Chem.GetAdjacencyMatrix(ethane)
    distances = Chem.GetDistanceMatrix(ethane)

    layer_config = {
        'dist_kernel': 'softmax',
        'lambda_attention': 0.33,
        'lambda_distance': 0.33,
        'h': 2,
        'sa_hsize': 2,
        'sa_dropout_p': 0.0,
        'output_bias': True,
        'd_input': 2,
        'd_hidden': 2,
        'd_output': 2,
        'activation': 'relu',
        'n_layers': 2,
        'ff_dropout_p': 0.0,
        'encoder_hsize': 2,
        'encoder_dropout_p': 0.0,
    }
    encoder = torch_layers.MATEncoderLayer(**layer_config)

    actual = encoder(features, all_ones_mask, adjacency, distances, 0.0)
    expected = torch.tensor([
        [[0.9988, 2.0012], [-0.9999, 3.9999], [0.9988, 2.0012],
         [-0.9999, 3.9999]],
        [[5.0000, 6.0000], [3.0000, 8.0000], [5.0000, 6.0000],
         [3.0000, 8.0000]],
    ])
    assert torch.allclose(actual, expected, rtol=1e-4)
def create_adjancy_matrix(mol):
    """Return the RDKit adjacency matrix of ``mol`` as int8 with a unit diagonal.

    Off-diagonal entries are copied from ``Chem.GetAdjacencyMatrix(mol)``;
    every diagonal entry is overwritten with 1.
    """
    adj = np.array(Chem.GetAdjacencyMatrix(mol), dtype=np.int8)
    for k in range(len(adj)):
        adj[k, k] = 1
    return adj
def extract_info(smiles: str):
    """Print a structural summary of the molecule described by ``smiles``.

    Printed, in order: the SMARTS string, four lines per atom (index banner,
    symbol with atomic number, hybridization, symbol), every bond object,
    and finally the adjacency matrix.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.

    Raises
    ------
    AssertionError
        If the parsed molecule is falsy (parsing failed).
    """
    # First convert the SMILES string to an RDKit Mol object.
    mol = Chem.MolFromSmiles(smiles)
    # Explicit check rather than ``assert`` so validation survives
    # ``python -O``; AssertionError is kept for callers that already catch it.
    if not mol:
        raise AssertionError('could not parse SMILES: %r' % (smiles,))

    print('SMARTS strings: %s' % Chem.MolToSmarts(mol))

    # Get all the atoms, bonds, information on both, and adjacency matrix
    for idx, atom in enumerate(mol.GetAtoms()):
        print('Information on atom #%i in the molecule: ' % idx)
        print('\tAtom: %s (%i)' % (atom.GetSymbol(), atom.GetAtomicNum()))
        print(atom.GetHybridization())
        print(atom.GetSymbol())

    for bond in mol.GetBonds():
        print(bond)

    print(Chem.GetAdjacencyMatrix(mol))
def create_dataset(self, filename, dataset, radius, device):
    """Read ``../dataset/<dataset>/<filename>`` and convert each row to tensors.

    The first line of the file is consumed and ignored. Every remaining line
    is split on whitespace into a SMILES string and an integer label; rows
    whose SMILES contains ``'.'`` are dropped.

    Returns a list of ``(fingerprints, adjacency, molecular_size, property)``
    tuples where the three array-like items are torch tensors moved to
    ``device`` and ``molecular_size`` is the atom count after adding
    hydrogens.
    """
    dir_dataset = '../dataset/' + dataset + '/'

    # Load a dataset.
    with open(dir_dataset + filename, 'r') as f:
        f.readline()  # header line: read and discarded
        rows = f.read().strip().split('\n')

    # Exclude the data contains '.' in its smiles.
    rows = [row for row in rows if '.' not in row.split()[0]]

    samples = []
    for row in rows:
        smiles, label = row.strip().split()

        # Create each data with the above defined functions.
        mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
        atoms = self.create_atoms(mol, self.atom_dict)
        molecular_size = len(atoms)
        i_jbond_dict = self.create_ijbonddict(mol, self.bond_dict)
        fingerprints = self.extract_fingerprints(
            radius, atoms, i_jbond_dict, self.fingerprint_dict,
            self.edge_dict)
        adjacency = Chem.GetAdjacencyMatrix(mol)

        # Transform the above each data of numpy to pytorch tensor on a
        # device (i.e., CPU or GPU).
        samples.append((
            torch.LongTensor(fingerprints).to(device),
            torch.FloatTensor(adjacency).to(device),
            molecular_size,
            torch.LongTensor([int(label)]).to(device),
        ))

    return samples
def __init__(
    self, mol, radius: int = 2, nbits: int = 2048, n_feat: np.array = None
):
    """Store the molecule and seed the radius-0 atom identifiers.

    Parameters
    ----------
    mol
        Molecule passed to ``Chem.GetAdjacencyMatrix``.
    radius : int
        ``self.identifier`` gets one (initially empty) dict per level
        ``0..radius``.
    nbits : int
        Length of the zero-initialised ``self.fps`` int32 vector.
    n_feat
        Per-atom feature rows; when ``None`` they are obtained from
        ``self.createNodeFeatures()``.
    """
    self.mol = mol
    self.radius = radius
    self.nbits = nbits
    self.fps = np.zeros(shape=(self.nbits,), dtype=np.int32)

    node_features = self.createNodeFeatures() if n_feat is None else n_feat
    node_features = np.array(node_features, dtype=np.int32)
    self.adj = Chem.GetAdjacencyMatrix(mol)

    self.identifier: Dict[int, Dict[int, int]] = defaultdict(dict)
    for level in range(radius + 1):
        self.identifier[level] = {}

    # concatenate node features.
    # NOTE(review): builtin hash() of a str is salted per interpreter run
    # unless PYTHONHASHSEED is fixed, so these identifiers may differ
    # between processes - confirm that is acceptable.
    level_zero = self.identifier[0]
    for atom_idx, feats in enumerate(node_features):
        level_zero[atom_idx] = hash("".join(str(value) for value in feats))
def test_smiles_from_adjacent_matrix(smiles):
    """Check that a molecule rebuilt from its adjacency matrix matches the input.

    The molecule is reduced to atoms, formal charge and adjacency matrix,
    rebuilt through ``x2m.AC2mol`` and the SMILES of the original must be
    among the SMILES of the rebuilt candidates (hydrogens removed).
    """
    charged_fragments = True
    quick = True

    # Cut apart the smiles
    mol = get_mol(smiles)
    atoms = get_atoms(mol)
    charge = Chem.GetFormalCharge(mol)
    adjacent_matrix = Chem.GetAdjacencyMatrix(mol)

    canonical_smiles = Chem.MolToSmiles(mol)

    # Define new molecule template from atoms
    template = x2m.get_proto_mol(atoms)

    # reconstruct the molecule from adjacent matrix, atoms and total charge
    candidates = x2m.AC2mol(template, adjacent_matrix, atoms, charge,
                            charged_fragments, quick)

    rebuilt_smiles = [
        Chem.MolToSmiles(Chem.RemoveHs(candidate)) for candidate in candidates
    ]

    assert canonical_smiles in rebuilt_smiles
def read_graph(source_path, MAX_size):
    """Read SMILES lines and build padded vertex lists and normalised adjacency.

    Reading stops at the first empty (after ``strip``) line. For each
    molecule:

    * atoms are mapped to integer ids (assigned in order of first
      appearance, starting at 1) and padded with 0 up to ``MAX_size``;
    * the adjacency matrix is padded with zeros to ``MAX_size`` x
      ``MAX_size``, the identity is added (A_hat = A + I) and the result is
      scaled as ``D^-1/2 * A_hat * D^-1/2``, where any diagonal factor equal
      to 1 (row sum of A_hat equal to 1, e.g. a padding row) is replaced
      by 0.

    Parameters
    ----------
    source_path
        Path opened with ``tf.gfile.GFile``; one SMILES per line.
    MAX_size : int
        Target number of rows/columns after padding.

    Returns
    -------
    tuple
        ``(Vertex, Adj, max_size)``: list of int32 vertex arrays, list of
        normalised adjacency matrices, and the largest atom count seen.
        Previously the function fell off the end and returned ``None``.
    """
    Vertex = []
    Adj = []  # Normalized adjacency matrix
    PAD = 0
    mydict = {}
    max_size = 0
    with tf.gfile.GFile(source_path, mode="r") as source_file:
        source = source_file.readline().strip()
        while source:
            mol = Chem.MolFromSmiles(source)

            atom_list = []
            for a in mol.GetAtoms():
                m = a.GetSymbol()
                if m not in mydict:
                    # ids are 1-based so that 0 stays reserved for padding
                    mydict[m] = len(mydict) + 1
                atom_list.append(mydict[m])
            max_size = max(max_size, len(atom_list))
            if len(atom_list) < MAX_size:
                atom_list = atom_list + [PAD] * (MAX_size - len(atom_list))
            Vertex.append(np.array(atom_list, np.int32))

            adj_temp = []
            for adja in Chem.GetAdjacencyMatrix(mol):
                if len(adja) < MAX_size:
                    pad = [PAD] * (MAX_size - len(adja))
                    adja = np.array(list(adja) + pad, np.int32)
                adj_temp.append(adja)
            for _ in range(MAX_size - len(adj_temp)):
                adj_temp.append(np.array([PAD] * MAX_size, np.int32))

            adj_temp = adj_temp + np.eye(MAX_size)  # A_hat = A + I
            deg = np.power(np.sum(adj_temp, axis=1), -0.5)
            deg_new = deg.copy()
            deg_new[deg_new == 1] = 0
            deg_diag = np.diag(deg_new)
            adj = np.matmul(deg_diag, adj_temp)
            adj = np.matmul(adj, deg_diag)  # normalized
            Adj.append(adj)

            source = source_file.readline().strip()
    return Vertex, Adj, max_size
def CalculateBalaban(mol):
    """
    Calculation of Balaban index in a molecule ----> J

    Usage:
        result = CalculateBalaban(mol)
    Input:
        mol is a molecule object
    Output:
        result is a numeric value: ``Nbond / (mu + 1)`` times the sum, over
        adjacent atom pairs, of ``1 / sqrt(S_i * S_j)``, where ``S`` holds the
        row sums of the distance matrix and ``mu = Nbond - Natom + 1``.
        0 is returned when ``mu + 1`` equals 0.
    """
    adjacency = Chem.GetAdjacencyMatrix(mol)
    distances = Chem.GetDistanceMatrix(mol)
    n_bonds = mol.GetNumBonds()
    n_atoms = mol.GetNumAtoms()
    dist_sums = numpy.sum(distances, axis=1)
    mu = n_bonds - n_atoms + 1

    total = 0.
    size = len(distances)
    for row in range(size):
        row_sum = dist_sums[row]
        # Upper triangle only, so every bonded pair contributes once.
        for col in range(row, size):
            if adjacency[row, col] == 1:
                total += 1. / numpy.sqrt(row_sum * dist_sums[col])

    if mu + 1 == 0:
        return 0
    return float(n_bonds) / float(mu + 1) * total
def extract_smiles(xyz_file, charge, allow_charge=True, check_ac=False):
    """
    uses xyz2mol to extract smiles with as much 3d structural information as
    possible

    The molecule is first built with ``use_huckel=True``; if that attempt
    raises, it is rebuilt with ``use_huckel=False``.

    Parameters:
        xyz_file: path handed to ``xyz2mol_local.read_xyz_file``.
        charge: total charge forwarded to ``xyz2mol``.
        allow_charge: forwarded as ``allow_charged_fragments``.
        check_ac: when true, the adjacency matrix of the built molecule is
            compared against the module-level ``AC`` and the module-level
            flag ``AC_SAME`` is set to ``False`` on any difference.

    Returns:
        ``(smiles, formal_charge, resonance_status)`` for the chosen
        resonance structure.
    """
    global AC_SAME

    atoms, _, xyz_coordinates = xyz2mol_local.read_xyz_file(xyz_file)

    shared_options = dict(charge=charge,
                          use_graph=True,
                          allow_charged_fragments=allow_charge,
                          use_atom_maps=True,
                          embed_chiral=True)
    try:
        input_mol = xyz2mol_local.xyz2mol(atoms, xyz_coordinates,
                                          use_huckel=True, **shared_options)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed by the fallback.
        input_mol = xyz2mol_local.xyz2mol(atoms, xyz_coordinates,
                                          use_huckel=False, **shared_options)

    input_mol = reorder_atoms_to_map(input_mol)
    structure_mol, res_status = choose_resonance_structure(input_mol)
    structure_mol = chiral_tags(structure_mol)
    rdmolops.AssignStereochemistry(structure_mol)
    structure_smiles = Chem.MolToSmiles(structure_mol)

    if check_ac:
        ac = Chem.GetAdjacencyMatrix(input_mol)
        # ``AC`` is a module-level reference matrix defined outside this
        # function.
        if not np.all(AC == ac):
            AC_SAME = False
            print("change in AC: stopping")

    return structure_smiles, GetFormalCharge(structure_mol), res_status
def calculate(self, An=None, A1=None):
    """Return the adjacency matrix for order 1, else the product ``An . A1``.

    For ``self.order == 1`` the matrix comes straight from RDKit (forced
    recomputation, bond orders controlled by ``self.useBO``) and both
    arguments are ignored. For any other order the result is
    ``An.dot(A1)``.
    """
    if self.order != 1:
        return An.dot(A1)
    return Chem.GetAdjacencyMatrix(self.mol, useBO=self.useBO, force=True)
def _GetBurdenMatrix(mol: Chem.Mol, propertylabel: str = 'm') -> numpy.ndarray:
    """Calculate the eigenvalues of the weighted Burden matrix.

    Hydrogens are added first. The matrix B (float32) is filled as follows:

    * ``B[i, i]``: relative atomic property ``propertylabel`` of atom i
      (from ``GetRelativeAtomicProperty``), rounded to 3 decimals;
    * ``B[i, j]`` for bonded atoms: square root of the bond order (1, 2, 3
      or 1.5 for aromatic), rounded to 3 decimals; other bond types keep the
      adjacency value;
    * all remaining off-diagonal entries: 0.001.

    :param mol: molecule
    :param propertylabel: property name passed to ``GetRelativeAtomicProperty``
    :return: real parts of the eigenvalues of B (a 1-D array, not a matrix)
    """
    mol = Chem.AddHs(mol)
    Natom = mol.GetNumAtoms()
    AdMatrix = Chem.GetAdjacencyMatrix(mol)
    AdMatrix1 = numpy.array(AdMatrix, dtype=numpy.float32)

    # The diagonal elements of B, Bii, are either given by
    # the carbon normalized atomic mass,
    # van der Waals volume, Sanderson electronegativity,
    # and polarizability of atom i.
    for i in range(Natom):
        atom = mol.GetAtomWithIdx(i)
        temp = GetRelativeAtomicProperty(element=atom.GetSymbol(),
                                         propertyname=propertylabel)
        AdMatrix1[i, i] = round(temp, 3)

    # The element of B connecting atoms i and j, Bij,
    # is equal to the square root of the bond
    # order between atoms i and j.
    bond_order = {'SINGLE': 1, 'DOUBLE': 2, 'TRIPLE': 3, 'AROMATIC': 1.5}
    for row, col in numpy.argwhere(AdMatrix):
        bond = mol.GetBondBetweenAtoms(int(row), int(col))
        order = bond_order.get(bond.GetBondType().name)
        if order is not None:
            AdMatrix1[row, col] = round(numpy.sqrt(order), 3)

    # All other elements of B (corresponding non bonded
    # atom pairs) are set to 0.001
    for row, col in numpy.argwhere(AdMatrix == 0):
        if row != col:
            AdMatrix1[row, col] = 0.001

    return numpy.real(numpy.linalg.eigvals(AdMatrix1))
def create_multi_adjancy_matrix(mol):
    """Build a 5-channel adjacency tensor, one channel per bond type.

    Channels: 0 single, 1 double, 2 triple, 3 aromatic, 4 any other bond
    type. For each bond the entry ``[channel, begin_idx, end_idx]`` is set
    to 1, and every channel's diagonal is set to 1.

    NOTE(review): only ``[begin, end]`` is written, not ``[end, begin]``, so
    the channels are not symmetric - confirm this is intended.

    :param mol: RDKit molecule
    :return: integer array of shape ``(5, num_atoms, num_atoms)``
    """
    num = mol.GetNumAtoms()
    nch = 5
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    adj = np.zeros((nch, num, num), dtype=int)

    channel_of = {
        Chem.rdchem.BondType.SINGLE: 0,
        Chem.rdchem.BondType.DOUBLE: 1,
        Chem.rdchem.BondType.TRIPLE: 2,
        Chem.rdchem.BondType.AROMATIC: 3,
    }
    for b in mol.GetBonds():
        ch = channel_of.get(b.GetBondType(), 4)
        adj[ch, b.GetBeginAtomIdx(), b.GetEndAtomIdx()] = 1

    for ch in range(nch):
        for i in range(num):
            adj[ch, i, i] = 1
    return adj
def create_adjacency(mol):
    """
    Build the adjacency matrix of a molecule as an int32 NumPy array.

    :param mol: rdkit.Chem.Mol object
    :return: copy of ``Chem.GetAdjacencyMatrix(mol)`` with dtype ``np.int32``
    """
    return np.array(Chem.GetAdjacencyMatrix(mol), dtype=np.int32)
def CalculateSchiultz(mol: Chem.Mol) -> float:
    """Get Schiultz number.

    Or Tsch. Computed as the sum of the entries of ``(D + A) . v`` where
    ``D`` is the distance matrix, ``A`` the adjacency matrix and ``v`` the
    vector of column sums of ``A``.

    :param mol: molecule
    :return: the summed value
    """
    Distance = numpy.array(Chem.GetDistanceMatrix(mol), 'd')
    Adjacent = numpy.array(Chem.GetAdjacencyMatrix(mol), 'd')
    # builtin sum over a 2-D array adds its rows, i.e. yields column sums
    VertexDegree = sum(Adjacent)
    # numpy.dot replaces scipy.dot, which was only a deprecated re-export
    # of the NumPy function in the scipy namespace.
    return sum(numpy.dot((Distance + Adjacent), VertexDegree))
def create_adjacency(mol):
    """Return the symmetrically normalised adjacency matrix with self-loops.

    Computes ``D^-1/2 (A + I) D^-1/2`` where ``A`` is the RDKit adjacency
    matrix of ``mol`` and ``D`` is the diagonal matrix of column sums of
    ``A + I``.
    """
    raw = Chem.GetAdjacencyMatrix(mol)
    with_loops = raw + np.eye(raw.shape[0])
    # builtin sum adds the rows of the matrix -> one total per column
    column_totals = sum(with_loops)
    sqrt_degree = np.sqrt(np.diag(column_totals))
    inv_sqrt_degree = np.linalg.inv(sqrt_degree)
    normalised = inv_sqrt_degree @ (with_loops @ inv_sqrt_degree)
    return np.array(normalised)
def valences_not_too_large(mol):
    """Return True when no atom's summed bond order exceeds its allowed valence.

    Allowed valences are looked up by atomic number in a fixed table
    (B, C, N, O, F, S, Cl, Br, I); an atomic number missing from the table
    raises ``KeyError``. Bond-order sums are the row sums of the RDKit
    adjacency matrix computed with ``useBO=True``.
    """
    valence_dict = {5: 3, 6: 4, 7: 3, 8: 2, 9: 1, 16: 6, 17: 1, 35: 1, 53: 1}
    allowed = [valence_dict[atom.GetAtomicNum()] for atom in mol.GetAtoms()]

    bond_order_sums = Chem.GetAdjacencyMatrix(mol, useBO=True).sum(axis=1)

    return not any(
        used > limit for limit, used in zip(allowed, bond_order_sums))
def CalculateSchiultz(mol):
    """
    Calculation of Schiultz number

    The value is the sum of the entries of ``(D + A) . v`` where ``D`` is
    the distance matrix, ``A`` the adjacency matrix and ``v`` the vector of
    column sums of ``A``.

    Parameters:
        mol: RDKit molecule object
    Returns:
        Tsch: the Schiultz number
    """
    Distance = numpy.array(Chem.GetDistanceMatrix(mol), 'd')
    Adjacent = numpy.array(Chem.GetAdjacencyMatrix(mol), 'd')
    # builtin sum over a 2-D array adds its rows, i.e. yields column sums
    VertexDegree = sum(Adjacent)
    # numpy.dot replaces scipy.dot, which was only a deprecated re-export
    # of the NumPy function in the scipy namespace.
    return sum(numpy.dot((Distance + Adjacent), VertexDegree))
def buildMPNN(molecule, med_voc, radius=1, device="cpu:0"):
    """Build per-drug graph inputs and the averaging projection matrix.

    For every value ``atc3`` in ``med_voc`` (in iteration order), each SMILES
    in ``molecule[atc3]`` is converted into ``(fingerprints, adjacency,
    molecular_size)`` and appended to the returned set. SMILES that raise
    during conversion are skipped.

    Parameters
    ----------
    molecule : mapping
        ``atc3`` code -> iterable of SMILES strings.
    med_voc : mapping
        Only its values (``atc3`` codes) are used.
    radius : int
        Forwarded to ``extract_fingerprints``.
    device : str
        Torch device the tensors are moved to.

    Returns
    -------
    tuple
        ``(MPNNSet, N_fingerprint, average_projection)`` where
        ``average_projection`` is a float tensor with one row per ``med_voc``
        entry and one column per kept molecule; row ``i`` holds ``1 / k``
        over the ``k`` columns of the molecules kept for entry ``i``.
    """
    # Each dict hands out the next free integer id on first lookup.
    atom_dict = defaultdict(lambda: len(atom_dict))
    bond_dict = defaultdict(lambda: len(bond_dict))
    fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
    edge_dict = defaultdict(lambda: len(edge_dict))
    MPNNSet, average_index = [], []

    for atc3 in med_voc.values():
        smilesList = list(molecule[atc3])
        # Create each data with the above defined functions.
        counter = 0  # counter how many drugs are under that ATC-3
        for smiles in smilesList:
            try:
                mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
                atoms = create_atoms(mol, atom_dict)
                molecular_size = len(atoms)
                i_jbond_dict = create_ijbonddict(mol, bond_dict)
                fingerprints = extract_fingerprints(radius, atoms,
                                                    i_jbond_dict,
                                                    fingerprint_dict,
                                                    edge_dict)
                adjacency = Chem.GetAdjacencyMatrix(mol)
                # Pad the fingerprint vector with 1s up to the atom count.
                for _ in range(adjacency.shape[0] - fingerprints.shape[0]):
                    fingerprints = np.append(fingerprints, 1)

                # Transform the above each data of numpy to pytorch tensor
                # on a device (i.e., CPU or GPU).
                fingerprints = torch.LongTensor(fingerprints).to(device)
                adjacency = torch.FloatTensor(adjacency).to(device)
                MPNNSet.append((fingerprints, adjacency, molecular_size))
                counter += 1
            except Exception:
                # Best-effort: skip molecules that cannot be processed.
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit still propagate.
                continue

        average_index.append(counter)

    N_fingerprint = len(fingerprint_dict)

    # transform into projection matrix
    n_col = sum(average_index)
    n_row = len(average_index)

    average_projection = np.zeros((n_row, n_col))
    col_counter = 0
    for i, item in enumerate(average_index):
        if item > 0:
            average_projection[i, col_counter:col_counter + item] = 1 / item
        col_counter += item

    return MPNNSet, N_fingerprint, torch.FloatTensor(average_projection)
def _process_row(self, smiles, label=None):
    """Convert one SMILES string (and optional label) into a ``Data`` object.

    With a configured ``self.feature_extractor`` its returned mapping
    supplies the ``Data`` keyword arguments. Otherwise the graph is described
    by ``num_nodes`` (adjacency matrix size) and ``edge_index`` (the stacked
    row/column indices of the non-zero adjacency entries as a LongTensor).
    """
    mol = Chem.MolFromSmiles(smiles)

    if self.feature_extractor is not None:
        return Data(y=label, **self.feature_extractor(mol))

    adj = Chem.GetAdjacencyMatrix(mol)
    edge_index = torch.LongTensor(np.stack(np.nonzero(adj)))
    return Data(y=label, num_nodes=adj.shape[0], edge_index=edge_index)
def create_dataset_randomsplit(x, y, path, dataname):
    """Featurize InChI strings with the pre-trained ``SMRT-`` dictionaries.

    The four pickled dictionaries ``<path>SMRT-{atom,bond,edge,fingerprint}
    _dict.pickle`` are merged into the module-level dictionaries of the same
    names, every ``x[i]`` (an InChI string) is converted to tensors, and the
    (possibly grown) dictionaries are dumped to ``<path><dataname>-*.pickle``.

    Relies on the module-level names ``radius`` and ``device``.

    Parameters
    ----------
    x : indexable
        InChI strings, accessed as ``x[i]``.
    y : indexable
        Target values, accessed as ``y[i]`` and converted with ``float``.
    path : str
        Prefix for both the input and the output pickle files.
    dataname : str
        Name prefix of the dumped dictionaries.

    Returns
    -------
    list
        ``(inchi, fingerprints, adjacency, molecular_size, property)`` tuples.
    """
    dir_input = path + 'SMRT-'

    # NOTE: pickle.load must only be used on trusted files.
    pretrained = (
        ('atom_dict.pickle', atom_dict),
        ('bond_dict.pickle', bond_dict),
        ('edge_dict.pickle', edge_dict),
        ('fingerprint_dict.pickle', fingerprint_dict),
    )
    for file_name, target in pretrained:
        with open(dir_input + file_name, 'rb') as f:
            loaded = pickle.load(f)
        for key in loaded.keys():
            target[key] = loaded[key]

    dataset = []
    for i in range(len(x)):
        smiles = x[i]
        target_value = y[i]

        # Create each data with the above defined functions.
        # The InChI string is parsed once (it used to be parsed twice).
        mol = Chem.AddHs(Chem.MolFromInchi(smiles))
        atoms = create_atoms(mol, atom_dict)
        molecular_size = len(atoms)
        i_jbond_dict = create_ijbonddict(mol, bond_dict)
        fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
                                            fingerprint_dict, edge_dict)
        adjacency = np.float32((Chem.GetAdjacencyMatrix(mol)))

        # Transform the above each data of numpy to pytorch tensor on a
        # device (i.e., CPU or GPU).
        fingerprints = torch.LongTensor(fingerprints).to(device)
        adjacency = torch.FloatTensor(adjacency).to(device)
        target_tensor = torch.FloatTensor([[float(target_value)]]).to(device)
        dataset.append(
            (smiles, fingerprints, adjacency, molecular_size, target_tensor))

    dir_dataset = path
    dump_dictionary(fingerprint_dict,
                    dir_dataset + dataname + '-fingerprint_dict.pickle')
    dump_dictionary(atom_dict, dir_dataset + dataname + '-atom_dict.pickle')
    dump_dictionary(bond_dict, dir_dataset + dataname + '-bond_dict.pickle')
    dump_dictionary(edge_dict, dir_dataset + dataname + '-edge_dict.pickle')
    return dataset
def read_graph(source_path,MAX_size):
    """Read SMILES lines into padded vertex id arrays and ``A + I`` matrices.

    Lines are read until the first one that is empty after stripping. Atom
    symbols receive integer ids in order of first appearance, starting at 1;
    0 is the padding value. Vertex lists and adjacency rows/columns are
    padded up to ``MAX_size`` and the identity matrix is added to each
    adjacency matrix.

    Returns ``(Vertex, Adj, max_size)`` where ``max_size`` is the largest
    atom count encountered.
    """
    PAD = 0
    Vertex = []
    Adj = []  # adjacency with self-loops (A + I)
    symbol_ids = {}
    max_size = 0

    with tf.gfile.GFile(source_path, mode="r") as source_file:
        # iter(..., "") stops at the first blank (stripped) line
        for source in iter(lambda: source_file.readline().strip(), ""):
            mol = Chem.MolFromSmiles(source)

            atom_ids = []
            for atom in mol.GetAtoms():
                symbol = atom.GetSymbol()
                if symbol not in symbol_ids:
                    symbol_ids[symbol] = len(symbol_ids) + 1
                atom_ids.append(symbol_ids[symbol])

            max_size = max(max_size, len(atom_ids))
            if len(atom_ids) < MAX_size:
                atom_ids = atom_ids + [PAD] * (MAX_size - len(atom_ids))
            Vertex.append(np.array(atom_ids, np.int32))

            rows = []
            for row in Chem.GetAdjacencyMatrix(mol):
                if len(row) < MAX_size:
                    filler = [PAD] * (MAX_size - len(row))
                    row = np.array(list(row) + filler, np.int32)
                rows.append(row)
            rows.extend(
                np.array([PAD] * MAX_size, np.int32)
                for _ in range(MAX_size - len(rows)))

            Adj.append(rows + np.eye(MAX_size))  # A_hat = A + I

    return Vertex,Adj,max_size
def get_adjacency_matrix(smiles: str):
    """
    Compute adjacency matrix between atoms. Only works for single molecules atm and not for rxns

    Args:
        smiles: SMILES representation of a molecule

    Returns:
        Numpy array representing the adjacency between each atom and every
        other atom in the molecular SMILES, as produced by
        ``Chem.GetAdjacencyMatrix`` (per the original note, it marks the
        atom pairs whose distance-matrix entry equals 1).
    """
    return Chem.GetAdjacencyMatrix(Chem.MolFromSmiles(smiles))
def create_dataset(filepath):
    """Load a dataset.

    Each line of ``filepath`` is tab-separated; columns 6 and 7 (0-based)
    hold two SMILES strings. Each is converted into
    ``(fingerprints, adjacency, molecular_size, mask)``.

    A SMILES that is empty, contains ``'.'`` or cannot be processed by
    RDKit is replaced with ``'CC'`` and gets ``mask == [0]``; otherwise
    ``mask == [1]``.

    Relies on the module-level names ``atom_dict``, ``bond_dict``,
    ``fingerprint_dict``, ``edge_dict`` and ``radius``.

    Returns
    -------
    list
        One two-element list of tuples per input line.
    """
    with open(filepath, 'r') as f:
        data_original = f.readlines()
    print(len(data_original))

    pairs = []
    for line in data_original:
        columns = line.strip('\n').split('\t')
        pairs.append([columns[6], columns[7]])

    dataset = []
    for pair in pairs:
        dataset_ = []
        for smiles in pair:
            mol = None
            if '.' not in smiles and smiles != '':
                try:
                    mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
                except Exception:
                    # Invalid SMILES: fall back to the placeholder below.
                    # Narrowed from a bare ``except``.
                    mol = None

            if mol is None:
                # Replace multi-fragment, missing or invalid smiles
                # with 'CC' and mask the entry out.
                mask = [0]
                mol = Chem.AddHs(Chem.MolFromSmiles('CC'))
            else:
                mask = [1]

            # Create each data with the above defined functions.
            atoms = create_atoms(mol, atom_dict)
            molecular_size = len(atoms)
            i_jbond_dict = create_ijbonddict(mol, bond_dict)
            fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
                                                fingerprint_dict, edge_dict)
            adjacency = Chem.GetAdjacencyMatrix(mol)

            dataset_.append((fingerprints, adjacency, molecular_size, mask))
        dataset.append(dataset_)
    return dataset
def smiles_to_mol_graph(smiles):
    """Convert a SMILES string to ``(mol, molecular_graph)``.

    Node features are looked up per atom in the module-level
    ``atomic_props`` mapping (keyed by atomic number); the feature width is
    taken from the entry for atomic number 1. The graph itself is built by
    ``construct_mol_graph``.

    Returns ``(None, None)`` and prints a message when any step raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        adj_mat = Chem.GetAdjacencyMatrix(mol)
        node_feat_mat = np.empty(
            [mol.GetNumAtoms(), atomic_props.get(1).shape[0]])

        for ind, atom in enumerate(mol.GetAtoms()):
            node_feat_mat[ind, :] = atomic_props.get(atom.GetAtomicNum())

        return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are not reported as conversion failures.
        print(smiles + ' could not be converted to molecular graph due to the internal errors of RDKit')
        return None, None
def adjacency_matrix(self, bond_orders=False, force=True):
    """ The vertex adjacency matrix.

    Args:
        bond_orders (bool):
            Whether to use bond orders (forwarded as ``useBO``).
        force (bool):
            Whether to recalculate or used rdkit cached value.

    Returns:
        np.array: the matrix RDKit computes for ``self.owner``.
    """
    rdkit_options = {'useBO': bond_orders, 'force': force}
    return Chem.GetAdjacencyMatrix(self.owner, **rdkit_options)
def smiles_to_pseudo_xyz(smiles):
    """Turn SMILES strings into coordinate-free ``ExtendedXyz`` configurations.

    For every SMILES that parses, hydrogens are added and a configuration is
    created with all positions at the origin, the atom symbols, and the
    adjacency matrix (as floats) stored under ``info["lmat"]``. SMILES that
    fail to parse are skipped.

    Parameters
    ----------
    smiles : iterable of str

    Returns
    -------
    list
        The configurations, in input order.
    """
    configs = []
    for smi in smiles:
        mol = chem.MolFromSmiles(smi)
        # The None check has to come before AddHs; it used to run after the
        # call it was meant to guard.
        if mol is None:
            continue
        mol = chem.AddHs(mol)

        symbols = [a.GetSymbol() for a in mol.GetAtoms()]
        pos = np.zeros((len(symbols), 3))
        config = readwrite.ExtendedXyz(pos=pos, symbols=symbols)
        config.info["lmat"] = 1.*chem.GetAdjacencyMatrix(mol)
        configs.append(config)
    return configs
def take_elementary_step(mol, charge, E_cutoff, heterolytic, quick):
    """Enumerate products one elementary step away from ``mol``.

    Candidate adjacency matrices come from ``get_I_elementary``; each is
    turned into a molecule with ``xyz2mol.AC2mol`` (chirality of the parent
    is re-applied when the parent has chiral centres). Candidates are
    de-duplicated by isomeric SMILES and kept when their SMILES can be parsed
    by RDKit and ``energy(reactant) - energy(candidate) < E_cutoff``
    (energies from ``get_BO_energy``).

    Returns
    -------
    tuple
        ``(smiles_list, molecules)``; the reactant itself is always the
        first entry of both lists.
    """
    chiral_parent = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    parent_is_chiral = len(chiral_parent) > 0
    if parent_is_chiral:
        atom2chirality = dict(chiral_parent)

    atomicNumList = [a.GetAtomicNum() for a in mol.GetAtoms()]
    proto_mol = xyz2mol.get_proto_mol(atomicNumList)

    AC = Chem.GetAdjacencyMatrix(mol)
    num_atoms = len(atomicNumList)
    I_elementary = get_I_elementary(AC, num_atoms, atomicNumList)

    # De-duplicate candidates by SMILES; the set gives O(1) membership
    # while the lists preserve first-seen order.
    raw_smiles_list = []
    raw_molecules = []
    seen_raw = set()
    for I in I_elementary:
        newmol = xyz2mol.AC2mol(proto_mol, I, atomicNumList, charge,
                                heterolytic, quick)
        if parent_is_chiral:
            newmol = set_chirality(mol, newmol, atom2chirality)
        raw_smiles = Chem.MolToSmiles(newmol, isomericSmiles=True)
        if raw_smiles not in seen_raw:
            seen_raw.add(raw_smiles)
            raw_smiles_list.append(raw_smiles)
            raw_molecules.append(newmol)

    energy_of_reactant = get_BO_energy(mol)

    smiles_list = []
    molecules = []
    accepted = set()
    for smiles, raw_mol in zip(raw_smiles_list, raw_molecules):
        try:
            test_mol = Chem.MolFromSmiles(smiles)
        except Exception:
            # Narrowed from a bare ``except``.
            continue
        if test_mol is None:
            continue
        energy = get_BO_energy(raw_mol)
        if smiles not in accepted and energy_of_reactant - energy < E_cutoff:
            accepted.add(smiles)
            smiles_list.append(smiles)
            molecules.append(raw_mol)

    smiles_list.insert(0, Chem.MolToSmiles(mol, isomericSmiles=True))
    molecules.insert(0, mol)

    return smiles_list, molecules
def transferlearning_dataset_predict(x, path):
    """Featurize SMILES strings for prediction with the ``SMRT-`` dictionaries.

    The four pickled dictionaries ``<path>SMRT-{atom,bond,edge,fingerprint}
    _dict.pickle`` are merged into the module-level dictionaries of the same
    names; every ``x[i]`` that RDKit can parse is converted to tensors.
    Unparsable SMILES are skipped.

    Relies on the module-level names ``radius`` and ``device``.

    Parameters
    ----------
    x : indexable
        SMILES strings, accessed as ``x[i]``.
    path : str
        Prefix of the pickle files.

    Returns
    -------
    list
        ``(smiles, fingerprints, adjacency, molecular_size)`` tuples.
    """
    dir_input = path + 'SMRT-'

    # NOTE: pickle.load must only be used on trusted files.
    pretrained = (
        ('atom_dict.pickle', atom_dict),
        ('bond_dict.pickle', bond_dict),
        ('edge_dict.pickle', edge_dict),
        ('fingerprint_dict.pickle', fingerprint_dict),
    )
    for file_name, target in pretrained:
        with open(dir_input + file_name, 'rb') as f:
            loaded = pickle.load(f)
        for key in loaded.keys():
            target[key] = loaded[key]

    dataset = []
    for i in range(len(x)):
        smiles = x[i]

        # Create each data with the above defined functions.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        # Reuse the molecule parsed above instead of parsing the SMILES
        # a second time.
        mol = Chem.AddHs(mol)
        atoms = create_atoms(mol, atom_dict)
        molecular_size = len(atoms)
        i_jbond_dict = create_ijbonddict(mol, bond_dict)
        fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
                                            fingerprint_dict, edge_dict)
        adjacency = np.float32((Chem.GetAdjacencyMatrix(mol)))

        # Transform the above each data of numpy to pytorch tensor on a
        # device (i.e., CPU or GPU).
        fingerprints = torch.LongTensor(fingerprints).to(device)
        adjacency = torch.FloatTensor(adjacency).to(device)
        dataset.append((smiles, fingerprints, adjacency, molecular_size))
    return dataset
def CalculateBalaban(mol):
    """Compute the Balaban index J of ``mol``.

    ``J = Nbond / (mu + 1) * sum(1 / sqrt(S_i * S_j))`` over adjacent atom
    pairs, where ``S`` holds the row sums of the distance matrix and
    ``mu = Nbond - Natom + 1``. Returns 0 when ``mu + 1`` equals 0.
    """
    bonded = Chem.GetAdjacencyMatrix(mol)
    dist = Chem.GetDistanceMatrix(mol)
    bond_count = mol.GetNumBonds()
    atom_count = mol.GetNumAtoms()
    row_totals = numpy.sum(dist, axis=1)
    mu = bond_count - atom_count + 1

    accumulated = 0.
    n = len(dist)
    for a in range(n):
        total_a = row_totals[a]
        # Upper triangle only, so each bonded pair is counted once.
        for b in range(a, n):
            if bonded[a, b] == 1:
                accumulated += 1. / numpy.sqrt(total_a * row_totals[b])

    if mu + 1 == 0:
        return 0
    return float(bond_count) / float(mu + 1) * accumulated