def process(self):
    """Build the processed dataset from the raw files and save it.

    Reads two raw NumPy archives (``raw_paths[0]`` and ``raw_paths[1]``),
    converts each molecule with ``_get_ith_data`` followed by
    ``self.pre_transform``, collates the results and writes
    ``(data, slices)`` to ``processed_paths[0]``.

    When ``self.sep_heavy_atom`` is set, only molecules whose heavy-atom
    count equals ``self.num_heavy_atom`` are kept.  Counts below 14 are
    looked up in part 1 (CSV ``raw_paths[2]``), all others in part 2
    (CSV ``raw_paths[3]``); the count is read from the ``numberHA`` column.
    """

    def _feed_dict(raw):
        """Collect the per-molecule arrays of one raw part as tensors."""
        geometry = 'qm' if self.qm else 'mmff'
        return {
            'E': torch.as_tensor(raw['E']),
            'N': torch.as_tensor(raw['N']),
            'R': torch.as_tensor(raw['R_' + geometry]),
            'D': torch.as_tensor(raw['D_' + geometry]),
            'Q': torch.as_tensor(raw['Q']),
            'Z': torch.as_tensor(raw['Z']),
        }

    data1 = np.load(self.raw_paths[0])
    data2 = np.load(self.raw_paths[1])
    data1_feed_dict = _feed_dict(data1)
    data2_feed_dict = _feed_dict(data2)
    data1_size = data1['E'].shape[0]
    data2_size = data2['E'].shape[0]

    if not self.sep_heavy_atom:
        data_size = data1_size + data2_size
    else:
        in_part1 = self.num_heavy_atom < 14
        heavy_atom_data = pd.read_csv(
            self.raw_paths[2] if in_part1 else self.raw_paths[3])
        num_heavy_atom = torch.as_tensor(heavy_atom_data['numberHA']).long()
        atom_mask = (num_heavy_atom == self.num_heavy_atom).view(-1)

        data_dict_used = data1_feed_dict if in_part1 else data2_feed_dict
        for key in data_dict_used:
            data_dict_used[key] = data_dict_used[key][atom_mask]

        # Route every index through the "part 1" arm of the loop below so
        # that only the filtered dictionary is read.  data1_size must be
        # updated as well: leaving it at the unfiltered part-1 size sent
        # indices >= that size to the other arm with a shifted (wrong)
        # offset whenever the filtered part was larger than raw part 1.
        data_size = data_dict_used['E'].shape[0]
        data1_feed_dict = data_dict_used
        data1_size = data_size

    data_list = []
    for i in tqdm(range(data_size)):
        if i < data1_size:
            tmp_data = _get_ith_data(i, **data1_feed_dict)
        else:
            tmp_data = _get_ith_data(i - data1_size, **data2_feed_dict)
        tmp_data = self.pre_transform(
            tmp_data,
            edge_version='cutoff',
            do_sort_edge=True,
            cal_efg=False,
            cutoff=self.cutoff,
            boundary_factor=None,
            use_center=None,
            mol=AddHs(MolFromSmiles('C')),
            cal_3body_term=self.cal_3body_term,
            bond_atom_sep=self.bond_atom_sep,
            record_long_range=self.record_long_range)
        data_list.append(tmp_data)

    print('collating...')
    data1, slices = self.collate(data_list)
    print('saving...')
    torch.save((data1, slices), self.processed_paths[0])
def get_mol_objects(SMILES):
    """Convert SMILES input into RDKit molecule object(s).

    Args:
        SMILES: either a single SMILES string or a list of SMILES strings.

    Returns:
        A list with one ``MolFromSmiles`` result per entry when a list is
        given, a single ``MolFromSmiles`` result when a string is given,
        and ``None`` for any other input type.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of list / str.
    if isinstance(SMILES, list):
        return [MolFromSmiles(smi) for smi in SMILES]
    if isinstance(SMILES, str):
        return MolFromSmiles(SMILES)
    return None
def structure_standardization(smi: str) -> str:
    """Clean up and canonicalise a SMILES string with RDKit.

    Processing steps:

    1. Parse ``smi``; an unreadable SMILES is logged ("Reading Error").
    2. Reject molecules whose ``GetNumAtoms()`` exceeds the configured
       ``max_num_atoms`` ("Molecule too large").
    3. Take the charge parent, reset all isotope labels, optionally strip
       stereochemistry (when ``include_stereoinfo`` is ``False``),
       canonicalise the tautomer and run the standardizer pipeline
       (sanitize, remove Hs, disconnect metals, normalize, reionize,
       reassign stereochemistry).  A ``ValueError`` or ``AttributeError``
       raised in this step is logged ("Standardization error").
    4. Convert the cleaned molecule back to SMILES.

    Settings are read from the ``"standardization"`` section of
    ``ConfigDict.get_parameters()``.

    Args:
        smi (str): Non-standardized SMILES string.

    Returns:
        str: Standardized SMILES string, or ``np.nan`` when any of the
        steps above fails (the failure is written to the log).
    """
    settings = ConfigDict.get_parameters()["standardization"]
    max_num_atoms = settings["max_num_atoms"]
    max_num_tautomers = settings["max_num_tautomers"]
    include_stereoinfo = settings["include_stereoinfo"]

    # Tautomer enumerator / canonicalizer
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(max_num_tautomers)
    # Keep stereo information of keto/enol tautomerization
    tautomerizer.SetRemoveSp3Stereo(False)

    def _strip_isotopes(mol: Chem.Mol) -> Chem.Mol:
        """Return a copy of ``mol`` with every atom's isotope label reset to 0."""
        parent = copy.deepcopy(mol)
        for atom in parent.GetAtoms():
            atom.SetIsotope(0)
        return parent

    def _run_standardizer(mol: Chem.Mol) -> Chem.Mol:
        """Return a standardized copy of ``mol`` (the input is not modified)."""
        work = copy.deepcopy(mol)
        Chem.SanitizeMol(work)
        work = Chem.RemoveHs(work)
        work = rdMolStandardize.MetalDisconnector().Disconnect(work)
        work = rdMolStandardize.Normalizer().normalize(work)
        work = rdMolStandardize.Reionizer().reionize(work)
        # TODO: Check this removes symmetric stereocenters
        Chem.AssignStereochemistry(work, force=True, cleanIt=True)
        return work

    mol = MolFromSmiles(smi)
    if mol is None:
        logging.error("Reading Error, " + smi)
        return np.nan

    if mol.GetNumAtoms() > max_num_atoms:
        logging.error("Molecule too large, " + smi)
        return np.nan

    try:
        mol = rdMolStandardize.ChargeParent(mol)
        mol = _strip_isotopes(mol)
        if include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        mol = tautomerizer.Canonicalize(mol)
        smi_clean = MolToSmiles(_run_standardizer(mol))
    except (ValueError, AttributeError) as e:
        smi_clean = np.nan
        # write failed molecules during standardization to log file
        logging.error(
            "Standardization error, " + smi + ", Error Type: " + str(e))
    return smi_clean
xmin = next_x(lb, ub, 5, 60) valid_smiles = [] scores = [] for x_new in xmin: #model = DGLJTNNVAE(vocab, hidden_size, latent_size, depth) #model.load_state_dict(torch.load(opts.model_path)) #model = cuda(model) tree_vec, mol_vec = x_new.chunk(2, 1) print(x_new.shape, tree_vec.shape, mol_vec.shape) print(x_new) s = model.decode(tree_vec, mol_vec) if s is not None: valid_smiles.append(s) current_log_P_value = Descriptors.MolLogP(MolFromSmiles(s)) current_SA_score = -sascorer.calculateScore(MolFromSmiles(s)) cycle_list = nx.cycle_basis( nx.Graph(rdmolops.GetAdjacencyMatrix(MolFromSmiles(s)))) if len(cycle_list) == 0: cycle_length = 0 else: cycle_length = max([len(j) for j in cycle_list]) if cycle_length <= 6: cycle_length = 0 else: cycle_length = cycle_length - 6 current_cycle_score = -cycle_length current_SA_score_normalized = (
import csv
from rdkit.Chem import MolFromSmiles, Draw

# Read (label, SMILES) rows from smiles.csv and render all molecules into a
# single grid image, two molecules per row, each labelled with its name.
labels = []
smiles = []  # holds the parsed molecule objects, one per CSV row

# newline="" is what the csv module expects of files it reads, so that
# newlines embedded in quoted fields are interpreted correctly.
with open("smiles.csv", "r", newline="") as f:
    # Each row must contain exactly two fields; anything else raises
    # ValueError during unpacking.
    for name, smile in csv.reader(f):
        labels.append(name)
        smiles.append(MolFromSmiles(smile))

img = Draw.MolsToGridImage(smiles, molsPerRow=2, subImgSize=(300, 300),
                           legends=labels)
img.save("happiness.png")
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples read from ``EMOL_PATH``.

    The input is processed in chunks of 10000 rows.  After every chunk the
    mismatches are appended to ``error_sets/errors_emolecules.csv`` and the
    chunk index is written to ``checkpoints/emolecule_ckpt.txt`` so that an
    interrupted run resumes after the last completed chunk.  The checkpoint
    is removed once the whole file has been processed.

    The relaxed semantic constraints are global SELFIES state, so they are
    restored in a ``finally`` block even when the run is interrupted by an
    exception.
    """

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    # modify constraints
    constraints = sf.get_hypervalent_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    try:
        # check if a previous checkpoint exists to continue tests
        if os.path.exists(ckpt_path):
            with open(ckpt_path, 'r') as ckpt_file:
                checkpoint = int(ckpt_file.readlines()[0])

        # if no path to a checkpoint exists,
        # create a new directory for error logging and checkpoints
        else:
            os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
            os.makedirs(os.path.dirname(error_path), exist_ok=True)

            with open(error_path, "w+") as error_log:
                error_log.write("In, Out\n")
            checkpoint = -1

        error_list = []
        error_found_flag = False

        # make pandas reader
        reader = pd.read_csv(EMOL_PATH,
                             chunksize=10000,
                             compression='gzip',
                             delimiter=' ',
                             header=0)

        # roundtrip testing
        for chunk_idx, chunk in enumerate(reader):

            # chunks up to and including the checkpoint were already tested
            if chunk_idx <= checkpoint:
                continue

            for in_smiles in chunk[COL_NAME]:

                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode selfies
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same molecule, write to list of errors.
                if not is_same_mol(in_smiles, out_smiles):
                    error_list.append((in_smiles, out_smiles))

            # open and write all errors to errors_emolecule.csv
            with open(error_path, "a") as error_log:
                for error in error_list:
                    error_log.write(','.join(error) + "\n")
            error_found_flag = error_found_flag or bool(error_list)
            error_list = []

            # create checkpoint from the current pandas reader chunk,
            # to load from and continue testing.
            with open(ckpt_path, 'w+') as ckpt_file:
                ckpt_file.write(str(chunk_idx))
    finally:
        sf.set_semantic_constraints()  # restore defaults

    # remove checkpoint; it does not exist if no chunk was ever completed
    if os.path.exists(ckpt_path):
        os.remove(ckpt_path)

    assert not error_found_flag
def scorer(smiles, pIC50_weight, QED_weight, logP_weight, SA_weight,
           cycle_weight, sim_weight):
    """Compute per-molecule property scores and a weighted target value.

    Every SMILES is first canonicalised (isomeric) with RDKit.  The
    following raw scores are then gathered, z-score normalised across the
    batch, and combined linearly with the given weights:

    * IC50 values, read from ``../data/covid/ic50-fulltrain.txt`` (one per
      molecule; the file must therefore have as many values as ``smiles``),
    * logP (``Descriptors.MolLogP``),
    * negated synthetic-accessibility score (``sascorer.calculateScore``),
    * QED (``QED.qed``),
    * negated large-ring penalty (largest ring size beyond 6, else 0),
    * similarity to the reference set in
      ``MPro_6wqf_A_ProteaseData_smiles_top100.csv`` via
      ``similarity_search``; only computed when ``sim_weight != 0``,
      otherwise all zeros.

    A property that is constant over the batch has zero standard
    deviation; its normalised values are defined as 0 here instead of the
    NaN that a plain division would produce (which would turn every target
    into NaN, even with a zero weight).

    Returns:
        tuple: ``(IC50_scores, qed_values, logP_values, SA_scores,
        cycle_scores, sim_values, targets)`` where all but ``targets`` are
        lists of raw scores and ``targets`` is a NumPy array.
    """

    def _zscore(values):
        """Standardise ``values``; all zeros when they have no spread."""
        arr = np.array(values)
        spread = np.std(arr)
        if spread == 0:
            return np.zeros(arr.shape)
        return (arr - np.mean(arr)) / spread

    def _cycle_score(mol):
        """Return minus the number of atoms by which the largest ring exceeds 6."""
        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest = max((len(ring) for ring in rings), default=0)
        return -max(largest - 6, 0)

    smiles_rdkit = [
        MolToSmiles(MolFromSmiles(smi), isomericSmiles=True)
        for smi in smiles
    ]

    # calculate IC50 of training set using MPNN
    # IC50_scores=calculateScore(smiles_rdkit)

    # read in IC50 of training set from database
    IC50_scores = list(np.loadtxt('../data/covid/ic50-fulltrain.txt'))
    IC50_scores_normalized = _zscore(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match similarity
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

        sim_values = [similarity_search(fps_db, smi) for smi in smiles_rdkit]
        sim_values_normalized = _zscore(sim_values)
    else:
        sim_values = [0] * len(smiles)
        sim_values_normalized = np.array(sim_values)

    # Parse each canonical SMILES once and reuse the molecule for all
    # descriptor calculations.
    mols = [MolFromSmiles(smi) for smi in smiles_rdkit]

    logP_values = [Descriptors.MolLogP(mol) for mol in mols]
    qed_values = [QED.qed(mol) for mol in mols]
    SA_scores = [-sascorer.calculateScore(mol) for mol in mols]
    cycle_scores = [_cycle_score(mol) for mol in mols]

    SA_scores_normalized = _zscore(SA_scores)
    qed_values_normalized = _zscore(qed_values)
    cycle_scores_normalized = _zscore(cycle_scores)
    logP_values_normalized = _zscore(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized +
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized +
               sim_weight * sim_values_normalized)

    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores,
            sim_values, targets)
print('This mol fails! ' + MolToSmiles(mol)) continue products = rxn.RunReactants((Chem.AddHs(mol), )) if products != (): for prod in products: prod1_list.append(prod[0]) prod2_list.append(prod[1]) return prod1_list, prod2_list df_orig = pd.read_csv('../data/acry_activity.smi') # df_actives = df_orig[df_orig['activity']==1] # print('Number of acry actives: {}'.format(len(df_actives))) smiles_list = df_orig['SMILES'].values smiles_list = list( set([MolToSmiles(MolFromSmiles(smi)) for smi in smiles_list])) # print(smiles_list) # print([MolFromSmiles(smi) for smi in smiles_list]) # mol_list = [MolFromSmiles(MolToSmiles(MolFromSmiles(smi))) for smi in smiles_list] # mol_list = [mol for mol in mol_list if mol] # print(mol_list) # print(len(list(set([MolToSmiles(mol) for mol in mol_list])))) mols = [MolFromSmiles(smi) for smi in smiles_list] #print('Size of actives: {}'.format(len(canonicalize(mols)))) print('Size of original dataset: {}'.format(len(canonicalize(mols)))) acry_slice = AllChem.ReactionFromSmarts( '[c,C:1][C](=[O])[N]([c,C,#1:2])[C]([c,C,#1:3])([c,C,#1:4])[C](=[O])[N]([#1])[c,C:5]>>[*:1][C](=[O])[O][#1].[*:2][N]([#1])[#1].[*:3][C](=[O])[*:4].[*:5][N+]#[C-]' ) acry_comb = AllChem.ReactionFromSmarts( '[c,C:1][C](=[O])[O][#1].[c,C:2][N]([#1])[#1].[c,C,#1:3][C](=[O])[c,C,#1:4].[c,C:5][N+]#[C-]>>[*:1][C](=[O])[N]([*:2])[C]([*:3])([*:4])[C](=[O])[N]([#1])[*:5]' )
def canonicalize_smiles(smiles, isomeric=True, sanitize=True):
    """Return the RDKit-canonical form of ``smiles``, or ``None`` on failure.

    Args:
        smiles: SMILES string to canonicalise.
        isomeric: forwarded to ``MolToSmiles`` as ``isomericSmiles``.
        sanitize: forwarded to ``MolFromSmiles`` as ``sanitize``.

    Returns:
        The canonical SMILES string.  Any exception raised while parsing or
        writing is swallowed and ``None`` is returned instead (best-effort
        conversion).
    """
    canonical = None
    try:
        parsed = MolFromSmiles(smiles, sanitize=sanitize)
        canonical = MolToSmiles(parsed, isomericSmiles=isomeric)
    except Exception:
        # Best-effort by design: an unconvertible input yields None.
        canonical = None
    return canonical
def test_topological_fprint_min_path_lesser_than_atoms(self):
    """Check ``min_path`` validation of the topological fingerprint.

    Molecules with one, two and three heavy atoms are fingerprinted with
    ``min_path`` set to 1, 2 and 3.  The expectation encoded here is that
    ``InvalidConfigurationError`` is raised exactly when the molecule has
    no more atoms than ``min_path``.
    """
    def parse_all(smiles_batch):
        return [MolFromSmiles(smiles) for smiles in smiles_batch]

    atomic_mols = parse_all(['C', 'O', 'N', 'P'])
    diatomic_mols = parse_all(['CC', 'CO', 'CN', 'CP'])
    triatomic_mols = parse_all(['CCC', 'COO', 'CCN', 'CCP'])

    # Groups in increasing atom count: index 0 -> 1 atom, 1 -> 2, 2 -> 3.
    mol_groups = (atomic_mols, diatomic_mols, triatomic_mols)

    for min_path in (1, 2, 3):
        for group_index, group in enumerate(mol_groups):
            must_raise = (group_index + 1) <= min_path
            for mol in group:
                if must_raise:
                    with self.assertRaises(InvalidConfigurationError):
                        descriptor = Descriptor()
                        descriptor.make_fingerprint(
                            molecule_graph=mol,
                            fingerprint_type='topological_fingerprint',
                            fingerprint_params={'min_path': min_path})
                    continue

                descriptor = Descriptor()
                try:
                    descriptor.make_fingerprint(
                        molecule_graph=mol,
                        fingerprint_type='topological_fingerprint',
                        fingerprint_params={'min_path': min_path})
                except InvalidConfigurationError:
                    self.fail("Did not expect Descriptor to raise "
                              "InvalidConfigurationError")
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    Args:
        test_name: base name of the data set; the input is read from
            ``test_sets/<test_name>.txt`` and mismatches are written to
            ``error_sets/errors_<test_name>.csv`` (both next to this file).
        column_name: name of the column holding the SMILES strings.
        dataset_samples: number of rows to sample at random; when it is not
            in ``1..N`` (N = number of data rows) the full set is used.

    The relaxed semantic constraints are global SELFIES state, so they are
    restored in a ``finally`` block even if the run raises.
    """

    # modify semantic bond constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    sf.set_semantic_constraints(constraints)

    try:
        # file I/O
        curr_dir = os.path.dirname(__file__)
        test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
        error_path = os.path.join(curr_dir,
                                  'error_sets',
                                  "errors_{}.csv".format(test_name))

        # create error directory
        os.makedirs(os.path.dirname(error_path), exist_ok=True)
        error_list = []

        # add header in error log text file
        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        error_found_flag = False

        # make pandas reader; count data rows (header excluded) with a
        # context manager so the handle is closed again
        with open(test_path) as test_file:
            N = sum(1 for _ in test_file) - 1
        S = dataset_samples if (0 < dataset_samples <= N) else N
        # rows (1-based, header is row 0) to leave out of the sample
        skip = sorted(random.sample(range(1, N + 1), N - S))
        reader = pd.read_csv(test_path,
                             chunksize=10000,
                             header=0,
                             skiprows=skip)

        # roundtrip testing
        for chunk in reader:
            for in_smiles in chunk[column_name]:

                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode SELFIE string
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same molecule, write to list of errors.
                if not is_same_mol(in_smiles, out_smiles):
                    error_list.append((in_smiles, out_smiles))

            # open and write all errors to errors_{test_name}.csv
            with open(error_path, "a") as error_log:
                for error in error_list:
                    error_log.write(','.join(error) + "\n")
            error_found_flag = error_found_flag or bool(error_list)
            error_list = []
    finally:
        sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag
def test4MolToInchiKey(self):
    """The InChIKey derived from a molecule's InChI string must equal the
    key computed directly from the molecule."""
    mol = MolFromSmiles("CC=C(N)C")
    key_from_inchi = InchiToInchiKey(MolToInchi(mol))
    key_from_mol = MolToInchiKey(mol)
    self.assertEqual(key_from_inchi, key_from_mol)
def test2InchiOptions(self):
    """Passing the "/SUU" option must append a '/b4-3?' layer to the InChI
    of the test molecule (compared after the leading version prefix)."""
    mol = MolFromSmiles("CC=C(N)C")

    def layers(inchi):
        # drop everything up to and including the first '/'
        return inchi.split('/', 1)[1]

    default_layers = layers(MolToInchi(mol))
    option_layers = layers(MolToInchi(mol, "/SUU"))
    self.assertEqual(default_layers + '/b4-3?', option_layers)
def test1InchiReadPubChem(self):
    """Round-trip every molecule of each data set through InChI.

    For each molecule ``m`` the InChI ``x`` is written, read back with
    ``MolFromInchi``, passed through SMILES, and written to InChI again
    (``y``).  Each molecule is then counted as

    * ``same``       -- ``x == y``,
    * ``reasonable`` -- the round trip differs or fails, but one of the
      tolerated explanations checked below applies,
    * ``diff``       -- an unexplained failure or mismatch.

    The three counters are compared against fixed expected values.
    """
    for f in self.dataset.values():
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # silence RDKit error logging while reading the InChI back
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                # (compare the atomic-number-weighted radical electron
                # count of the original and the unsanitized re-read mol)
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue
                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                      cid + '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): on Python 3 ``filter()`` returns an iterator
                # object, which is always truthy, so this branch is taken
                # for every molecule that reaches it regardless of ring
                # size and the checks below never run.
                # ``any(len(r) >= 8 for r in ...)`` would express the stated
                # intent, but the expected counts asserted at the end may
                # then need to be re-derived -- confirm before changing.
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)
                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue
                diff += 1
                # NOTE(review): ``cid`` is only assigned in the
                # ``y is None`` branch above; here it is either left over
                # from an earlier molecule or undefined (NameError).
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()
            else:
                same += 1
        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 684)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 497)
def test_kekulize():
    """Filters.kekulize on aromatic benzene must yield the Kekule SMILES."""
    benzene = MolFromSmiles('c1ccccc1')
    kekulized = Filters.kekulize(benzene)
    assert MolToSmiles(kekulized) == 'C1=CC=CC=C1'
def construct_feature_matrices(self, smiles):
    """ construct a molecule from the given smiles string and return atom
    and bond classes.

    Every bond is stored twice, once per direction, so that each entry of
    ``connectivity`` starts at the atom currently being visited.  A
    molecule without any bond (isolated atom) gets a single placeholder
    entry: bond class 0 and connectivity ``[0, 0]`` (a self-link).

    Returns
    dict with entries
    'n_atom'        : number of atoms in the molecule
    'n_bond'        : number of directed bonds in the molecule
                      (2 * number of bonds, or 1 for an isolated atom)
    'atom'          : (n_atom,) length list of atom classes
    'bond'          : (n_bond,) list of bond classes
    'connectivity'  : (n_bond, 2) array of source atom, target atom pairs.

    """

    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = len(mol.GetAtoms())
    n_bond = 2 * len(mol.GetBonds())

    # If its an isolated atom, add a self-link
    if n_bond == 0:
        n_bond = 1

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_bond, dtype='int')
    connectivity = np.zeros((n_bond, 2), dtype='int')

    bond_index = 0

    for n, atom in enumerate(mol.GetAtoms()):

        # Atom Classes
        atom_feature_matrix[n] = self.atom_tokenizer(
            self.atom_features(atom))

        start_index = atom.GetIdx()

        for bond in atom.GetBonds():
            # Is the bond pointing at the target atom
            rev = bond.GetBeginAtomIdx() != start_index

            # Bond Classes -- indexed by the running directed-bond counter
            # so that row i of `bond` describes row i of `connectivity`.
            # (Indexing by the atom counter `n` overwrote entries and left
            # most of the array at zero.)
            bond_feature_matrix[bond_index] = self.bond_tokenizer(
                self.bond_features(bond, flipped=rev))

            # Connectivity
            if not rev:  # Original direction
                connectivity[bond_index, 0] = bond.GetBeginAtomIdx()
                connectivity[bond_index, 1] = bond.GetEndAtomIdx()

            else:  # Reversed
                connectivity[bond_index, 0] = bond.GetEndAtomIdx()
                connectivity[bond_index, 1] = bond.GetBeginAtomIdx()

            bond_index += 1

    return {
        'n_atom': n_atom,
        'n_bond': n_bond,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
def smiles_reader(smiles, **kwargs):
    """Parse ``smiles`` with ``MolFromSmiles``.

    All keyword arguments are forwarded unchanged; ``sanitize`` defaults
    to ``True`` when the caller does not supply it.
    """
    options = {'sanitize': True}
    options.update(kwargs)  # caller-supplied values take precedence
    return MolFromSmiles(smiles, **options)
batch_size = 32 # hidden_size = int(args.hidden_size) # latent_size = int(args.latent_size) # depth = int(opts.depth) model = JTNNVAE(vocab, args.hidden_size, args.latent_size, args.depthT, args.depthG, args.num_layers, args.use_graph_conv) model.load_state_dict(torch.load(args.model)) model = model.cuda() smiles_rdkit = [] for i in range(len(smiles)): print(i, 'smiles') smiles_rdkit.append( MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True)) logP_values = [] for i in range(len(smiles)): print(i, 'logP_values') logP_values.append(Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i]))) SA_scores = [] for i in range(len(smiles)): print(i, 'SA_scores') SA_scores.append(-sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i]))) import networkx as nx cycle_scores = [] for i in range(len(smiles)):
def from_smiles(cls, smiles):
    """Alternate constructor: parse ``smiles`` with ``MolFromSmiles`` and
    delegate to ``cls.from_mol`` with the result."""
    return cls.from_mol(MolFromSmiles(smiles))
smiles_list, y = parse_dataset(task, PATHS[task]) #NEED TO FIX MALARIA dat_size = len(smiles_list) mpi_comm = MPI.COMM_WORLD mpi_rank = mpi_comm.Get_rank() mpi_size = mpi_comm.Get_size() my_border_low, my_border_high = return_borders(mpi_rank, dat_size, mpi_size) my_list = smiles_list[my_border_low:my_border_high] bit_list = [2048] for bits in bit_list: my_mols = [MolFromSmiles(smiles) for smiles in my_list] X = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=bits) for mol in my_mols] X = np.asarray(X) dbs = mpi_comm.gather(X, root=0) if mpi_rank==0: for db in dbs[1:]: X = np.vstack([X, db]) #print(X) #np.save('data/'+task+'/'+task+'_ecfp_'+str(bits)+'.npy',X) print('Number of bits: {}'.format(bits)) kernel = 1-tanimoto(X,X) print(kernel) print(kernel.shape)
#!/usr/bin/python2 # Little harness for timing how long it takes to embed a molecule # which seems extremely variable on one machine, from __future__ import print_function, division import sys, time, os from rdkit.Chem import MolFromSmiles, AddHs, RemoveHs from rdkit.Chem.AllChem import EmbedMolecule if __name__ == "__main__": dotimestamp = int(os.getenv('MOLEMBED_TIME', '0')) doaddh = int(os.getenv('MOLEMBED_ADDH', '0')) rseed = int(os.getenv('MOLEMBED_SEED', '0')) t0 = time.time() for line in sys.stdin.readlines(): s = line.strip() if dotimestamp: t1 = time.time() dt = (t1 - t0) * 1e3 print('%.3f' % dt, s) t0 = t1 else: print(s) m = MolFromSmiles(s) if doaddh: m2 = AddHs(m) else: m2 = m EmbedMolecule(m2, randomSeed=rseed)
def checksmi(smiles):
    """Return the result of ``MolFromSmiles`` for ``smiles`` after coercing
    it to ``str`` (so non-string inputs are parsed by their text form)."""
    as_text = str(smiles)
    return MolFromSmiles(as_text)
if __name__ == '__main__':
    # Train a random forest on 512-bit Morgan fingerprints (radius 2) for
    # the selected task and predict on the held-out split.

    # Every supported task is loaded through the same call.
    if TASK not in ('e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'):
        raise Exception('Must specify a valid task')
    X_train, X_test, y_train, y_test, dft_vals = dft_train_test_split(PATH, TASK)

    # SMILES -> molecules -> fingerprint bit vectors -> numpy array
    rdkit_train_mols = [MolFromSmiles(smiles) for smiles in X_train]
    X_train = np.asarray([
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512)
        for mol in rdkit_train_mols
    ])

    rdkit_test_mols = [MolFromSmiles(smiles) for smiles in X_test]
    X_test = np.asarray([
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512)
        for mol in rdkit_test_mols
    ])

    X_train, y_train, X_test, y_test, y_scaler = transform_data(
        X_train, y_train, X_test, y_test)

    regr_rf = RandomForestRegressor(n_estimators=100,
                                    max_depth=30,
                                    random_state=2)
    regr_rf.fit(X_train, y_train)

    # Predict on new data and map the predictions back through y_scaler
    y_rf = regr_rf.predict(X_test)
    y_rf = y_scaler.inverse_transform(y_rf)
def test_remove_isotope():
    """Filters.remove_isotope must drop the 14C label from labelled benzene."""
    labelled = MolFromSmiles('c1cc[14cH]cc1')
    stripped = Filters.remove_isotope(labelled)
    assert MolToSmiles(stripped) == 'c1ccccc1'
def gen_latent_demo(data_path):
    """Encode the molecules in ``data_path`` and write demo feature files.

    ``data_path`` is a text file with one SMILES per line.  For every
    molecule the function computes logP, the negated synthetic
    accessibility score and the negated large-ring penalty (largest ring
    size beyond 6, else 0), and encodes the SMILES with a pre-trained
    JTNNVAE model.  The following files are written to the current
    directory:

    * ``latent_features_demo.txt`` -- latent mean vectors, one row each
    * ``targets_demo.txt``         -- sum of the three z-scored properties
    * ``logP_values_demo.txt``, ``SA_scores_demo.txt``,
      ``cycle_scores_demo.txt``    -- the raw property values

    A property that is constant over the whole file has zero standard
    deviation; its z-scores are defined as 0 here rather than NaN.

    The function body runs on both Python 2 and Python 3: ``range`` is
    used instead of ``xrange`` and the ``jtnn`` names are imported
    explicitly, since a wildcard import is only allowed at module level.
    """
    import sys
    sys.path.append('/home/icml18-jtnn')

    import torch

    import rdkit
    from rdkit.Chem import Descriptors
    from rdkit.Chem import MolFromSmiles, MolToSmiles
    from rdkit.Chem import rdmolops

    import sascorer
    import networkx as nx
    import numpy as np
    from jtnn import Vocab, JTNNVAE

    def _zscore(values):
        """Standardise ``values``; all zeros when they have no spread."""
        arr = np.array(values)
        spread = np.std(arr)
        if spread == 0:
            return np.zeros(arr.shape)
        return (arr - np.mean(arr)) / spread

    def _cycle_score(mol):
        """Return minus the number of atoms by which the largest ring exceeds 6."""
        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest = max([len(ring) for ring in rings]) if rings else 0
        return -max(largest - 6, 0)

    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    vocab_path = '../data/vocab.txt'

    with open(data_path) as f:
        smiles = [line.strip() for line in f]

    with open(vocab_path) as vocab_file:
        vocab = Vocab([x.strip("\r\n ") for x in vocab_file])

    batch_size = 1
    hidden_size = 450
    latent_size = 56
    depth = 3

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    # map_location keeps all tensors on the CPU regardless of where the
    # checkpoint was saved
    model.load_state_dict(
        torch.load('../molvae/MPNVAE-h450-L56-d3-beta0.005/model.iter-4',
                   map_location=lambda storage, loc: storage))

    smiles_rdkit = [
        MolToSmiles(MolFromSmiles(smi), isomericSmiles=True)
        for smi in smiles
    ]

    # Parse each canonical SMILES once and reuse it for all properties.
    mols = [MolFromSmiles(smi) for smi in smiles_rdkit]

    logP_values = [Descriptors.MolLogP(mol) for mol in mols]
    SA_scores = [-sascorer.calculateScore(mol) for mol in mols]
    cycle_scores = [_cycle_score(mol) for mol in mols]

    SA_scores_normalized = _zscore(SA_scores)
    logP_values_normalized = _zscore(logP_values)
    cycle_scores_normalized = _zscore(cycle_scores)

    latent_points = []
    for i in range(0, len(smiles), batch_size):
        batch = smiles[i:i + batch_size]
        mol_vec = model.encode_latent_mean(batch)
        latent_points.append(mol_vec.data.cpu().numpy())

    # We store the results
    latent_points = np.vstack(latent_points)
    np.savetxt('latent_features_demo.txt', latent_points)

    targets = (SA_scores_normalized + logP_values_normalized +
               cycle_scores_normalized)
    np.savetxt('targets_demo.txt', targets)
    np.savetxt('logP_values_demo.txt', np.array(logP_values))
    np.savetxt('SA_scores_demo.txt', np.array(SA_scores))
    np.savetxt('cycle_scores_demo.txt', np.array(cycle_scores))
def test_neutralise_charge():
    """Filters.neutralise_charge must protonate the carboxylate anion."""
    anion = MolFromSmiles('CC(C(=O)[O-])O')
    neutral = Filters.neutralise_charge(anion)
    assert MolToSmiles(neutral) == 'CC(O)C(=O)O'
print(len(valid_smiles), " molecules are found") valid_smiles = valid_smiles[:50] new_features = next_inputs[:50] new_features = np.vstack(new_features) save_object(valid_smiles, args.save_dir + "/valid_smiles{}.dat".format(iteration)) import sascorer import networkx as nx from rdkit.Chem import rdmolops scores = [] for i in range(len(valid_smiles)): print(i, 'calculating scores') current_log_P_value = Descriptors.MolLogP( MolFromSmiles(valid_smiles[i])) current_SA_score = -sascorer.calculateScore( MolFromSmiles(valid_smiles[i])) cycle_list = nx.cycle_basis( nx.Graph( rdmolops.GetAdjacencyMatrix(MolFromSmiles(valid_smiles[i])))) if len(cycle_list) == 0: cycle_length = 0 else: cycle_length = max([len(j) for j in cycle_list]) if cycle_length <= 6: cycle_length = 0 else: cycle_length = cycle_length - 6 current_cycle_score = -cycle_length
def test_add_hydrogen():
    """Filters.add_hydrogen must make all hydrogens explicit, for both the
    neutral acid and its carboxylate anion."""
    expected_by_input = (
        ('CC(O)C(=O)O', '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'),
        ('CC(C(=O)[O-])O', '[H]OC([H])(C(=O)[O-])C([H])([H])[H]'),
    )
    for source_smiles, expected_smiles in expected_by_input:
        with_hs = Filters.add_hydrogen(MolFromSmiles(source_smiles))
        assert MolToSmiles(with_hs) == expected_smiles
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
        Ignored (with a warning) when `input` is also given.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If set and `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
          or with `opsin_output_format` here.
        | Default value: ["smiles"]

        =====================  =====================  ==========================================
        Value                  Source                 Note
        =====================  =====================  ==========================================
        smiles                 RDKit                  canonical
        smiles_opsin           OPSIN ("smi")          SMILES
        smiles_extended_opsin  OPSIN ("extendedsmi")  Extended SMILES. Not supported by RDKit.
        inchi                  RDKit                  Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        inchi_opsin            OPSIN ("inchi")        InChI
        stdinchi_opsin         OPSIN ("stdinchi")     standard InChI
        inchikey               RDKit                  The same applies as for "inchi". Also molecule cannot be created from InChI-key.
        stdinchikey_opsin      OPSIN ("stdinchikey")  Standard InChI-key. Cannot be used by RDKit to create molecule.
        sdf                    RDKit                  If present, an additional SDF file will be created.
        =====================  =====================  ==========================================
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    dry_run : bool
        If True, only return the command that would be called by subprocess; OPSIN is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can set your own regex pattern with
          `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

    str
        When `dry_run` is True: the command, joined with spaces.

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or if the OPSIN output format is unknown.
    UserWarning
        If the input is empty after plural normalization.
    """
    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]
    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }
    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    else:
        if opsin_output_format == "stdinchikey":
            output_formats = ["stdinchikey_opsin"]
        elif opsin_output_format == "extendedsmi":
            output_formats = ["smiles_extended_opsin"]
        else:
            output_formats = sorted(set(output_formats))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            # Keep RDKit formats plus the single OPSIN-native format in use.
            output_formats = [
                x for x in output_formats
                if x in possible_output_formats
                or x == opsin_valid_output_formats[opsin_output_format]
            ]

    if normalize_plurals:
        if input_file:
            # Normalization works on text, so the file is read here and
            # its content is fed to OPSIN through stdin instead.
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # The file path (not the empty `input`) is the positional argument.
        commands.append(input_file)
    elif not input:
        raise UserWarning("Input is empty.")

    # A dry run must return before anything is executed.
    if dry_run:
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    writer = None
    sdf_handle = None
    if output_file_sdf:
        if sdf_append:
            # Mode "a" creates the file when it does not exist yet.
            sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
            writer = SDWriter(sdf_handle)
        else:
            writer = SDWriter(output_file_sdf)

    try:
        stdout = stdout.split("\n")
        del stdout[-1]
        # remove first line of stderr because there is OPSIN message
        stderr = [x.strip() for x in stderr.split("\n")[1:] if x]

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = f.readlines()
        else:
            lines = input.split("\n")

        mol_output_template = OrderedDict.fromkeys(["iupac"] +
                                                   output_formats +
                                                   ["error"])

        # Index of the next unread error message in `stderr`. Kept under its
        # own name: `except ... as <name>` unbinds <name> afterwards, so it
        # must not share a name with the caught exception.
        error_index = 0
        for i, line in enumerate(lines):
            line = line.strip()
            # Input may have more lines than OPSIN printed (e.g. a trailing
            # newline); treat the missing output as a failed conversion.
            converted = stdout[i].strip() if i < len(stdout) else ""
            mol_output = mol_output_template.copy()

            if converted:
                # Formats RDKit cannot read are passed through untouched.
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                # Reset per line so an unreadable format (e.g. "cml") cannot
                # reuse the molecule of the previous line.
                mol = None
                if opsin_output_format == "smi":
                    mol = MolFromSmiles(converted,
                                        sanitize=not standardize_mols)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(converted,
                                       sanitize=not standardize_mols,
                                       removeHs=not standardize_mols)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as err:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(err)))

                    for fmt in output_formats:
                        if fmt == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif fmt == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif fmt == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif fmt == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                            if output_file_sdf:
                                writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    # Log this row's error; it is appended to `compounds`
                    # only at the end of the iteration.
                    self.logger.warning(mol_output["error"])
            else:
                # No output for this name: pair it with the next OPSIN error.
                try:
                    error = stderr[error_index].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                error_index += 1
            compounds.append(mol_output)
    finally:
        # Flush and release the SDF output even if a conversion raised.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return
reg_scores = [] # collect scores for objective function logP_scores = [] # collect scores for logP term in objective function SA_values = [ ] # collect scores for synthetic accessibility term in objective function # 30 September - CAREFUL about variable names!!! This is the cause of the nans. This conflicts with the variable on line 340 and causes nans in the program at runtime. # 2 October - changed to SA_values - will have to change the variable names to be consistents between values and scores. for i in range(len(valid_smiles_final)): to_add = [] logP = [] SA = [] if len(valid_smiles_final[i]) != 0: for j in range(0, len(valid_smiles_final[i])): current_log_P_value = Descriptors.MolLogP( MolFromSmiles(valid_smiles_final[i][j])) current_SA_score = -sascorer.calculateScore( MolFromSmiles(valid_smiles_final[i][j])) cycle_list = nx.cycle_basis( nx.Graph( rdmolops.GetAdjacencyMatrix( MolFromSmiles(valid_smiles_final[i][j])))) if len(cycle_list) == 0: cycle_length = 0 else: cycle_length = max([len(j) for j in cycle_list]) if cycle_length <= 6: cycle_length = 0 else: cycle_length = cycle_length - 6