def test_SmartsMolFilter(self):
    """Check SmartsFilter with explicit counts, negation, default counts and bad input."""
    smiles_in = ['C1CCC1', 'C1CCC1C=O', 'CCCC', 'CCC=O', 'CC(=O)C', 'CCN', 'NCCN', 'NCC=O']
    source = SupplyNode(contents=[Chem.MolFromSmiles(smi) for smi in smiles_in])
    self.assertEqual(len(list(source)), 8)

    # Filter built with an explicit count for each pattern.
    patterns = ['C=O', 'CN']
    node = SmartsMolFilter.SmartsFilter(patterns=patterns, counts=[1, 2])
    node.AddParent(source)
    self.assertEqual(len(list(node)), 5)

    # Same filter, negated, after rewinding the supplier.
    source.reset()
    node.SetNegate(True)
    self.assertEqual(len(list(node)), 3)

    # Fresh filter without counts, attached to the same supplier.
    patterns = ['C=O', 'CN']
    node = SmartsMolFilter.SmartsFilter(patterns=patterns)
    node.AddParent(source)
    self.assertEqual(len(list(node)), 6)

    # A counts list that does not match the patterns must be rejected.
    with self.assertRaises(ValueError):
        SmartsMolFilter.SmartsFilter(patterns=patterns, counts=['notEnough', ])

    # An unparsable SMARTS must be rejected; the error log is muted meanwhile.
    RDLogger.DisableLog('rdApp.error')
    with self.assertRaises(ValueError):
        SmartsMolFilter.SmartsFilter(patterns=['BadSmarts'])
    RDLogger.EnableLog('rdApp.error')
def as_atom(symbol):
    """Return the first atom of the molecule parsed from the SMILES ``[symbol]``.

    RDKit's warning log is switched off while the SMILES is parsed, to avoid
    spamming with "WARNING: not removing hydrogen atom without neighbors",
    and is switched back on afterwards even if parsing raises.

    :param symbol: text placed between the square brackets of the SMILES
    :return: the atom at index 0 of the parsed molecule
    """
    RDLogger.DisableLog('rdApp.warning')
    try:
        mol = Chem.MolFromSmiles(f'[{symbol}]')
    finally:
        # Always restore logging so a failure here cannot silence later warnings.
        RDLogger.EnableLog('rdApp.warning')
    return mol.GetAtoms()[0]
def test_SmartsRemover(self):
    """SmartsRemover must raise ValueError when one of the patterns is not valid SMARTS."""
    patterns_with_bad_entry = [
        '[Cl;H1&X1,-]',
        '[Na+]',
        '[O;H2,H1&-,X0&-2]',
        'BadSmarts',
    ]
    # Mute the parser's error output while the failure is provoked.
    RDLogger.DisableLog('rdApp.error')
    with self.assertRaises(ValueError):
        SmartsRemover.SmartsRemover(patterns=patterns_with_bad_entry)
    RDLogger.EnableLog('rdApp.error')
def test_PatternHolder(self):
    """Two substructure libraries filled with the same molecules must return the same matches.

    Library 1 uses a PatternHolder created with 2048 and is filled by hand
    (SMILES plus fingerprint); library 2 uses a default PatternHolder and is
    filled through ``AddMol``.
    """
    smi_path = os.path.join(os.environ["RDBASE"], "Data", "NCI", "first_5K.smi")
    supplier = Chem.SmilesMolSupplier(smi_path, delimiter="\t", titleLine=False)

    mols1 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps1 = rdSubstructLibrary.PatternHolder(2048)
    ssslib1 = rdSubstructLibrary.SubstructLibrary(mols1, fps1)
    mols2 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps2 = rdSubstructLibrary.PatternHolder()
    ssslib2 = rdSubstructLibrary.SubstructLibrary(mols2, fps2)

    # Every tenth record of the first thousand; unreadable records are skipped.
    RDLogger.DisableLog('rdApp.error')
    for position in range(0, 1000, 10):
        try:
            mol = supplier[position]
        except Exception:
            continue
        if mol:
            mols1.AddSmiles(Chem.MolToSmiles(mol))
            fps1.AddFingerprint(fps1.MakeFingerprint(mol))
            ssslib2.AddMol(mol)
    RDLogger.EnableLog('rdApp.error')

    query = Chem.MolFromSmarts("N")
    self.assertIsNotNone(query)

    hits1 = list(ssslib1.GetMatches(query))
    hits1.sort()
    hits2 = list(ssslib2.GetMatches(query))
    hits2.sort()
    self.assertEqual(len(hits1), len(hits2))
    # Lengths are equal at this point, so a pairwise walk covers every index.
    self.assertTrue(all(left == right for left, right in zip(hits1, hits2)))
def test_case_study_rrc(uri, user, password): with open(DATABASE_CONF, "w") as file: file.write("uri=" + str(uri) + "\n") file.write("user="******"\n") file.write("password="******"/case_studies/redundant_representation_case/iJR904_mapped.xml") components = [ "cpd00214", "cpd03847", "cpd05274", "cpd25615", "cpd05237" ] solver = RedundantCaseSolver(model, "BiGG") solver.swap_from_generic(["cpd22513", "cpd15649"], components, True) # solver.generateISAreactions() os.remove(DATABASE_CONF) except: os.remove(DATABASE_CONF) raise Exception("Not well run")
def data_augm(rx_list):
    """Augment a list of SMILES with one randomly re-ordered equivalent each.

    For every entry the atoms are shuffled (up to 10 attempts) and a
    non-canonical, non-isomeric SMILES is written; the first result that
    differs from the input string is appended to the output.

    :param rx_list: list of SMILES strings; it is not modified
    :return: a new list holding the original entries followed by the
        augmented ones
    """
    RDLogger.DisableLog('rdApp.*')
    rx_list_augm = rx_list.copy()
    for rx in rx_list:
        # Parsing does not depend on the attempt, so do it once per entry.
        rx_mol = Chem.MolFromSmiles(rx)
        if rx_mol is None:
            # Unparsable input: report it and move on. Previously the code
            # printed the string and then crashed on the None molecule.
            print(rx)
            continue

        num_atoms = rx_mol.GetNumAtoms()
        rx_rand = rx
        for _ in range(10):
            new_atom_order = list(range(num_atoms))
            random.shuffle(new_atom_order)
            random_mol = Chem.RenumberAtoms(rx_mol, newOrder=new_atom_order)
            rx_rand = Chem.MolToSmiles(random_mol,
                                       canonical=False,
                                       isomericSmiles=False)
            if rx_rand != rx:
                break

        if rx_rand == rx:
            print(
                '\nFailed to generate random equivalent SMILES for the reaction:'
            )
            print(rx)
        else:
            rx_list_augm.append(rx_rand)
    return rx_list_augm
def test_SmilesReaderBoundaryConditions(self):
    """Check len(), indexing and IndexError of a text SMILES supplier in several call orders.

    The third record ('fail') is not valid SMILES, so ``supp[2]`` is expected
    to be None while the supplier still reports four records.
    """
    # Suppress the error message due to the incorrect smiles; restore the
    # log afterwards so the muted state does not leak into other tests.
    RDLogger.DisableLog('rdApp.error')
    try:
        smis = ['CC', 'CCOC', 'fail', 'CCO']
        text = '\n'.join(smis)

        # len() first, then indexing.
        supp = Chem.SmilesMolSupplierFromText(text, ',', 0, -1, 0)
        self.assertEqual(len(supp), 4)
        self.assertIsNone(supp[2])
        self.assertIsNotNone(supp[3])

        # Indexing first, then len() and an out-of-range access.
        supp = Chem.SmilesMolSupplierFromText(text, ',', 0, -1, 0)
        self.assertIsNone(supp[2])
        self.assertIsNotNone(supp[3])
        self.assertEqual(len(supp), 4)
        with self.assertRaises(IndexError):
            supp[4]

        # len(), last valid index, then out of range.
        supp = Chem.SmilesMolSupplierFromText(text, ',', 0, -1, 0)
        self.assertEqual(len(supp), 4)
        self.assertIsNotNone(supp[3])
        with self.assertRaises(IndexError):
            supp[4]

        # Out-of-range access as the very first operation.
        supp = Chem.SmilesMolSupplierFromText(text, ',', 0, -1, 0)
        with self.assertRaises(IndexError):
            supp[4]
        self.assertEqual(len(supp), 4)
        self.assertIsNotNone(supp[3])
    finally:
        RDLogger.EnableLog('rdApp.error')
def importDataFile(
        file_name: str,
        import_function: Callable[[str], pd.DataFrame] = pd.read_csv,
        fp_size: int = default_fp_size) -> pd.DataFrame:
    """
    Reads data as CSV or TSV and calculates fingerprints from the SMILES in the data.

    A file with the extension ``.pkl`` is loaded with ``pd.read_pickle`` and
    returned unchanged; no fingerprints are calculated in that case.

    :param file_name: Filename of the file containing the training data.
        The SMILES/Fingerprints are stored 1st column
    :param import_function: Callable that reads ``file_name`` into a DataFrame
    :param fp_size: Number of bits in the fingerprint
    :return: A single pandas DataFrame: the unpickled frame for ``.pkl``
        input, otherwise the concatenated results of ``addFPColumn`` applied
        to the chunks of the imported frame.
    """
    # Read the data as Pandas pickle which already contains the calculated fingerprints
    ext = os.path.splitext(file_name)[1]
    if ext == ".pkl":
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        return pd.read_pickle(file_name)

    df = import_function(file_name)

    # disable the rdkit logger. We know that some inchis will fail and we took care of it.
    # No use to spam the console
    RDLogger.DisableLog("rdApp.*")

    # One chunk per CPU core, processed in a worker pool.
    n_cores = multiprocessing.cpu_count()
    df_split = np.array_split(df, n_cores)
    with multiprocessing.Pool(n_cores) as pool:
        df = pd.concat(
            pool.map(partial(addFPColumn, fp_size=fp_size), df_split))
        pool.close()
        pool.join()
    return df
def test_washing_with_dask(self):
    """Run the chem_functions washing steps through a dask bag and compare with fixed SMILES.

    A slightly more elaborate test: it shows whether rdkit keeps treating a
    set of molecules consistently across versions and how dask copes with
    the chem_functions helpers.
    """
    from rdkit import Chem
    from MCR import chem_functions
    import dask.bag as db
    from rdkit import RDLogger, rdBase

    expected = [
        'CC(C)=CCC/C(C)=C\\CC/C(C)=C\\CO',
        'CC12CC(O)C(CC1=O)C2(C)C',
        'Oc1cc(C2CCNCC2)on1',
        'Cn1ncc2cc(CN)ccc21',
        'O=C(O)c1cc(Cl)cs1',
        'Cc1cc(CN)ncc1Br',
        'CO[C@@H](C)[C@@H](N)C(=O)O',
        'Nc1ccc(Br)c(F)c1[N+](=O)[O-]',
        'Cc1ccc(F)c(C#N)n1',
        'Cc1ccc(F)c(CN)n1',
    ]

    rdBase.DisableLog('rdApp.error')
    RDLogger.DisableLog('rdApp.info')

    # Parse every line, drop unparsable ones, then apply the washing steps in order.
    washed = (
        db.read_text("../tests/test_data/test_db.smi", blocksize=16e6)
        .map(lambda line: Chem.MolFromSmiles(line))
        .filter(lambda parsed: parsed is not None)
        .map(chem_functions.remove_salts_mol)
        .map(chem_functions.decharge_mol)
        .map(chem_functions.get_largest_fragment_mol)
        .map(chem_functions.standardize_mol)
    )
    first_ten = [Chem.MolToSmiles(mol) for mol in washed.take(10)]
    self.assertEqual(first_ten, expected)
def check_symmetry(met_filename):
    """
    Decide whether the metabolite stored in a Molfile is symmetric.

    RDKit converts the Molfile to an XYZ block and pymatgen's point group
    analyzer groups equivalent atoms. The Molfile is read from the
    ``metabolites/`` folder relative to the working directory.

    The criterion: the molecule has more than one carbon and at most one
    carbon without an equivalent partner (the central one, if the molecule
    has an odd number of carbons).

    Parameters
    ----------
    met_filename : str
        Filename of the specific Molfile

    Returns
    -------
    symmetrical : bool
        True if metabolite is symmetric, False if not.
    """
    # Disable RDKit warnings
    RDLogger.DisableLog('rdApp.*')

    # Molfile -> XYZ text
    rd_mol = Chem.MolFromMolFile(f'metabolites/{met_filename}')
    xyz_text = Chem.rdmolfiles.MolToXYZBlock(rd_mol)

    try:
        mol_struct = structure.IMolecule.from_str(xyz_text, fmt='xyz')
        if len(mol_struct) == 1:
            # A single atom cannot satisfy the criterion.
            return False
        pg_analyzer = analyzer.PointGroupAnalyzer(mol_struct)
    except (IndexError, ValueError):
        # '*' is unrecognized in particular
        print(f'{met_filename} contains unrecognized symbols')
        return False

    equivalent_sets = pg_analyzer.get_equivalent_atoms()['eq_sets']

    carbon_total = 0
    lonely_carbons = 0  # carbons that have no equivalent partner
    for atom_index, members in equivalent_sets.items():
        if str(mol_struct[atom_index].specie) == 'C':
            carbon_total += 1
            if len(members) == 1:
                lonely_carbons += 1
        if lonely_carbons > 1:
            return False

    # More than one carbon, and at most one non-symmetrical carbon
    return carbon_total > 1
def create_coms_from_mol_list(conformer_list, gau_tpl_content, base_out_name,
                              max_num_coms, print_original):
    """
    From a list of RDKit mol objects, create gaussian input (.com) files,
    lowest energy first, optionally for only the specified number of objects.

    Energies are obtained from ``MMFFOptimizeMoleculeConfs`` with
    ``maxIters=0``. When the limit is reached, conformers whose energy is
    close (``np.isclose``) to the last written one are still written, and a
    note is printed.

    :param conformer_list: list of RDKit mol objects
    :param gau_tpl_content: template content passed to create_com_from_pdb_str
    :param base_out_name: base name used to build each output file name
    :param max_num_coms: int or infinity
    :param print_original: Boolean, whether to print the initial conformation
        (the first entry of conformer_list)
    :return: None
    """
    start_at = 0 if print_original else 1
    # Energies and conformers must come from the same slice: zipping the
    # energies against the full list would pair each energy with the
    # previous conformer whenever the first entry is skipped.
    candidates = conformer_list[start_at:]

    RDLogger.DisableLog('rdApp.*')
    energy_list = [
        MMFFOptimizeMoleculeConfs(current_mol, maxIters=0)[0][1]
        for current_mol in candidates
    ]
    zipped_sorted = sorted(zip(energy_list, candidates), key=itemgetter(0))

    mol_num = 0
    last_energy = np.nan
    print_note = False
    com_fname = None
    for energy, current_mol in zipped_sorted:
        if mol_num >= max_num_coms:
            # Past the limit: keep going only while energies tie with the last one.
            if np.isclose(energy, last_energy):
                print_note = True
            else:
                break
        mol_num += 1
        last_energy = energy
        com_fname = create_out_fname(base_out_name,
                                     suffix=f"_{mol_num}",
                                     ext=".com",
                                     rel_path=True)
        pdb_str = MolToPDBBlock(current_mol)
        create_com_from_pdb_str(pdb_str, gau_tpl_content, com_fname)
        print(f"{int(energy):12,} {com_fname}")

    if com_fname:
        print(
            f"Wrote {mol_num} files, ending with: {os.path.relpath(com_fname)}"
        )
    else:
        print("No output created from rotating dihedrals.")
    if print_note:
        print(
            f"More than {max_num_coms} conformations were output to ties calculated energies."
        )
def sample_Reaxys(df, s):
    """Draw a random sample of rows from ``df`` and parse their SMILES.

    :param df: DataFrame with a "smiles" column
    :param s: requested sample size; it is converted to a fraction of the
        number of rows before sampling
    :return: list of parsed molecules; SMILES that fail to parse are dropped,
        so the list may be shorter than ``s``
    """
    # Remove rdkit warnings
    RDLogger.DisableLog('rdApp.*')

    # Sample given sample size s
    smiles = df.sample(frac=s / len(df.index))["smiles"].tolist()
    smiles = list(map(str, smiles))

    # Convert all sampled smiles strings into mols, skipping failed parses
    mols = [Chem.MolFromSmiles(smi.strip()) for smi in smiles]
    mols = [m for m in mols if m is not None]

    print("Retieved", len(mols), "random molecules")
    return mols
def sdf_text_worker(merged_results, vendors, num_mols, start_time, mol_counter,
                    fragment_counter, drug_like_counter, big_counter,
                    parent_fragment_collector, parent_drug_like_collector,
                    parent_big_collector, failures, addhs, embed, verbose):
    """Build SDF text for every row and sort it into three weight classes.

    Rows are classified by exact molecular weight: below 300 ("fragment"),
    300 up to 700 ("drug like") and 700 up to 1200 ("big"). Heavier
    molecules and rows that fail to convert are not collected; failures are
    recorded in ``failures``.

    :param merged_results: DataFrame with a 'smiles' column and one column
        per vendor
    :param vendors: vendor column names; their non-empty values form the
        molecule name
    :param num_mols: total number of molecules, used for progress reporting
    :param start_time: ``time.time()`` value taken when processing started
    :param mol_counter: shared counter of processed rows (must offer
        ``get_lock()`` and ``value``)
    :param fragment_counter: shared counter for the "fragment" class
    :param drug_like_counter: shared counter for the "drug like" class
    :param big_counter: shared counter for the "big" class
    :param parent_fragment_collector: list extended with the fragment SDF texts
    :param parent_drug_like_collector: list extended with the drug-like SDF texts
    :param parent_big_collector: list extended with the big SDF texts
    :param failures: list that receives one 'write_error <smiles>' entry per
        failed row
    :param addhs: add explicit hydrogens before writing
    :param embed: embed the molecule (only evaluated when ``addhs`` is set)
    :param verbose: keep RDKit logging enabled when true
    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    fragment_collector, drug_like_collector, big_collector = [], [], []
    for _, row in merged_results.iterrows():
        try:
            mol = Chem.MolFromSmiles(row['smiles'])
            if addhs:
                mol = Chem.AddHs(mol)
                if embed:
                    AllChem.EmbedMolecule(mol)
            properties = {vendor: row[vendor] for vendor in vendors}
            mol_name = ','.join([
                identifier for identifier in properties.values()
                if len(identifier) > 0
            ])
            if len(mol_name) > 20:
                mol_name = mol_name[:17] + '...'
            mol.SetProp('_Name', mol_name)
            properties['smiles'] = row['smiles']
            molecular_weight = ExactMolWt(mol)
        except Exception:
            # Conversion problems are recorded, not fatal. KeyboardInterrupt
            # and SystemExit are deliberately no longer swallowed.
            failures.append(' '.join(['write_error', row['smiles']]))
            # Sentinel above every class limit so the row is not collected.
            molecular_weight = 10000

        if molecular_weight < 1200:
            if molecular_weight < 300:
                with fragment_counter.get_lock():
                    fragment_counter.value += 1
                fragment_collector.append(sdf_text(mol, properties))
            elif molecular_weight < 700:
                with drug_like_counter.get_lock():
                    drug_like_counter.value += 1
                drug_like_collector.append(sdf_text(mol, properties))
            else:
                with big_counter.get_lock():
                    big_counter.value += 1
                big_collector.append(sdf_text(mol, properties))

        # Take one consistent snapshot of the shared counter under its lock
        # and use it for the whole progress computation.
        with mol_counter.get_lock():
            mol_counter.value += 1
            processed = mol_counter.value
        update_progress(processed / num_mols, 'Progress of writing',
                        ((time.time() - start_time) / processed) *
                        (num_mols - processed))

    parent_fragment_collector.extend(fragment_collector)
    parent_drug_like_collector.extend(drug_like_collector)
    parent_big_collector.extend(big_collector)
    return
def setUp(self):
    """Load the 'problematic' PubChem molecules and their pickled InChI reference data."""
    data_dir = os.path.join(RDConfig.RDCodeDir, 'Chem/test_data')
    self.dataset = dict()
    self.dataset_inchi = dict()

    # The supplier reads from this stream on demand, so it is left open here.
    sdf_stream = gzip.open(os.path.join(data_dir, 'pubchem-hard-set.sdf.gz'), 'r')
    self.dataset['problematic'] = ForwardSDMolSupplier(sdf_stream,
                                                       sanitize=False,
                                                       removeHs=False)

    # The reference file is read as text, line endings are normalised and
    # the result is turned back into bytes before unpickling.
    with open(os.path.join(data_dir, 'pubchem-hard-set.inchi'), 'r') as reference:
        text = reference.read()
    payload = text.replace('\r\n', '\n').encode('latin1')
    self.dataset_inchi['problematic'] = pickle.loads(payload, encoding='latin1')

    # disable logging
    RDLogger.DisableLog('rdApp.warning')
def _gen_compound(mol):
    """Turn a predicted product molecule into a compound dict, or None if it is rejected.

    ``explicit_h``, ``local_cpds`` and ``generation`` are read from the
    enclosing scope.

    :param mol: RDKit molecule of the predicted compound
    :return: the existing entry from ``local_cpds`` when the compound id is
        already known, a newly built dict otherwise, or None when the
        molecule cannot be sanitized, consists of several fragments ("." in
        its SMILES) or yields no compound id
    """
    rkl.DisableLog("rdApp.*")
    try:
        if explicit_h:
            mol = RemoveHs(mol)

        # resolve potential tautomers and choose first one
        mol_smiles = MolToSmiles(mol, True)
        if "n" in mol_smiles:
            mol_smiles = utils.postsanitize_smiles([mol_smiles])[0][0]
            mol = MolFromSmiles(mol_smiles)

        SanitizeMol(mol)
    # TODO: logger
    # Get lots of "Explicit valence greater than permitted" errors here
    # This is for predicted compounds that are infeasible, so we throw them out
    except Exception:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        return None
    finally:
        # Runs on the early return as well; previously a rejected compound
        # left RDKit logging disabled.
        rkl.EnableLog("rdApp.*")

    mol_smiles = MolToSmiles(mol, True)
    if "." in mol_smiles:
        return None

    cpd_id, inchi_key = utils.get_compound_hash(mol_smiles, "Predicted")
    if not cpd_id:
        return None

    if cpd_id in local_cpds:
        return local_cpds[cpd_id]

    return {
        "ID": None,
        "_id": cpd_id,
        "SMILES": mol_smiles,
        "InChI_key": inchi_key,
        "Type": "Predicted",
        "Generation": generation,
        "atom_count": utils.get_atom_count(mol),
        "Reactant_in": [],
        "Product_of": [],
        "Expand": True,
        "Formula": CalcMolFormula(mol),
        "last_tani": 0,
    }
def sample(self, num=1, start='G'):
    """Generate ``num`` sequences.

    When ``self.session`` is 'generate', exactly ``num`` raw sequences are
    produced (with a progress bar). In any other session, sequences are
    generated until ``num`` of them parse as SMILES; those are returned as
    RDKit-written SMILES.

    :param num: number of items to return
    :param start: value handed to ``self._generate``
    :return: list of ``num`` strings
    """
    if self.session == 'generate':
        return [self._generate(start) for _ in tqdm(range(num))]

    from rdkit import Chem, RDLogger
    RDLogger.DisableLog('rdApp.*')

    accepted = []
    while len(accepted) < num:
        parsed = Chem.MolFromSmiles(self._generate(start))
        if parsed is None:
            # Not a valid SMILES: draw again.
            continue
        accepted.append(Chem.MolToSmiles(parsed))
    return accepted
def _preprocess(self):
    """Read the train/valid/test reaction files and write the x, metadata and split tables.

    Each input line is split on single spaces: the first field is a reaction
    ``substrates>>product`` and the second field is stored as meta info.

    Files written:
      * ``self.x_path`` (tab separated): 'product' and 'substrates' columns
      * ``self.metadata_path`` (tab separated): split index and meta info
      * ``default_split.csv`` in ``self.dir``: one 0/1 column per split
    """
    x = {
        'product': [],
        'substrates': [],
    }
    split = []
    meta = []
    split_keys = ['train', 'valid', 'test']

    # there is a warning about hydrogen atoms that do not have neighbors that could not be deleted (that is OK)
    RDLogger.DisableLog('rdApp.*')

    for split_i, split_key in enumerate(split_keys):
        split_path = os.path.join(self.feat_dir, f'data/{split_key}.txt')

        # First pass only counts lines so the progress bar knows its total.
        with open(split_path, 'r') as handle:
            file_len = sum(1 for _ in handle)

        with open(split_path, 'r') as handle:
            for line in tqdm(handle,
                             desc=f'reading {split_key} reactions',
                             total=file_len):
                split_line = line.split(' ')
                reaction = split_line[0]
                meta_info = split_line[1].strip()
                subs, prod = tuple(reaction.split('>>'))
                subs = subs.strip()
                prod = prod.strip()
                x['substrates'].append(subs)
                x['product'].append(prod)
                split.append(split_i)
                meta.append(meta_info)
        logger.info(f'Saved {file_len} {split_key} reactions')

    split = np.asarray(split, dtype=int)
    split_df = dict(
        (k, (split == i).astype(int)) for i, k in enumerate(split_keys))
    meta = {'uspto_mit_split': split, 'meta_info': meta}

    logger.info(f"Saving 'x' to {self.x_path}")
    pd.DataFrame(x).to_csv(self.x_path, sep='\t')

    logger.info(f"Saving {self.metadata_path}")
    pd.DataFrame(meta).to_csv(self.metadata_path, sep='\t')

    split_path = os.path.join(self.dir, 'default_split.csv')
    logger.info(f"Saving default split to {split_path}")
    pd.DataFrame(split_df).to_csv(split_path)
def formalCharge(molecule):
    """Compute the formal charge on a molecule.

       The molecule is written to a PDB file in a temporary directory, read
       back with RDKit, and RDKit's formal charge is returned in units of
       the electron charge. This function requires that the molecule has
       explicit hydrogen atoms.

       Parameters
       ----------

       molecule : :class:`Molecule <BioSimSpace._SireWrappers.Molecule>`
           A molecule object.

       Returns
       -------

       formal_charge : :class:`Charge <BioSimSpace.Types.Charge>`
           The total formal charge on the molecule.
    """

    if type(molecule) is not _Molecule:
        raise TypeError("'molecule' must be of type 'BioSimSpace._SireWrappers.Molecule'")

    from rdkit import Chem as _Chem
    from rdkit import RDLogger as _RDLogger

    # Silence RDKit output.
    _RDLogger.DisableLog('rdApp.*')

    # Keep a reference to the TemporaryDirectory object for the whole call:
    # the directory disappears once that object is cleaned up.
    tmp_dir = _tempfile.TemporaryDirectory()

    # Do the file round trip inside the scratch directory.
    with _Utils.cd(tmp_dir.name):
        # Molecule -> PDB file on disk.
        _IO.saveMolecules("tmp", molecule, "PDB")

        # PDB file -> RDKit molecule.
        rd_mol = _Chem.MolFromPDBFile("tmp.pdb")

        # Formal charge as reported by RDKit.
        total_charge = _Chem.rdmolops.GetFormalCharge(rd_mol)

    return total_charge * _electron_charge
def create_fingerprints(df_Without_Double_or_Triple, similarity_value=0.95):
    """
    Create Morgan fingerprints for the aglycons and hand them to create_tanimoto_index.

    Loads a pickled data frame with only a single entry in the taxonomy row,
    uses RDKit to parse the ``deglycosilated_smiles`` column and builds a
    feature-based Morgan fingerprint (radius 2) for every SMILES that parses.
    Then passes on the similarity value, all pairwise combinations of the
    parsable aglycon SMILES, the fingerprints and the loaded data frame.

    :param df_Without_Double_or_Triple: path of the pickled data frame
    :param similarity_value: threshold forwarded to create_tanimoto_index
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(df_Without_Double_or_Triple, "rb") as infile:
        df_Without_Double_or_Triple = pickle.load(infile, encoding="utf-8")

    RDLogger.DisableLog('rdApp.*')

    # Keep the molecules that parse, together with their row positions.
    mol_From_Smiles = []
    index_Mol_Implicit_Valence = []
    for index, smiles in enumerate(
            df_Without_Double_or_Triple.deglycosilated_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            mol_From_Smiles.append(mol)
            index_Mol_Implicit_Valence.append(index)

    df_Without_Explicit_Valence = df_Without_Double_or_Triple.iloc[
        index_Mol_Implicit_Valence]
    df_Without_Explicit_Valence = df_Without_Explicit_Valence.reset_index()

    fps = [
        AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
        for mol in mol_From_Smiles
    ]

    # create combinations of deglycosilated_smiles for indexing
    aglycon_formula_for_indexing = list(
        df_Without_Explicit_Valence.deglycosilated_smiles)
    aglycon_formulas = list(
        itertools.combinations(aglycon_formula_for_indexing, 2))

    print("MORGAN FINGERPRINTS DONE")
    # NOTE(review): the unfiltered frame is passed on here, as before, while
    # the pairs and fingerprints only cover rows that parsed - confirm that
    # create_tanimoto_index expects this.
    create_tanimoto_index(similarity_value, aglycon_formulas, fps,
                          df_Without_Double_or_Triple)
def main():
    """Find the unique fragments of one Reaxys fragment file using a pool of 8 processes.

    Earlier experiments (the AGAVE test, KEGG/Reaxys loading, Reaxys sampling
    and the adenine test) were present here only as commented-out code and
    have been dropped; recover them from version control if needed.
    """
    # Silence RDKit before the workers are created so that, where worker
    # processes are forked, they start from the silenced state as well.
    RDLogger.DisableLog('rdApp.*')

    ### PARALLEL FRAGMENT GENERATION ###
    pool = Pool(processes=8)
    try:
        ### FIND UNIQUE FRAGMENTS ###
        # Test on one file
        find_unique_frags(pool,
                          "Technology/Data/Reaxys_fragments_keggSize_0.p",
                          "Technology/Data/Reaxys_fragments_keggSize_0unique.p")
    finally:
        # Release the worker processes instead of leaving them to interpreter exit.
        pool.close()
        pool.join()
def test3SmilesSupplier(self):
    """A CSV SMILES supplier must yield three molecules when one of four records is invalid."""
    txt = """C1CC1,1
CC(=O)O,3
fail,4
CCOC,5
"""
    # The 'fail' record triggers an RDKit error message; mute it for the test.
    RDLogger.DisableLog('rdApp.error')
    # mkstemp creates the file atomically, unlike the race-prone mktemp.
    fd, fileN = tempfile.mkstemp('.csv')
    try:
        with os.fdopen(fd, 'w+') as f:
            f.write(txt)
        suppl = Chem.SmilesMolSupplier(fileN,
                                       delimiter=',',
                                       smilesColumn=0,
                                       nameColumn=1,
                                       titleLine=0)
        # Records that fail to parse come back as None; drop them.
        ms = [x for x in suppl if x is not None]
        # Drop the supplier before the file is deleted.
        suppl = None
        self.assertEqual(len(ms), 3)
    finally:
        os.unlink(fileN)
        RDLogger.EnableLog('rdApp.error')
def test3SmilesSupplier(self):
    """A CSV SMILES supplier must yield three molecules when one of four records is invalid."""
    txt = """C1CC1,1
CC(=O)O,3
fail,4
CCOC,5
"""
    # The 'fail' record triggers an RDKit error message; mute it for the test.
    RDLogger.DisableLog('rdApp.error')
    # Track the path separately so cleanup never touches an unbound name
    # if creating the temporary file itself fails.
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile('w+', suffix='.csv',
                                         delete=False) as tmp:
            tmp_name = tmp.name
            tmp.write(txt)
        suppl = Chem.SmilesMolSupplier(tmp_name,
                                       delimiter=',',
                                       smilesColumn=0,
                                       nameColumn=1,
                                       titleLine=0)
        # Records that fail to parse come back as None; drop them.
        ms = [x for x in suppl if x is not None]
        # Drop the supplier before the file is deleted.
        suppl = None
        self.assertEqual(len(ms), 3)
    finally:
        if tmp_name is not None:
            os.unlink(tmp_name)
        RDLogger.EnableLog('rdApp.error')
def search(query: str, min_mw: float, max_mw: float,
           layout: widgets.Box) -> None:
    """Look up molecules by synonym and show the matches as a sheet in ``layout``.

    Every synonym match is normalized, annotated with its normalized InChI
    and exact molecular weight, filtered (known normalized InChI, weight
    within bounds) and displayed sorted by weight with a "use" button per row.

    :param query: text handed to ``get_synonym_matches``
    :param min_mw: lower molecular-weight bound handed to ``filter_by_mw``
    :param max_mw: upper molecular-weight bound handed to ``filter_by_mw``
    :param layout: widget box whose search-output slot receives the sheet
    """
    with get_new_log_box(layout):
        clear_search_output(layout)
        results = get_synonym_matches(query)
        for cur in results:
            RDLogger.DisableLog("rdApp.*")  # hide rdkit warnings
            try:
                cur["mol"] = cheminfo.normalize_molecule(
                    Chem.inchi.MolFromInchi(cur["inchi"]))
                cur["norm_inchi"] = Chem.inchi.MolToInchi(cur["mol"])
            finally:
                # Restore logging even when a molecule fails to normalize.
                RDLogger.EnableLog("rdApp.*")
            cur["MW"] = ExactMolWt(cur["mol"])

        filtered = filter_by_mw(filter_to_norm_inchi_in_db(results), min_mw,
                                max_mw)
        logger.debug("Found %d matches to %s.", len(filtered), query)
        if not is_valid_num_results(len(filtered), query, layout):
            return

        final = sorted(filtered, key=lambda x: x["MW"])
        logger.debug("Num mols: %d", len(final))

        column_names = ["", "Name", "MW", "Structure"]
        sheet = ipysheet.sheet(
            rows=len(final),
            columns=len(column_names),
            column_headers=column_names,
            column_resizing=False,
            column_width=[1, 4, 2, 10],
        )

        # One "use" button per result row.
        buttons = [
            widgets.Button(description="use",
                           layout=widgets.Layout(width="100%"))
            for _ in final
        ]
        for button in buttons:
            button.on_click(
                lambda current: on_use_button_clicked(current, final, layout))

        ipysheet.column(0, buttons)
        ipysheet.column(1, [x["name"] for x in final])
        ipysheet.column(2, [ExactMolWt(x["mol"]) for x in final])
        ipysheet.column(3, [cheminfo.mol_to_image(x["mol"]) for x in final])
        layout.children = swap_layout(layout.children,
                                      LayoutPosition.SEARCH_OUTPUT.value,
                                      sheet)
import pandas as pd
from IPython import display
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, UpSampling2D, UpSampling1D, MaxPooling1D, Lambda
from keras.layers.recurrent import GRU
from keras.layers.core import Dense, Flatten, RepeatVector, Dropout
from keras.losses import mse, binary_crossentropy, categorical_crossentropy
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import backend as K
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint
from rdkit import RDLogger
from sklearn.model_selection import train_test_split
import time

# Silence all RDKit application logging for this module.
RDLogger.DisableLog('rdApp.*')


def add_space(raw_data, input_dim = 34):
    """Right-pad every entry shorter than ``input_dim`` with spaces.

    Entries whose length is already ``input_dim`` or more are copied
    unchanged (they are not truncated).

    :param raw_data: iterable of strings
    :param input_dim: target length for short entries
    :return: new list with one (possibly padded) entry per input entry
    """
    out = []
    for i in raw_data:
        if len(i) < input_dim:
            # Append exactly the number of spaces missing to reach input_dim.
            out.append(i+' '*(input_dim - len(i)))
        else:
            out.append(i)
    return(out)


def plot_auto(out, predict_st):
    size = (50, 50)
def from_smiles(smiles: str, with_hydrogen: bool = False,
                kekulize: bool = False):
    r"""Converts a SMILES string to a :class:`torch_geometric.data.Data`
    instance.

    A SMILES string that RDKit cannot parse is treated like the empty
    molecule, i.e. the result has no nodes and no edges.

    Args:
        smiles (str): The SMILES string.
        with_hydrogen (bool, optional): If set to :obj:`True`, will store
            hydrogens in the molecule graph. (default: :obj:`False`)
        kekulize (bool, optional): If set to :obj:`True`, converts aromatic
            bonds to single/double bonds. (default: :obj:`False`)

    Returns:
        A :class:`torch_geometric.data.Data` object with node features
        :obj:`x` (9 integer columns), :obj:`edge_index` (both directions of
        every bond), :obj:`edge_attr` (3 integer columns) and the original
        :obj:`smiles`.
    """
    from rdkit import Chem, RDLogger

    from torch_geometric.data import Data

    RDLogger.DisableLog('rdApp.*')

    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        mol = Chem.MolFromSmiles('')
    if with_hydrogen:
        mol = Chem.AddHs(mol)
    if kekulize:
        # Kekulize modifies the molecule in place and returns None, so its
        # result must not be assigned back to ``mol``.
        Chem.Kekulize(mol)

    # Node features: each property is encoded as its index in ``x_map``.
    xs = []
    for atom in mol.GetAtoms():
        x = []
        x.append(x_map['atomic_num'].index(atom.GetAtomicNum()))
        x.append(x_map['chirality'].index(str(atom.GetChiralTag())))
        x.append(x_map['degree'].index(atom.GetTotalDegree()))
        x.append(x_map['formal_charge'].index(atom.GetFormalCharge()))
        x.append(x_map['num_hs'].index(atom.GetTotalNumHs()))
        x.append(x_map['num_radical_electrons'].index(
            atom.GetNumRadicalElectrons()))
        x.append(x_map['hybridization'].index(str(atom.GetHybridization())))
        x.append(x_map['is_aromatic'].index(atom.GetIsAromatic()))
        x.append(x_map['is_in_ring'].index(atom.IsInRing()))
        xs.append(x)

    x = torch.tensor(xs, dtype=torch.long).view(-1, 9)

    # Edge features: every bond is stored once per direction.
    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        e = []
        e.append(e_map['bond_type'].index(str(bond.GetBondType())))
        e.append(e_map['stereo'].index(str(bond.GetStereo())))
        e.append(e_map['is_conjugated'].index(bond.GetIsConjugated()))

        edge_indices += [[i, j], [j, i]]
        edge_attrs += [e, e]

    edge_index = torch.tensor(edge_indices)
    edge_index = edge_index.t().to(torch.long).view(2, -1)
    edge_attr = torch.tensor(edge_attrs, dtype=torch.long).view(-1, 3)

    if edge_index.numel() > 0:  # Sort indices.
        perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
        edge_index, edge_attr = edge_index[:, perm], edge_attr[perm]

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr,
                smiles=smiles)
import os import pickle from glob import glob import numpy as np import pandas as pd import requests from rdkit import RDLogger from rdkit.Chem import MolFromSmiles, MolFromSmarts from rdkit.Chem.inchi import MolFromInchi, MolToInchi from tqdm import tqdm from molgrad.utils import DATA_PATH, PROCESSED_DATA_PATH RDLogger.DisableLog("rdApp.*") IUPAC_REST = "http://cactus.nci.nih.gov/chemical/structure/{}/inchi" def smi_to_inchi_with_val(smiles, ovalues): inchis = [] values = [] for smi, val in zip(smiles, ovalues): mol = MolFromSmiles(smi) if mol is not None: try: inchi = MolToInchi(mol) m = MolFromInchi(inchi) if m is not None: # ensure rdkit can read an inchi it just wrote... inchis.append(inchi) values.append(val)
def test1InchiReadPubChem(self):
    """Round-trip every dataset molecule through InChI and classify the outcome.

    For each molecule ``m`` the InChI ``x`` is written, read back, and
    re-written via SMILES as ``y``. The molecule is then counted as
    ``same`` (x == y), ``reasonable`` (a mismatch or empty result that one
    of the checks below explains) or ``diff`` (unexplained). The three
    counters are compared with fixed expected values for each dataset.
    """
    for f in self.dataset.values():
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # Reading the InChI back may fail; mute the error log for that call only.
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # No second InChI could be produced: look for an explanation.
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                # Compare the radical-electron count weighted by atomic number
                # between the original and the unsanitized InChI molecule.
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue

                # Nothing explains the empty result: count it as a real difference.
                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                      cid + '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): on Python 3 filter() returns an iterator, which
                # is truthy even when it would yield nothing, so this branch
                # is taken for every molecule that reaches it and the checks
                # below never run. The expected counts asserted at the end
                # reflect that behaviour - confirm before changing it.
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)
                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue

                diff += 1
                # NOTE(review): `cid` is only assigned in the `y is None`
                # branch above, so here it is either stale from an earlier
                # molecule or undefined - verify if this code becomes reachable.
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()
            else:
                same += 1

        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        # Fixed expected counts for the dataset.
        self.assertEqual(same, 621)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 560)
def process(self):
    """Build the processed dataset file from the raw files.

    Without ``rdkit`` the pre-processed data in ``self.raw_paths[0]`` is
    loaded, filtered/transformed and saved. With ``rdkit`` the molecules are
    read from the SD file in ``self.raw_paths[0]``, the targets from
    ``self.raw_paths[1]`` and the indices to skip from ``self.raw_paths[2]``;
    one ``Data`` object is built per remaining molecule.
    """
    try:
        import rdkit
        from rdkit import Chem, RDLogger
        from rdkit.Chem.rdchem import BondType as BT
        from rdkit.Chem.rdchem import HybridizationType
        RDLogger.DisableLog('rdApp.*')

    except ImportError:
        rdkit = None

    if rdkit is None:
        print(("Using a pre-processed version of the dataset. Please "
               "install 'rdkit' to alternatively process the raw data."),
              file=sys.stderr)

        data_list = torch.load(self.raw_paths[0])
        data_list = [Data(**data_dict) for data_dict in data_list]

        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]

        torch.save(self.collate(data_list), self.processed_paths[0])
        return

    types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
    bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

    # Targets: drop the header line and the trailing empty line, keep CSV
    # columns 1..19, move the first three of them to the end and rescale.
    with open(self.raw_paths[1], 'r') as f:
        target = f.read().split('\n')[1:-1]
        target = [[float(x) for x in line.split(',')[1:20]]
                  for line in target]
        target = torch.tensor(target, dtype=torch.float)
        target = torch.cat([target[:, 3:], target[:, :3]], dim=-1)
        target = target * conversion.view(1, -1)

    # Indices to skip (converted from 1-based to 0-based). A set keeps the
    # per-molecule membership test below constant-time.
    with open(self.raw_paths[2], 'r') as f:
        skip = {int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]}

    suppl = Chem.SDMolSupplier(self.raw_paths[0], removeHs=False,
                               sanitize=False)

    data_list = []
    for i, mol in enumerate(tqdm(suppl)):
        if i in skip:
            continue

        N = mol.GetNumAtoms()

        # Coordinates are parsed from the raw record text: N lines starting
        # at line index 4, first three fields of each.
        pos = suppl.GetItemText(i).split('\n')[4:4 + N]
        pos = [[float(x) for x in line.split()[:3]] for line in pos]
        pos = torch.tensor(pos, dtype=torch.float)

        type_idx = []
        atomic_number = []
        aromatic = []
        sp = []
        sp2 = []
        sp3 = []
        for atom in mol.GetAtoms():
            type_idx.append(types[atom.GetSymbol()])
            atomic_number.append(atom.GetAtomicNum())
            aromatic.append(1 if atom.GetIsAromatic() else 0)
            hybridization = atom.GetHybridization()
            sp.append(1 if hybridization == HybridizationType.SP else 0)
            sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
            sp3.append(1 if hybridization == HybridizationType.SP3 else 0)

        z = torch.tensor(atomic_number, dtype=torch.long)

        # Every bond is stored once per direction.
        row, col, edge_type = [], [], []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            row += [start, end]
            col += [end, start]
            edge_type += 2 * [bonds[bond.GetBondType()]]

        edge_index = torch.tensor([row, col], dtype=torch.long)
        edge_type = torch.tensor(edge_type, dtype=torch.long)
        edge_attr = F.one_hot(edge_type,
                              num_classes=len(bonds)).to(torch.float)

        # Sort edges by (source, target).
        perm = (edge_index[0] * N + edge_index[1]).argsort()
        edge_index = edge_index[:, perm]
        edge_type = edge_type[perm]
        edge_attr = edge_attr[perm]

        # Number of hydrogen neighbours per atom, summed over the edges.
        row, col = edge_index
        hs = (z == 1).to(torch.float)
        num_hs = scatter(hs[row], col, dim_size=N).tolist()

        x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(types))
        x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                          dtype=torch.float).t().contiguous()
        x = torch.cat([x1.to(torch.float), x2], dim=-1)

        y = target[i].unsqueeze(0)
        name = mol.GetProp('_Name')

        data = Data(x=x, z=z, pos=pos, edge_index=edge_index,
                    edge_attr=edge_attr, y=y, name=name, idx=i)

        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)

        data_list.append(data)

    torch.save(self.collate(data_list), self.processed_paths[0])
def _get_molecule_database(self, molecule_database_src,
                           molecule_database_src_type):
    """Load the molecule database and, when present, its feature table.

    Args:
        molecule_database_src (str): Source of molecular information.
            A folder or a file path, depending on
            ``molecule_database_src_type``.
        molecule_database_src_type (str): Type of source, compared
            case-insensitively. Supported values:

            * ``'folder'`` / ``'directory'``: every ``*.pdb`` file in the
              folder is loaded.
            * ``'text'``: one molecule per line. The first
              whitespace-separated field is the SMILES string, the
              optional second field is a numeric property value.
              Blank lines are ignored.
            * ``'excel'`` / ``'csv'``: a table with a mandatory
              ``feature_smiles`` column, an optional ``feature_name``
              column, an optional ``response_*`` column (only one
              response is handled) and any further ``feature_*``
              columns, which are returned as features.

    Returns:
        (list(Molecule), np.ndarray or None)
            First element is the molecule database. Second element is
            the array of remaining ``feature_*`` columns read from an
            excel / csv source, or None if there are none.

    Raises:
        FileNotFoundError: If ``molecule_database_src_type`` is not one
            of the supported values.
        ValueError: If an excel / csv source has no ``feature_smiles``
            column.
        UserWarning: If no molecule could be loaded.

    """
    if not self.is_verbose:
        RDLogger.DisableLog('rdApp.*')

    src_type = molecule_database_src_type.lower()
    molecule_database = []
    features = None

    if src_type in ["folder", "directory"]:
        if self.is_verbose:
            print(f"Searching for *.pdb files in {molecule_database_src}")
        for molfile in glob(os.path.join(molecule_database_src, "*.pdb")):
            if self.is_verbose:
                print(f"Loading {molfile}")
            try:
                molecule_database.append(Molecule(mol_src=molfile))
            except LoadingError:
                # Best effort: an unreadable file is skipped, not fatal.
                if self.is_verbose:
                    print(f"{molfile} could not be imported. Skipping")

    elif src_type == "text":
        if self.is_verbose:
            print(f"Reading SMILES strings from {molecule_database_src}")
        with open(molecule_database_src, "r") as fp:
            smiles_data = fp.readlines()
        for count, line in enumerate(smiles_data):
            # Assumes that the first column contains the smiles string
            line_fields = line.split()
            if not line_fields:
                # Blank line: there is no SMILES string to read.
                continue
            smile = line_fields[0]
            mol_property_val = None
            if len(line_fields) > 1:
                mol_property_val = float(line_fields[1])
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({count + 1}/"
                      f"{len(smiles_data)})")
            mol_text = smile
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")

    elif src_type in ["excel", "csv"]:
        if self.is_verbose:
            print(f"Reading molecules from {molecule_database_src}")
        if src_type == "excel":
            database_df = pd.read_excel(molecule_database_src,
                                        engine="openpyxl")
        else:
            database_df = pd.read_csv(molecule_database_src)

        # expects feature columns to be prefixed with feature_
        # e.g. feature_smiles
        feature_cols = [
            column for column in database_df.columns
            if column.split("_")[0] == "feature"
        ]
        if "feature_smiles" not in feature_cols:
            # Fail with a clear message instead of iterating over None below.
            raise ValueError(
                f"{molecule_database_src} has no 'feature_smiles' column; "
                f"it is required for excel/csv sources.")

        database_feature_df = database_df[feature_cols]
        mol_names, responses = None, None
        if "feature_name" in feature_cols:
            mol_names = database_feature_df["feature_name"].values.flatten()
            database_feature_df = database_feature_df.drop(
                ["feature_name"], axis=1)
        mol_smiles = database_df["feature_smiles"].values.flatten()
        database_feature_df = database_feature_df.drop(
            ["feature_smiles"], axis=1)

        response_col = [
            column for column in database_df.columns
            if column.split("_")[0] == "response"
        ]
        if len(response_col) > 0:
            # currently handles one response
            responses = database_df[response_col].values.flatten()

        n_molecules = len(mol_smiles)
        for mol_id, smile in enumerate(mol_smiles):
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({mol_id + 1}/"
                      f"{n_molecules})")
            mol_text = mol_names[mol_id] if mol_names is not None else smile
            mol_property_val = (responses[mol_id]
                                if responses is not None else None)
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")

        # Whatever feature_* columns remain are returned as the features.
        if len(database_feature_df.columns) > 0:
            features = database_feature_df.values

    else:
        # Exception type and message kept as-is for existing callers, even
        # though the actual cause here is an unsupported source type.
        raise FileNotFoundError(
            f"{molecule_database_src} could not be found. "
            f"Please enter valid folder name or path of a "
            f"text/excel/csv")

    if len(molecule_database) == 0:
        raise UserWarning("No molecular files found in the location!")
    return molecule_database, features
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import unittest import os, sys, copy import pickle from rdkit import rdBase from rdkit import Chem from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose, RGroupDecomposition, RGroupDecompositionParameters from collections import OrderedDict # the RGD code can generate a lot of warnings. disable them from rdkit import RDLogger RDLogger.DisableLog("rdApp.warning") class TestCase(unittest.TestCase): def test_multicores(self): cores_smi_easy = OrderedDict() cores_smi_hard = OrderedDict() #cores_smi_easy['cephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2') cores_smi_easy['cephem'] = Chem.MolFromSmarts('O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CS2') cores_smi_hard['cephem'] = Chem.MolFromSmarts('O=C1C([2*])([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2') #cores_smi_easy['carbacephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CC2') cores_smi_easy['carbacephem'] = Chem.MolFromSmarts('O=C1C([1*])C2N1C(C(O)=O)=C([3*])CC2') cores_smi_hard['carbacephem'] = Chem.MolFromSmarts(