def desalt(mol):
    """Keep only the largest disconnected component of an RDKit molecule.

    The molecule is first round-tripped through InChI and sanitized. If it
    consists of a single fragment, that round-tripped molecule is returned
    unchanged. Otherwise its SMILES is split on ``'.'`` and every piece is
    round-tripped through InChI on its own; the piece with the most atoms wins.

    Parameters
    ----------
    mol : rdkit mol
        The input molecule.

    Returns
    -------
    tuple
        ``(parent_mol, status)`` where ``parent_mol`` is an RDKit mol and
        ``status`` is a boolean indicating whether cleaning was necessary
        (i.e. a largest component was actually selected). If no fragment can
        be rebuilt, the round-tripped input molecule is returned with
        ``status`` set to ``False``.
    """
    # This molecule escaped my patterns:
    #   InChI=1S/2C6H11NO5.O.V/c2*1-3(5(8)9)7(12)4(2)6(10)11;;/h2*3-4,12H,1-2H3,(H,8,9)(H,10,11);;/q;;;+2/p-2/t2*3-,4-;;/m00../s1
    # It gave the error "Molecule must be fully connected by covalent bonds."
    mol = MolFromInchi(MolToInchi(mol))
    SanitizeMol(mol)

    fragments = Chem.rdmolops.GetMolFrags(mol)  # these are atom indices
    if len(fragments) == 1:
        # Fragments or multiple molecules would make this greater than 1.
        return mol, False

    # Fall back to the round-tripped molecule so the return value is always
    # bound, even when none of the fragments can be rebuilt below (the
    # original code raised UnboundLocalError in that case).
    parent_mol = mol
    parent_atom_count = 0
    status = False

    # GetMolFrags already established that the structure is not a single
    # connected component, so split the SMILES into its pieces.
    for fragment_smiles in Chem.MolToSmiles(mol, True).split('.'):
        # Sanitize=True will fail for choline sulfate. Can't sanitize the radical.
        little_mol = Chem.MolFromInchi(
            Chem.MolToInchi(Chem.MolFromSmiles(fragment_smiles, sanitize=False)))
        if little_mol is None:
            continue
        count = little_mol.GetNumAtoms()
        if count > parent_atom_count:
            parent_atom_count = count
            parent_mol = little_mol
            status = True

    return parent_mol, status
def parallel_wrapper(mol, rest_inchis, n_total):
    """Compute one row of similarities between ``mol`` and ``rest_inchis``.

    The row has length ``n_total`` and dtype ``float32``. Only the trailing
    ``len(rest_inchis)`` positions are written; everything before them stays
    zero.

    Parameters
    ----------
    mol
        Reference molecule handed to ``tanimoto_sim``.
    rest_inchis : sequence of str
        InChI strings to compare against.
    n_total : int
        Length of the returned row.

    Returns
    -------
    np.ndarray
    """
    row = np.zeros(n_total, dtype=np.float32)
    first_slot = n_total - len(rest_inchis)
    for slot, other_inchi in enumerate(rest_inchis, start=first_slot):
        row[slot] = tanimoto_sim(mol, MolFromInchi(other_inchi))
    return row
def render_structure(self):
    """Render the structure described by ``self._inchi`` / ``self._smiles``.

    When RDKit is available the molecule is drawn to SVG with
    ``rdMolDraw2D.MolDraw2DSVG`` at the widget's current size; otherwise, when
    OpenBabel is available, the SVG produced by ``pybel`` is used. The
    resulting bytes are handed to ``self.load``; an empty byte string is
    loaded when no molecule could be built.
    """
    # Try to render structure from InChI or SMILES
    if RDKIT_AVAILABLE:
        mol = None
        if RDKIT_INCHI and self._inchi:  # Use InChI first
            mol = MolFromInchi(self._inchi)
        elif self._smiles is not None:  # If InChI not available, use SMILES as a fallback
            mol = MolFromSmiles(self._smiles)
        # NOTE(review): SMILES is only tried when the InChI branch is not
        # taken; an InChI that fails to parse does not fall back to SMILES.
        if mol is not None:
            # Only compute 2D coordinates when the molecule has no conformer.
            if not mol.GetNumConformers():
                rdDepictor.Compute2DCoords(mol)
            drawer = rdMolDraw2D.MolDraw2DSVG(self.size().width(), self.size().height())
            drawer.DrawMolecule(mol)
            drawer.FinishDrawing()
            # Drop the 'svg:' prefix from the generated markup before loading.
            svg = drawer.GetDrawingText().replace('svg:', '')
            self.load(QByteArray(svg.encode()))
        else:
            self.load(QByteArray(b''))
    elif OPENBABEL_AVAILABLE:  # If RDkit not available, try to use OpenBabel
        mol = None
        try:
            if OPENBABEL_INCHI and self._inchi:
                mol = pybel.readstring('inchi', self._inchi)
            elif self._smiles:
                mol = pybel.readstring('smiles', self._smiles)
        except OSError:
            self.load(QByteArray(b''))
        else:
            if mol is not None:
                # Convert to svg, code loosely based on _repr_svg_ from pybel's Molecule
                namespace = "http://www.w3.org/2000/svg"
                tree = etree.fromstring(mol.write("svg"))
                # Pick the nested <svg> element inside the outer <g>.
                svg = tree.find(f"{{{namespace}}}g/{{{namespace}}}svg")
                self.load(QByteArray(etree.tostring(svg)))
            else:
                self.load(QByteArray(b''))
                # NOTE(review): the original indentation of this statement was
                # lost; it is assumed to belong to the "no molecule" branch.
                # Confirm against the upstream file.
                self.btShowStructure.setVisible(False)
def smi_to_inchi_with_val(smiles, ovalues):
    """Convert SMILES to InChI, keeping only entries RDKit can round-trip.

    Each SMILES is parsed, written as InChI and the InChI is read back. The
    pair ``(inchi, value)`` is kept only when every step succeeds.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings.
    ovalues : iterable
        Values aligned with ``smiles``; iterated in lockstep via ``zip``.

    Returns
    -------
    tuple of (list, list)
        The retained InChI strings and their associated values.
    """
    inchis = []
    values = []
    for smi, val in zip(smiles, ovalues):
        mol = MolFromSmiles(smi)
        if mol is None:
            continue
        try:
            inchi = MolToInchi(mol)
            # ensure rdkit can read an inchi it just wrote...
            readable = MolFromInchi(inchi) is not None
        except Exception:
            # Best-effort filtering: a molecule that cannot be converted is
            # dropped. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue
        if readable:
            inchis.append(inchi)
            values.append(val)
    return inchis, values
def process_cyp(neutralize=False):
    """Build the CYP3A4 dataset and pickle it to ``cyp/data_cyp.pt``.

    Reads ``CYP3A4.csv`` (semicolon separated), maps the ``Class`` column to
    1 for ``"Active"`` and 0 otherwise, converts SMILES to InChI, averages
    values per InChI and writes ``[inchis, values]`` with pickle.

    Parameters
    ----------
    neutralize : bool
        When true, every InChI is re-generated from the molecule returned by
        ``neutralize_atoms`` before saving.
    """
    source_csv = os.path.join(DATA_PATH, "cyp", "CYP3A4.csv")
    raw = pd.read_csv(source_csv, header=0, sep=";")
    raw["Value"] = [int(label == "Active") for label in raw["Class"]]

    kept_inchis, kept_values = smi_to_inchi_with_val(raw["SMILES"], raw["Value"])
    table = pd.DataFrame({"inchi": kept_inchis, "values": kept_values})
    inchis, values = mean_by_key(table, "inchi", "values")

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    target = os.path.join(DATA_PATH, "cyp", "data_cyp.pt")
    with open(target, "wb") as handle:
        pickle.dump([inchis, values], handle)
def smiles_from_inchi(inchi: str) -> Union[str, None]:
    """
    Get a SMILES descriptor from an InChI descriptor.

    Uses RDKit for the conversion.

    Args:
        inchi (str): The InChI descriptor.

    Returns:
        Union[str, None]: The isomeric, canonical SMILES descriptor, or None
        when the InChI cannot be parsed or the conversion fails.
    """
    try:
        rd_mol = MolFromInchi(inchi)
        if rd_mol is None:
            # Unparsable InChI: report failure explicitly instead of relying
            # on MolToSmiles raising for a None argument.
            return None
        return MolToSmiles(rd_mol,
                           isomericSmiles=True,
                           canonical=True,
                           allBondsExplicit=False,
                           allHsExplicit=False)
    except Exception:
        # Deliberately best-effort: any conversion error maps to None.
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None
def sim_matrix(inchis):
    """Compute the pairwise similarity matrix for all compounds in `inchis`.

    One row per compound is computed in parallel by ``parallel_wrapper``,
    which receives the compound and only the InChIs that follow it. The
    stacked rows are then mirrored by adding the transpose, and the identity
    is added for the diagonal.

    Parameters
    ----------
    inchis : list
        A list of inchi strings

    Returns
    -------
    np.ndarray
    """
    n_total = len(inchis)
    executor = Parallel(n_jobs=-1, verbose=100, backend="multiprocessing")
    rows = executor(
        delayed(parallel_wrapper)(MolFromInchi(current), inchis[(position + 1):], n_total)
        for position, current in enumerate(inchis))

    upper = np.stack(rows)
    full = upper + upper.T
    full += np.eye(n_total)  # in place, so the stacked rows' dtype is kept
    return full
def process_herg(list_csvs, keep_operators=False, neutralize=False):
    """Build the hERG dataset and pickle it to ``herg/data_herg.pt``.

    Parameters
    ----------
    list_csvs : list of str
        Tab-separated files to read and concatenate; must not be empty.
    keep_operators : bool
        When false, only rows whose ``Relation`` is ``"="`` are kept.
    neutralize : bool
        When true, every InChI is re-generated from the molecule returned by
        ``neutralize_atoms`` before saving.
    """
    frames = [pd.read_csv(path, sep="\t") for path in list_csvs]
    df = frames[0]
    for extra in frames[1:]:
        df = pd.concat([df, extra])

    # filter only IC50, nM, = data.
    keep = (df.Value_type == "IC50") & (df.Unit == "nM")
    if not keep_operators:
        keep = keep & (df.Relation == "=")
    df = df.loc[keep, ["Canonical_smiles", "Value"]]
    df["Value"] = -np.log10(df["Value"] * 1e-9)  # pIC50 conversion

    per_dup, stds = duplicate_analysis(df, "Canonical_smiles", "Value")
    report = "Percentage of duplicates for hERG dataset: {:.3f}, with average std.: {:.3f}, and median std.:{:.3f}".format(
        per_dup, np.mean(stds), np.median(stds))
    print(report)
    df.drop_duplicates(inplace=True)

    # average values with several measurements
    uq_smiles, uq_values = mean_by_key(df, "Canonical_smiles", "Value")

    # drop faulty molecules
    print("Dropping faulty molecules...")
    inchis, values = smi_to_inchi_with_val(uq_smiles, uq_values)

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    target = os.path.join(DATA_PATH, "herg", "data_herg.pt")
    with open(target, "wb") as handle:
        pickle.dump([inchis, values], handle)
def is_valid_inchi(inchi: str) -> Tuple[bool, str]:
    """
    Check whether a string represents a valid InChI descriptor.

    Args:
        inchi (str): The string to be checked.

    Returns:
        Tuple[bool, str]:
            - Whether the string represents a valid InChI descriptor.
            - A reason for invalidating the argument (empty when valid).
    """
    # The type guard is essential rather than a mere shortcut: per the
    # original author's note, the try/except below does not capture the
    # Boost.Python.ArgumentError raised when the argument does not match the
    # C++ signature.
    if not isinstance(inchi, str):
        return False, f'An InChI descriptor must be a string, got "{inchi}" which is a {type(inchi)}.'

    try:
        parsed = MolFromInchi(inchi)
    except Exception as err:
        return False, str(err)

    if parsed is not None:
        return True, ''
    return False, f'Could not generate an RDKit Molecule from InChI "{inchi}"'
def draw_inchi(inchi, imgfile):
    """Draw the molecule described by ``inchi`` into the file ``imgfile``.

    The molecule is parsed from InChI, hydrogens are added, a conformer is
    embedded and MMFF-optimized, and the result is written with
    ``Draw.MolToFile``.
    """
    parsed = MolFromInchi(inchi)
    with_hydrogens = Chem.AddHs(parsed)
    AllChem.EmbedMolecule(with_hydrogens)
    AllChem.MMFFOptimizeMolecule(with_hydrogens)
    Draw.MolToFile(with_hydrogens, imgfile)
def process_caco2(neutralize=False):
    """Build the Caco-2 permeability dataset and pickle it to ``caco2/data_caco2.pt``.

    Two sources are merged: an Excel sheet that already carries InChI strings
    and a CSV with compound names that are resolved to InChI through the
    ``IUPAC_REST`` web service. All InChIs are re-written by RDKit, values of
    identical InChIs are averaged, and ``[inchis, values]`` is pickled.

    Parameters
    ----------
    neutralize : bool
        When true, every InChI is re-generated from the molecule returned by
        ``neutralize_atoms`` before saving.
    """
    caco2_dir = os.path.join(DATA_PATH, "caco2")
    papp_col = "Caco-2 Papp * 10^6 cm/s"

    # peerJ data
    peerj = pd.read_excel(os.path.join(caco2_dir, "peerj-03-1405-s001.xls"))
    peerj = peerj.loc[:, ["InChI", papp_col]].dropna()
    peerj["Value"] = -np.log10(peerj[papp_col] * 1e-6)

    peerj_inchis = []
    peerj_values = []
    for raw_inchi, val in zip(peerj["InChI"], peerj["Value"]):
        mol = MolFromInchi(raw_inchi)
        if mol is None:
            continue
        peerj_inchis.append(MolToInchi(mol))  # ensure same inchi specification
        peerj_values.append(val)
    peerj = pd.DataFrame({"InChI": peerj_inchis, "Value": peerj_values})

    # plos one data
    pone = pd.read_csv(os.path.join(caco2_dir, "caco2perm_pone.csv"))
    pone["Value"] = -np.log10(pone["Papp (Caco-2) [cm/s]"])
    pone = pone.loc[:, ["name", "Value"]].dropna()

    print("Querying InchI strings from IUPAC names...")
    inchis = []
    values = []
    for mol_name, val in tqdm(zip(pone["name"], pone["Value"]), total=len(pone)):
        ans = requests.get(IUPAC_REST.format(mol_name))
        if ans.status_code != 200:
            continue
        new_mol = MolFromInchi(ans.content.decode("utf8"))
        if new_mol is None:
            continue
        # ensure same inchi specification
        inchis.append(MolToInchi(new_mol))
        values.append(val)

    inchis.extend(peerj["InChI"].tolist())
    values.extend(peerj["Value"].tolist())
    df = pd.DataFrame({"inchi": inchis, "values": values})

    per_dup, stds = duplicate_analysis(df, "inchi", "values")
    report = "Percentage of duplicates for CaCO2 dataset: {:.5f}, with average std.: {:.3f}, and median std.:{:.3f}".format(
        per_dup, np.mean(stds), np.median(stds))
    print(report)

    uq_inchi = pd.unique(df["inchi"]).tolist()
    print("Averaging values and ensuring rdkit readability...")
    inchis = []
    values = []
    # Average values and make sure rdkit can read all inchis
    for inchi in tqdm(uq_inchi):
        if MolFromInchi(inchi) is None:
            continue
        same_compound = df.loc[df["inchi"] == inchi]
        inchis.append(inchi)
        values.append(same_compound["values"].mean())

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    with open(os.path.join(caco2_dir, "data_caco2.pt"), "wb") as handle:
        pickle.dump([inchis, values], handle)
def _ppb_inchis_from_names(names, measurements):
    """Resolve compound names to RDKit-written InChIs via the IUPAC_REST service.

    A ``(inchi, value)`` pair is kept only when the service answers with HTTP
    200, RDKit can parse the returned InChI, and RDKit can read back the InChI
    it wrote itself.
    """
    inchis = []
    values = []
    for mol_name, val in tqdm(zip(names, measurements), total=len(names)):
        ans = requests.get(IUPAC_REST.format(mol_name))
        if ans.status_code != 200:
            continue
        # maybe not the same standard as rdkit...
        mol = MolFromInchi(ans.content.decode("utf8"))
        if mol is None:
            # Without this guard MolToInchi would be called on None.
            continue
        # Use same inchi specification as rdkit...
        new_inchi = MolToInchi(mol)
        if MolFromInchi(new_inchi) is not None:
            inchis.append(new_inchi)
            values.append(val)
    return inchis, values


def process_ppb(neutralize=False):
    """Build the plasma-protein-binding dataset and pickle it to ``ppb/data_ppb.pt``.

    Six sources are merged. Four provide SMILES and go through
    ``smi_to_inchi_with_val``; two provide compound names that are resolved to
    InChI through the ``IUPAC_REST`` web service. Values of identical InChIs
    are averaged, unreadable InChIs are dropped, and ``[inchis, values]`` is
    pickled.

    Parameters
    ----------
    neutralize : bool
        When true, every InChI is re-generated from the molecule returned by
        ``neutralize_atoms`` before saving.
    """
    ppb_dir = os.path.join(DATA_PATH, "ppb")
    inchis = []
    values = []

    # first dataset
    xlsxs = glob(os.path.join(ppb_dir, "11095_2013_1023_MOESM[2-4]_ESM.xlsx"))
    for idx, xlsx in enumerate(xlsxs):
        # NOTE(review): the pattern matches at most three files, so idx < 3
        # always holds and the alternative column name is never selected;
        # confirm whether one of the files needs "Experimental PPB_[%]".
        ppb_col = "Experimental_%PPB" if idx < 3 else "Experimental PPB_[%]"
        df1 = pd.read_excel(xlsx)
        df1 = df1.loc[:, ["SMILES", ppb_col]]
        inchis_1, values_1 = smi_to_inchi_with_val(df1["SMILES"], df1[ppb_col])
        inchis.extend(inchis_1)
        values.extend(values_1)

    # second dataset
    df2 = pd.read_excel(os.path.join(ppb_dir, "ci6b00291_si_001.xlsx"))
    df2 = df2.loc[:, ["SMILES", "Fub"]]
    df2["Value"] = 100 * (1 - df2["Fub"])  # fraction unbound -> % bound
    inchis_2, values_2 = smi_to_inchi_with_val(df2["SMILES"], df2["Value"])
    inchis.extend(inchis_2)
    values.extend(values_2)

    # third dataset
    df3 = pd.read_excel(
        os.path.join(ppb_dir, "cmdc201700582-sup-0001-misc_information.xlsx"),
        sheet_name=4,
    )
    df3 = df3.loc[:, ["SMILES", "PPB_Traditional_assay(serve as the true value)"]]
    df3["Value"] = 100 * df3["PPB_Traditional_assay(serve as the true value)"]
    inchis_3, values_3 = smi_to_inchi_with_val(df3["SMILES"], df3["Value"])
    inchis.extend(inchis_3)
    values.extend(values_3)

    # fourth dataset
    df4 = pd.read_excel(os.path.join(ppb_dir, "jm051245vsi20061025_033631.xls"))
    df4 = df4.loc[:, ["NAME (Drug or chemical name)", "PBexp(%)"]]
    inchis_4, values_4 = _ppb_inchis_from_names(
        df4["NAME (Drug or chemical name)"], df4["PBexp(%)"])
    inchis.extend(inchis_4)
    values.extend(values_4)

    # fifth dataset
    df5 = pd.read_excel(os.path.join(ppb_dir, "mp8b00785_si_002.xlsx"))
    df5 = df5.loc[:, ["canonical_smiles", "fup"]]
    df5["Value"] = 100 * (1 - df5["fup"])  # fraction unbound -> % bound
    inchis_5, values_5 = smi_to_inchi_with_val(df5["canonical_smiles"], df5["Value"])
    inchis.extend(inchis_5)
    values.extend(values_5)

    # sixth dataset
    df6 = pd.read_html(os.path.join(ppb_dir, "kratochwil2002.html"), header=0)[0]
    df6 = df6.loc[:, ["Compound", "fb (%)b"]].dropna()
    inchis_6, values_6 = _ppb_inchis_from_names(df6["Compound"], df6["fb (%)b"])
    inchis.extend(inchis_6)
    values.extend(values_6)

    # join them all together
    df = pd.DataFrame({"inchi": inchis, "values": values})

    # checking duplicates
    per_dup, stds = duplicate_analysis(df, "inchi", "values")
    print(
        "Percentage of duplicates for PPB dataset: {:.5f}, with average std.: {}, and median std.:{}"
        .format(per_dup, np.mean(stds), np.median(stds)))

    # average values w. equal inchi and check readability
    print("Averaging values and ensuring rdkit readability...")
    inchis, values = mean_by_key(df, "inchi", "values")
    inchis, values = ensure_readability(inchis, values, MolFromInchi)

    if neutralize:
        inchis = [
            MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN))
            for inchi in inchis
        ]

    with open(os.path.join(ppb_dir, "data_ppb.pt"), "wb") as handle:
        pickle.dump([inchis, values], handle)
# Script entry point: gather per-molecule descriptors for every dataset named
# in LABEL_GUIDE plus "cyp", and prepare a 1x3 figure.
if __name__ == "__main__":
    # Parallel lists: one entry per molecule across all datasets.
    mws = []
    logps = []
    nhdonors = []
    values = []
    dataset = []
    for data in list(LABEL_GUIDE.keys()) + ["cyp"]:
        # Each file holds a pickled pair (inchis, values).
        # NOTE(review): pickle.load executes arbitrary code on untrusted
        # files; only load files produced by this project.
        with open(os.path.join(DATA_PATH, data, f"data_{data}.pt"), "rb") as handle:
            inchis, v = pickle.load(handle)
        values.extend(v)
        for inchi in tqdm(inchis):
            mol = MolFromInchi(inchi)
            mws.append(MolWt(mol))
            logps.append(MolLogP(mol))
            nhdonors.append(NumHDonors(mol))
            # Display label of the dataset this molecule came from.
            dataset.append(DATASET_GUIDE[data])
    df = pd.DataFrame({
        "Molecular weight (gr./mol)": mws,
        r"aLog$P$": logps,
        "No. hydrogen donors": nhdonors,
        "values": values,
        "dataset": dataset,
    })
    # One row of three axes.
    f, axs = plt.subplots(1, 3, figsize=(18, 6))
def InchiToPixmap(inchi: str, size: QSize):
    """Render an InChI string to a pixmap of the requested size.

    Returns an empty ``QPixmap`` when ``size`` is null or ``inchi`` is empty;
    otherwise the result of ``MolToPixmap`` on the parsed molecule.
    """
    if not size.isNull() and inchi:
        return MolToPixmap(MolFromInchi(inchi), size)
    return QPixmap()
elif task == "binary": base_model = RandomForestClassifier else: raise ValueError("Task not supported") with open( os.path.join(DATA_PATH, f"{data}", f"data_{data}.pt"), "rb" ) as handle: inchis, values = pickle.load(handle) inchis = np.array(inchis) values = np.array(values)[:, np.newaxis] kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1337) fps = np.vstack([featurize_ecfp4(MolFromInchi(inchi)) for inchi in inchis]) for idx_split, (idx_train, idx_test) in enumerate(kf.split(inchis)): print(f"Fold {idx_split + 1}/{N_FOLDS}...") fps_train, fps_test = fps[idx_train, :], fps[idx_test, :] values_train, values_test = values[idx_train, :], values[idx_test, :] rf = base_model(n_estimators=N_ESTIMATORS, n_jobs=-1) rf.fit(fps_train, values_train.squeeze()) if task == "regression": yhat_test = rf.predict(fps_test) elif task == "binary": yhat_test = rf.predict_proba(fps_test) np.save(
elif task == "binary": base_model = RandomForestClassifier else: raise ValueError("Task not supported") with open(os.path.join(DATA_PATH, f"{data}", f"data_{data}.pt"), "rb") as handle: inchis, values = pickle.load(handle) inchis = np.array(inchis) values = np.array(values)[:, np.newaxis] kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1337) fps = np.vstack( [featurize_ecfp4(MolFromInchi(inchi)) for inchi in inchis]) for idx_split, (idx_train, idx_test) in enumerate(kf.split(inchis)): print(f"Fold {idx_split + 1}/{N_FOLDS}...") fps_train, fps_test = fps[idx_train, :], fps[idx_test, :] values_train, values_test = values[idx_train, :], values[ idx_test, :] rf = base_model(n_estimators=N_ESTIMATORS, n_jobs=-1) rf.fit(fps_train, values_train.squeeze()) if task == "regression": yhat_test = rf.predict(fps_test) elif task == "binary": yhat_test = rf.predict_proba(fps_test)
def process(init_data, use_cache=True):
    '''Assemble identifiers and basic properties for one compound.

    Data is combined from three places: the user-supplied ``init_data``,
    ``common_chemistry_data`` (looked up by CAS) and
    ``find_pubchem_from_ids``. A structure is built from the first source that
    yields a molecule, in this order: ``init_data`` mol file, SMILES, InChI;
    then the common-chemistry SMILES, InChI; then the PubChem SMILES, InChI.
    Values present in ``init_data`` override the computed ones in the result.

    Parameters
    ----------
    init_data : dict
        User-supplied data. Keys read here: 'CAS', 'CASRN', 'mol', 'pubchem',
        'smiles', 'inchi', 'inchikey', 'formula', 'MW', 'name', 'synonyms'.
        The dict is copied, so the caller's object is not modified.
    use_cache : bool
        Forwarded to ``find_pubchem_from_ids``.

    Returns
    -------
    dict
        With keys 'cid', 'CAS', 'formula', 'MW', 'smiles', 'inchi',
        'inchikey', 'name' and 'synonyms'. 'cid' is -1 when no PubChem id is
        known.

    Raises
    ------
    ValueError
        If no structure, CAS number or name can be determined, or if any of
        the inchikey / smiles / inchi values equals '*'.

    Examples
    --------
    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    # print(locals())
    init_data = init_data.copy()
    cc = cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            cc = common_chemistry_data(init_data['CAS'])
            cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms, cc_deprecated_CASs = cc
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass
    cid = iupac_name = p_MW = p_inchi = p_inchikey = p_smiles = p_formula = p_synonyms = None
    if init_data.get('mol', None) is not None:
        # If not in common chemistry or no InChi there, but if we have a mol file, get the inchi and inchikey for the
        # pubchem lookup
        mol = Chem.MolFromMolFile(init_data['mol'])
        if mol is not None:
            init_data['inchi'] = MolToInchi(mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])
    # A PubChem search needs at least one identifier; user-supplied values
    # take precedence over the common-chemistry ones.
    can_search_pubchem = (init_data.get('pubchem') is not None
                          or init_data.get('CASRN', cc_CAS) is not None
                          or init_data.get('inchi', cc_inchi) is not None
                          or init_data.get('inchikey', cc_inchikey) is not None
                          or init_data.get('smiles', cc_smiles) is not None)
    if can_search_pubchem:
        try:
            p = find_pubchem_from_ids(
                pubchem=init_data.get('pubchem'),
                CASRN=init_data.get('CASRN', cc_CAS),
                inchi=init_data.get('inchi', cc_inchi),
                inchikey=init_data.get('inchikey', cc_inchikey),
                smiles=init_data.get('smiles', cc_smiles),
                use_cache=use_cache)
        except Exception as e:
            # A failed lookup is reported on stdout and treated as "not found".
            p = None
            print(e, 'exception')
        if p is not None:
            cid, iupac_name, p_MW, p_inchi, p_inchikey, p_smiles, p_formula, p_synonyms = p
    # print(locals())
    mol = None
    # Be aware some smiles descriptions are wrong
    # Start with user overriding
    if 'mol' in init_data:
        mol = Chem.MolFromMolFile(init_data['mol'])
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        # InChI strings lacking the "InChI=1S/" prefix get it prepended.
        mol = MolFromInchi(
            init_data['inchi']) if init_data['inchi'].startswith(
                "InChI=1S/") else MolFromInchi("InChI=1S/" + init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = MolFromInchi(cc_inchi) if cc_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + cc_inchi)
    # Did we pull up the structure from pubchem??
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = MolFromInchi(p_inchi) if p_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + p_inchi)
    if mol is None:
        raise ValueError("No structure found")
    # Derive identifiers and properties from the selected structure.
    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    #MW = Descriptors.ExactMolWt(mol)
    formula = CalcMolFormula(mol, True, True)
    formula = serialize_formula(formula)
    MW = molecular_weight(nested_formula_parser(formula))
    # print(inchi, cc_inchi, p_inchi)
    # print(inchikey, cc_inchikey, p_inchikey)
    # print(smiles, cc_smiles, p_smiles)
    # output values
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        cid = -1
    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")
    if 'formula' in init_data:
        # Override rdkit
        formula = init_data['formula']
    if 'MW' in init_data:
        # Override rdkit
        MW = init_data['MW']
    if 'smiles' in init_data:
        smiles = init_data['smiles']
    if 'inchi' in init_data:
        inchi = init_data['inchi']
    if 'inchikey' in init_data:
        inchikey = init_data['inchikey']
    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")
    # Do we have a name specified in the settings?
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")
    # Merge synonyms from every source, drop duplicates and the name itself.
    synonyms = []
    if cc_synonyms is not None:
        synonyms += cc_synonyms
    if cc_deprecated_CASs is not None:
        synonyms += cc_deprecated_CASs
    if p_synonyms is not None:
        synonyms += p_synonyms
    if 'synonyms' in init_data:
        synonyms += init_data['synonyms']
    synonyms = list(set(synonyms))
    if name in synonyms:
        synonyms.remove(name)
    if synonyms:
        # Shortest first, ties broken case-insensitively; this also makes the
        # order deterministic after the set() round-trip above.
        def key_sort_str(s):
            return len(s), s.lower()
        synonyms = sorted(synonyms, key=key_sort_str)
    # synonyms = natsorted(synonyms)
    # synonyms = []
    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms
    }
def smiles_from_inchi(inchi):
    """Return the isomeric SMILES for the molecule parsed from ``inchi``."""
    return MolToSmiles(MolFromInchi(inchi), isomericSmiles=True)