def check_if_two_molecules_are_equal_from_smiles(smiles1, smiles2):
    """Return True when two SMILES strings yield identical Morgan fingerprints.

    Both strings are parsed with ``MolFromSmiles`` and fingerprinted with
    ``AllChem.GetMorganFingerprint`` (radius 1, ``useFeatures=True``,
    ``useChirality=True``). The molecules are reported as equal only when the
    Tanimoto similarity of the two fingerprints equals 1.
    """
    # Parse both inputs first, then fingerprint, mirroring the original order.
    parsed = [MolFromSmiles(smiles) for smiles in (smiles1, smiles2)]
    first_fp, second_fp = (
        AllChem.GetMorganFingerprint(
            mol, 1, useFeatures=True, useChirality=True)
        for mol in parsed
    )
    return DataStructs.TanimotoSimilarity(first_fp, second_fp) == 1
def fingerprint_features(smile_string, radius=2, size=2048):
    """Build a Morgan bit-vector fingerprint from a SMILES string.

    The parsed molecule is renumbered with the order returned by
    ``rdmolfiles.CanonicalRankAtoms`` before the fingerprint is computed.

    Args:
        smile_string: SMILES representation of the molecule.
        radius: Radius passed to the Morgan fingerprint.
        size: Number of bits of the returned vector (``nBits``).

    Returns:
        The result of ``rdMolDescriptors.GetMorganFingerprintAsBitVect`` with
        chirality and bond types enabled and feature invariants disabled.
    """
    parsed = MolFromSmiles(smile_string)
    canonical_order = rdmolfiles.CanonicalRankAtoms(parsed)
    renumbered = rdmolops.RenumberAtoms(parsed, canonical_order)
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        renumbered,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
def check_similarity_between_generic_and_complete_representation(
        generic_smiles, complete_smiles):
    """Tell whether the generic pattern occurs in the complete molecule.

    Both strings are first passed through ``NeutraliseCharges``. The complete
    representation is then parsed as SMILES, the generic one as SMARTS, and a
    substructure match of the generic pattern is searched in the complete
    molecule.

    Returns:
        True if ``GetSubstructMatch`` returns a non-empty match, else False.
    """
    # Keep the original call order: complete first, then generic.
    neutral_complete, _ = NeutraliseCharges(complete_smiles)
    neutral_generic, _ = NeutraliseCharges(generic_smiles)
    target = MolFromSmiles(neutral_complete)
    pattern = MolFromSmarts(neutral_generic)
    return bool(target.GetSubstructMatch(pattern))
def test_nonexistent_mordred_descriptors(self):
    """Invalid Mordred descriptor names must raise MordredCalculatorError."""
    methane = MolFromSmiles("C")
    invalid_names = ("", "ReallyInvalidDescriptorName")
    for name in invalid_names:
        # A fresh Descriptor is built for every name, outside the assertion.
        descriptor = Descriptor()
        requested_type = f"mordred:{name}"
        with self.assertRaises(MordredCalculatorError):
            descriptor.make_fingerprint(
                molecule_graph=methane,
                fingerprint_type=requested_type,
            )
def IsCorrectSMILES(smiles):
    """Return 1 if RDKit can parse and sanitize ``smiles``, otherwise 0.

    Any exception raised while parsing is treated the same as a failed parse.
    """
    try:
        parsed = MolFromSmiles(smiles, sanitize=True)
    except Exception:
        return 0
    return 0 if parsed is None else 1
def _filter_by_mass_and_rt(
    self,
    possible_ranges: List[Tuple[float, float, str, str]],
    cpd_info: List[Tuple[str]],
) -> Tuple[Optional[str], Dict]:
    """Match a compound against peak mass ranges and, optionally, retention time.

    Parameters
    ----------
    possible_ranges : List[Tuple[float, float, str, str]]
        Entries of (lower mass bound, upper mass bound, peak ID, adduct name).
    cpd_info : List[Tuple[str]]
        Compound ID at index 0 and SMILES at index 1.

    Returns
    -------
    c_id_if_matched : str, optional
        The compound ID if at least one range matched, otherwise None.
    cpd_dict : Dict
        ``"Predicted_RT"``, ``"Matched_Peak_IDs"`` and ``"Matched_Adducts"``
        collected while scanning the ranges.

    Raises
    ------
    ValueError
        If retention-time filtering is on and the dataset returns a falsy
        retention time for a peak whose mass range matched.
    """
    matched_id = None
    summary = {"Predicted_RT": None, "Matched_Peak_IDs": [], "Matched_Adducts": []}

    compound_id, smiles = cpd_info[0], cpd_info[1]
    exact_mass = ExactMolWt(MolFromSmiles(smiles))
    rt_estimate = None

    for mass_range in possible_ranges:
        # Strictly-inside test on the mass window; anything else is skipped.
        if not (mass_range[0] < exact_mass < mass_range[1]):
            continue

        peak_id = mass_range[2]
        adduct = mass_range[3]

        if self.filter_by_rt:
            # Predict lazily and retry on later ranges while the value is falsy.
            rt_estimate = rt_estimate or self._predict_rt(smiles)
            if not rt_estimate:
                # No usable prediction for this compound: skip this range.
                continue

            measured_rt = self.metabolomics_dataset.get_rt(peak_id)
            if not measured_rt:
                raise ValueError(f"No retention time found for peak, {peak_id}")

            summary["Predicted_RT"] = rt_estimate
            if abs(measured_rt - rt_estimate) > self.rt_threshold:
                # Outside the tolerance: the peak is not recorded as a match.
                continue

        matched_id = compound_id
        summary["Matched_Peak_IDs"].append(peak_id)
        summary["Matched_Adducts"].append(adduct)

    return matched_id, summary
def save(vertices, edges, out='out.png'):
    """Render the molecule encoded by ``vertices``/``edges`` to an image file.

    The graph is converted to a SMILES string with ``deprocess`` and drawn
    with RDKit at 800x800.

    Args:
        vertices: Vertex data understood by ``deprocess``.
        edges: Edge data understood by ``deprocess``.
        out: Path of the image file to write.

    Returns:
        The SMILES string that was drawn.

    Raises:
        ValueError: If ``deprocess`` returns an empty string, or if RDKit
            does not produce a usable molecule from the SMILES.
    """
    from rdkit.Chem import Draw, MolFromSmiles

    s = deprocess(vertices, edges)
    # Validate before parsing: an empty string can never be drawn.
    if s == '':
        raise ValueError('deprocess returned an empty SMILES string')

    m = MolFromSmiles(s)
    if not m:
        raise ValueError('RDKit could not parse SMILES: {!r}'.format(s))

    Draw.MolToFile(m, out, size=(800, 800))
    return s
def calculate_pIC50(mols):
    """Score each SMILES string in ``mols`` with an ``MPNNReward``.

    Every SMILES is parsed, converted to a networkx graph with
    ``convert_rdkit_to_nx`` and passed to ``MPNNReward._call``. A new reward
    object is created for each molecule from the module-level ``model``,
    ``atom_types`` and ``bond_types``.

    Returns:
        A list with one score per input, in input order.
    """
    scores = []
    for smiles in mols:
        graph = convert_rdkit_to_nx(MolFromSmiles(smiles))
        scorer = MPNNReward(
            model,
            atom_types=atom_types,
            bond_types=bond_types,
            maximize=False,
        )
        scores.append(scorer._call(graph))
    return scores
def __compound_to_dir__(compound):
    """Write a compound's SMILES and molfile to disk and register it in the XML tree.

    ``compound`` is indexed with the keys ``"Compound Id"`` and ``"smiles"``.
    Two files named ``smiles`` and ``molfile`` are written relative to the
    current working directory, and a ``Compound`` element (with ``Id`` and
    ``Cargos`` children) is appended to the module-level ``root`` element.
    """
    # NOTE(review): __mkd__ is defined elsewhere. Presumably it creates the
    # compound's directory, switches into it and returns the directory to go
    # back to -- confirm against its definition.
    compounds_dir = __mkd__(f'{compound["Compound Id"]}')
    # Both files are opened with relative names, so they land in whatever the
    # working directory is after the __mkd__ call.
    with open('smiles', 'w') as f:
        f.write(compound["smiles"])
    with open('molfile', 'w') as f:
        mol = MolFromSmiles(compound["smiles"])
        f.write(MolToMolBlock(mol))
    # Change the working directory to the path returned by __mkd__.
    os.chdir(compounds_dir)
    # Record the compound and the names of the two files in the XML tree.
    comp = ET.SubElement(root, "Compound")
    ET.SubElement(comp, "Id").text = compound["Compound Id"]
    ET.SubElement(comp, "Cargos").text = "smiles molfile"
def tensorize(junc_tree_batch, vocab, use_graph_conv, assm=True):
    """Convert a batch of junction trees into encoder inputs.

    Node IDs are assigned with ``set_batch_nodeID``, the trees are tensorized
    with ``JTNNEncoder`` and a logP value is computed for every tree's SMILES.
    The molecular graphs are tensorized with ``MolGraphEncoder`` when
    ``use_graph_conv`` is truthy and with ``MessPassNet`` otherwise.

    Returns:
        When ``assm is False``: ``(junc_tree_batch, jtenc_holder, mol_holder)``.
        Otherwise: ``(junc_tree_batch, jtenc_holder, mol_holder,
        (candidate_holder, cand_batch_idx), prop_batch)``, where
        ``cand_batch_idx`` is a ``torch.LongTensor`` holding, for each
        candidate, the index of the tree it came from.
    """
    set_batch_nodeID(junc_tree_batch, vocab)
    smiles_batch = [tree.smiles for tree in junc_tree_batch]
    jtenc_holder, mess_dict = JTNNEncoder.tensorize(junc_tree_batch)
    prop_batch = [
        Descriptors.MolLogP(MolFromSmiles(smi)) for smi in smiles_batch
    ]

    if use_graph_conv:
        molenc_holder = MolGraphEncoder.tensorize(smiles_batch)
        if assm is False:
            return junc_tree_batch, jtenc_holder, molenc_holder

        candidate_smiles = []
        tree_of_candidate = []
        for tree_idx, tree in enumerate(junc_tree_batch):
            for node in tree.nodes:
                # A leaf's attachment follows from its neighbour, and a single
                # candidate leaves nothing to choose: skip both.
                if node.is_leaf or len(node.candidates) == 1:
                    continue
                candidate_smiles.extend(node.candidates)
                tree_of_candidate.extend([tree_idx] * len(node.candidates))

        cand_molenc_holder = MolGraphEncoder.tensorize(candidate_smiles)
        cand_batch_idx = torch.LongTensor(tree_of_candidate)
        return (junc_tree_batch, jtenc_holder, molenc_holder,
                (cand_molenc_holder, cand_batch_idx), prop_batch)

    mpn_holder = MessPassNet.tensorize(smiles_batch)
    if assm is False:
        return junc_tree_batch, jtenc_holder, mpn_holder

    candidates = []
    tree_of_candidate = []
    for tree_idx, tree in enumerate(junc_tree_batch):
        for node in tree.nodes:
            # Same skip rule as in the graph-convolution branch.
            if node.is_leaf or len(node.candidates) == 1:
                continue
            for candidate in node.candidates:
                candidates.append((candidate, tree.nodes, node))
            tree_of_candidate.extend([tree_idx] * len(node.candidates))

    jtmpn_holder = JTMessPassNet.tensorize(candidates, mess_dict)
    cand_batch_idx = torch.LongTensor(tree_of_candidate)
    return (junc_tree_batch, jtenc_holder, mpn_holder,
            (jtmpn_holder, cand_batch_idx), prop_batch)
def test_bad_descriptors_padelpy_descriptors(self):
    """Invalid padelpy descriptor names must raise RuntimeError."""
    methane = MolFromSmiles("C")
    invalid_names = ("", "ReallyInvalidDescriptorName")
    for name in invalid_names:
        # A fresh Descriptor is built for every name, outside the assertion.
        descriptor = Descriptor()
        requested_type = f"padelpy:{name}"
        with self.assertRaises(RuntimeError):
            descriptor.make_fingerprint(
                molecule_graph=methane,
                fingerprint_type=requested_type,
                fingerprint_params={'timeout': 2},
            )
def fingerprints():
    """Return Morgan bit-vector fingerprints for ``self.features`` as an array.

    ``self``, ``bond_radius`` and ``nBits`` are not parameters; they are taken
    from the enclosing scope.
    """
    bit_vectors = []
    for smiles in self.features:
        mol = MolFromSmiles(smiles)
        bit_vectors.append(
            AllChem.GetMorganFingerprintAsBitVect(
                mol, bond_radius, nBits=nBits))
    return np.asarray(bit_vectors)
def smiles_validator(smiles):
    """Validate that ``smiles`` is a non-empty, RDKit-parsable SMILES string.

    Args:
        smiles: Value to validate.

    Returns:
        True when ``MolFromSmiles`` yields an RDKit ``Mol`` for the value.

    Raises:
        ValueError: If the value is a number, is None, is not a string, is
            blank, or cannot be parsed by RDKit. Non-string values such as
            lists previously escaped as ``AttributeError`` from ``.strip()``;
            they are now rejected with ``ValueError`` like every other
            invalid input.
    """
    if isinstance(smiles, numbers.Number):
        raise ValueError("Molecules must be valid SMILES notation not integer.")
    if smiles is None:
        raise ValueError("smiles field must not be empty")
    # Reject every remaining non-string up front so .strip() is never called
    # on something that may not have it.
    if not isinstance(smiles, str):
        raise ValueError("smiles field must be a string")
    if smiles.strip() == "":
        raise ValueError("smiles field must not be empty")
    if isinstance(MolFromSmiles(smiles), rdkit.Chem.rdchem.Mol):
        return True
    raise ValueError("Molecules must be valid SMILE notation of chemical.")
def is_correct_smiles(smiles):
    """
    Report whether RDKit accepts ``smiles`` as a valid molecule.

    Returns 1 when the string parses and sanitizes, and 0 when it is empty,
    fails to parse, or raises during parsing.
    """
    if smiles == "":
        return 0
    try:
        parsed = MolFromSmiles(smiles, sanitize=True)
    except Exception:
        return 0
    return 0 if parsed is None else 1
def depict(self, filename=None, ipython=False):
    """Draw this object's atoms, environment atoms and bonds as an SVG.

    A scratch ``RWMol`` is seeded with a single placeholder carbon, the atoms
    from ``self.atoms`` and ``self.enviroments`` are copied in, the bonds from
    ``self.Bonds`` and ``self.bondsenvironments`` are re-created using the
    remapped indices, and the placeholder is removed before drawing.
    Environment atoms have their chiral tag cleared and aromatic flag unset
    before they are copied.

    Args:
        filename: If not None, the SVG text is written to this path.
        ipython: If truthy, return an ``IPython.display.SVG`` object.

    Returns:
        An ``SVG`` object when ``ipython`` is truthy, otherwise None.
    """
    # Kept although unused by name: the original imported it unconditionally,
    # so any import-time effect it has is preserved.
    from rdkit.Chem.Draw import IPythonConsole  # noqa: F401
    from rdkit.Chem.Draw import rdMolDraw2D
    from rdkit.Chem.AllChem import EmbedMolecule
    from IPython.display import SVG
    from rdkit.Chem import RWMol, MolFromSmiles, ChiralType

    # The placeholder atom occupies index 0, so copied atoms start at 1.
    rmol = RWMol(MolFromSmiles('C'))
    dict_old_new_idx = {}
    next_idx = 1

    for atom in self.atoms:
        old_idx = atom.GetIdx()
        rmol.AddAtom(atom)
        dict_old_new_idx[old_idx] = next_idx
        next_idx += 1

    for atom in self.enviroments:
        old_idx = atom.GetIdx()
        atom.SetChiralTag(ChiralType.CHI_UNSPECIFIED)
        atom.SetIsAromatic(0)
        rmol.AddAtom(atom)
        dict_old_new_idx[old_idx] = next_idx
        next_idx += 1

    for bond_group in (self.Bonds, self.bondsenvironments):
        for bond in bond_group:
            rmol.AddBond(dict_old_new_idx[bond.GetBeginAtomIdx()],
                         dict_old_new_idx[bond.GetEndAtomIdx()],
                         bond.GetBondType())

    # Drop the placeholder now that all real atoms and bonds are in place.
    rmol.RemoveAtom(0)
    EmbedMolecule(rmol)

    drawer = rdMolDraw2D.MolDraw2DSVG(400, 200)
    drawer.DrawMolecule(rmol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()

    if filename is not None:
        # Context manager closes the handle even if the write fails.
        with open(filename, 'w') as handle:
            handle.write(svg)

    if ipython:
        return SVG(svg.replace('svg:', ''))
    return None
def batch_diversity(smiles, train_smiles):
    """
    Score each SMILES in ``smiles`` against a random sample of training SMILES.

    Up to 100 training SMILES are drawn without replacement and converted to
    Morgan bit-vector fingerprints (radius 4, 2048 bits). Each input is then
    evaluated with ``apply_to_valid(s, diversity, fps=fps)``.

    Args:
        smiles: Iterable of SMILES strings to score.
        train_smiles: Sequence of training SMILES to sample from. Fewer than
            100 entries is allowed; the whole set is then used.

    Returns:
        A list with one value per entry of ``smiles``.
    """
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(100, len(train_smiles))
    rand_smiles = random.sample(train_smiles, sample_size)
    rand_mols = [MolFromSmiles(s) for s in rand_smiles]
    # Unparsable training SMILES come back as None and cannot be
    # fingerprinted, so they are left out of the reference set.
    fps = [
        Chem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048)
        for m in rand_mols
        if m is not None
    ]
    return [apply_to_valid(s, diversity, fps=fps) for s in smiles]
def generate_drug_list():
    """Rewrite the drug list keeping only rows whose SMILES RDKit can parse.

    Reads ``drug_list_copy.csv`` from ``DRUG_LIST_PATH``, drops the
    ``'Unnamed: 0'`` entry from every row, keeps the rows whose ``'smiles'``
    value parses, writes them to ``drug_list.csv`` in the same directory and
    re-reads that file to assert that every stored SMILES parses.
    """
    source_path = os.path.join(DRUG_LIST_PATH, 'drug_list_copy.csv')
    source_frame = pd.read_csv(source_path)

    kept_rows = []
    for _, row in source_frame.iterrows():
        record = dict(row)
        record.pop('Unnamed: 0')
        if MolFromSmiles(record['smiles']) is None:
            continue
        kept_rows.append(record)

    target_path = os.path.join(DRUG_LIST_PATH, 'drug_list.csv')
    pd.DataFrame(data=kept_rows).to_csv(target_path)

    # Round-trip check on the file that was just written.
    reloaded = pd.read_csv(target_path)
    unparsable = [
        MolFromSmiles(smiles) is None for smiles in reloaded['smiles']
    ]
    assert sum(unparsable) == 0
def test_hypergraph_rpe_parser_bad_smiles(self):
    """Parse and normalize every entry of ``bad_smiles`` without errors.

    On ``AssertionError`` or ``IndexError`` the offending SMILES is printed
    and the exception is re-raised so the test fails.
    """
    grammar = HypergraphGrammar()
    normalized = []
    for smile in bad_smiles:
        try:
            molecule = MolFromSmiles(smile)
            tree = hypergraph_parser(molecule)
            normalized.append(grammar.normalize_tree(tree))
        except (AssertionError, IndexError):
            print('Failed for {}'.format(smile))
            raise
def check_node_type(new_compound):
    """Select compounds that parse, have no ring larger than six, and score well.

    For every SMILES in ``new_compound`` the molecule is parsed once, the
    largest cycle in the cycle basis of its adjacency graph is measured, and
    compounds whose largest cycle has more than six atoms are discarded. The
    remaining compounds are scored with ``rdock_score`` and kept when the
    score is below ``10**10``.

    The synthetic-accessibility value the earlier version computed was never
    used and is no longer calculated.

    Args:
        new_compound: Sequence of SMILES strings.

    Returns:
        ``(node_index, score, valid_compound)``: the positions of the kept
        compounds in ``new_compound``, their scores, and their SMILES.
    """
    node_index = []
    valid_compound = []
    score = []

    for i, smiles in enumerate(new_compound):
        try:
            mol = MolFromSmiles(smiles)
        except Exception:
            # Best-effort: anything RDKit rejects outright is skipped.
            mol = None
        if mol is None:
            continue

        cycle_list = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest_cycle = max((len(cycle) for cycle in cycle_list), default=0)
        if largest_cycle > 6:
            continue

        m = rdock_score(smiles)
        if m < 10**10:
            node_index.append(i)
            valid_compound.append(smiles)
            score.append(m)

    return node_index, score, valid_compound
def scorer(smiles):
    """Compute SA, logP and ring-size scores plus a combined normalized target.

    Each SMILES is canonicalized (``MolToSmiles(..., isomericSmiles=True)``)
    and the canonical string is parsed once; the resulting molecule feeds all
    three metrics:

    * SA score: ``-sascorer.calculateScore(mol)``
    * logP: ``Descriptors.MolLogP(mol)``
    * cycle score: minus the number of atoms beyond six in the largest cycle
      of the cycle basis (0 when there is no cycle larger than six)

    Each metric is standardized over the batch (zero mean, unit standard
    deviation) and the three standardized arrays are summed. A metric that is
    constant over the batch has zero standard deviation; it then contributes
    zeros instead of NaN.

    Args:
        smiles: Sequence of SMILES strings.

    Returns:
        ``(SA_scores, logP_values, cycle_scores, targets)`` where the first
        three are lists and ``targets`` is a numpy array.
    """
    def _standardize(values):
        # Z-score over the batch, guarding the empty and constant cases.
        arr = np.array(values, dtype=float)
        if arr.size == 0:
            return arr
        spread = np.std(arr)
        if spread == 0:
            return np.zeros_like(arr)
        return (arr - np.mean(arr)) / spread

    SA_scores = []
    logP_values = []
    cycle_scores = []

    for raw_smiles in smiles:
        canonical = MolToSmiles(MolFromSmiles(raw_smiles), isomericSmiles=True)
        # Parse the canonical form once and reuse it for every metric.
        mol = MolFromSmiles(canonical)

        logP_values.append(Descriptors.MolLogP(mol))
        SA_scores.append(-sascorer.calculateScore(mol))

        cycle_list = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest_cycle = max((len(cycle) for cycle in cycle_list), default=0)
        cycle_scores.append(-max(largest_cycle - 6, 0))

    targets = (_standardize(SA_scores)
               + _standardize(logP_values)
               + _standardize(cycle_scores))
    return (SA_scores, logP_values, cycle_scores, targets)
def canonicalize_and_filter(smi_list, showprogress=False):
    """
    Deduplicate SMILES by canonical form and filter by size and logP.

    Every input string is parsed; strings RDKit cannot parse are dropped.
    Molecules are deduplicated on their canonical SMILES, and a canonical
    SMILES is kept when the molecule has more than 17 heavy atoms and a
    Crippen logP of at most 5.

    NOTE(review): the earlier docstring spoke of *dropping* molecules with
    more than 17 heavy atoms, but the code has always *kept* only those. The
    code's behaviour is preserved here; confirm which one is intended.

    Args:
        smi_list: Iterable of SMILES strings.
        showprogress: When truthy, print progress messages and wrap the loops
            in ``tqdm``.

    Returns:
        List of canonical SMILES that pass the filter, in first-seen order.
    """
    def _progress(items):
        # Only touch tqdm when progress output was requested.
        return tqdm(items) if showprogress else items

    if showprogress:
        print('Canonicalising mols')

    # Canonical SMILES -> first molecule seen with that canonical form.
    # Both modes store SMILES strings as keys (the non-progress path used to
    # store Mol objects, which broke the filtering step).
    unique_mols = {}
    for smi in _progress(smi_list):
        mol = MolFromSmiles(smi)
        if mol is not None:
            unique_mols.setdefault(MolToSmiles(mol), mol)

    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(unique_mols)))
        print('Filtering by n_heavy and logP:')

    final_list = []
    for smi, mol in _progress(list(unique_mols.items())):
        if mol.GetNumHeavyAtoms() > 17 and Crippen.MolLogP(mol) <= 5:
            final_list.append(smi)
    return final_list
def logp(smiles):
    """Return the logP of ``smiles`` standardized with LOGP_MEAN and LOGP_STD.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        ``(MolLogP - LOGP_MEAN) / LOGP_STD``, or ``-np.inf`` when the
        descriptor cannot be computed. In that case the parsed molecule and
        the input string are printed.
    """
    mol = MolFromSmiles(smiles)
    try:
        log_p = Descriptors.MolLogP(mol)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt and SystemExit
        # still propagate.
        print(mol)
        print(smiles)
        return -np.inf
    return (log_p - LOGP_MEAN) / LOGP_STD
def contains(self, smiles):
    """
    Check whether the given SMILES is a substructure of ``self.rdmol``.

    The argument is converted with ``str`` and parsed with ``MolFromSmiles``
    before matching.

    Returns
    -------
    contains : boolean
        Result of ``self.rdmol.HasSubstructMatch`` for the parsed query.
    """
    query = MolFromSmiles(str(smiles))
    return self.rdmol.HasSubstructMatch(query)
def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
    """Build a table of RDKit properties and model scores for ``smiles``.

    Features come from ``self.preprocessor.transform``. For every molecule the
    InChIKey, exact weight, logP and H-bond donor/acceptor counts are
    collected. ``safety`` and ``feasibility`` come from the corresponding
    models' ``predict``; ``bbbp`` is element 1 of each row returned by
    ``self.bbbp.predict_proba``.

    With ``only_drugs`` truthy, rows are kept only if weight < 500,
    hdonors <= 5, hacceptors <= 10, logp <= 5, safety > 0.75 and
    feasibility > 0.75.

    Raises:
        ValueError: If any SMILES does not yield a usable molecule.
    """
    features = self.preprocessor.transform(smiles)

    # RDKit molecular properties, one entry per input molecule.
    keys, weights, logps, donors, acceptors = [], [], [], [], []
    for example in smiles:
        mol = MolFromSmiles(example)
        if not mol:
            raise ValueError("Malformed molecule passed in to analyze")
        keys.append(MolToInchiKey(mol))
        weights.append(ExactMolWt(mol))
        logps.append(MolLogP(mol))
        donors.append(NumHDonors(mol))
        acceptors.append(NumHAcceptors(mol))

    # Model scores.
    safety = self.safety.predict(features)
    feasibility = self.feasibility.predict(features)
    bbbp = self.bbbp.predict_proba(features)

    dataframe = pd.DataFrame(
        {
            "key": keys,
            "smiles": smiles,
            "weight": weights,
            "logp": logps,
            "hdonors": donors,
            "hacceptors": acceptors,
            "safety": safety,
            "feasibility": feasibility,
            "bbbp": [row[1] for row in bbbp],
        }
    )

    if only_drugs:
        # Applied one after another, in this order: Lipinski-style limits
        # first, then the toxicity and feasibility cut-offs.
        criteria = (
            ("weight", lambda col: col < 500),
            ("hdonors", lambda col: col <= 5),
            ("hacceptors", lambda col: col <= 10),
            ("logp", lambda col: col <= 5),
            ("safety", lambda col: col > 0.75),
            ("feasibility", lambda col: col > 0.75),
        )
        for column, keep in criteria:
            dataframe = dataframe[keep(dataframe[column])]

    dataframe = dataframe.reset_index(drop=True)
    return dataframe
def make_mass_spectra(smiles_list):
    """Show a histogram of the exact masses of the given molecules.

    Each SMILES is parsed and its exact molecular weight computed; the
    weights are binned into unit-wide bins over ``range(500)`` and displayed
    with matplotlib.

    The previously computed (and unused) minimum and maximum mass have been
    removed, so an empty ``smiles_list`` now yields an empty plot instead of
    raising ``ValueError``.

    Args:
        smiles_list: Iterable of SMILES strings.
    """
    weights = [ExactMolWt(MolFromSmiles(smiles)) for smiles in smiles_list]

    # Bar graph of the masses simulated by MOD.
    plt.hist(weights, bins=range(500))
    plt.xlabel("Exact Mass")
    plt.ylabel("Frequency")
    plt.title(
        "Mass spectra of the molecules simulated in the reaction network.")
    plt.show()
def get_target_data(data, target_id, act_type='IC50'):
    """Return the ligand rows for one target, filtered by validity and weight.

    Rows are restricted to ``act_type`` (unless it is None) and to
    ``target_id``. Rows whose SMILES RDKit cannot turn into a molecule are
    dropped, and of the rest only those with an exact molecular weight
    between 100 and 600 (inclusive) are kept.

    Args:
        data: DataFrame with ``act_type``, ``target_id`` and ``smiles``
            columns.
        target_id: Value to match in the ``target_id`` column.
        act_type: Value to match in the ``act_type`` column, or None to skip
            that filter.

    Returns:
        The filtered DataFrame. When nothing matches, an empty frame with the
        original columns is returned.
    """
    if act_type is not None:
        data = data[data.act_type == act_type]
    target_data = data[data.target_id == target_id]

    # Parse every SMILES once; the molecules are reused for the weights.
    mols = []
    for smiles in target_data.smiles:
        try:
            mols.append(MolFromSmiles(smiles))
        except Exception:
            # Non-string entries (e.g. NaN) make RDKit raise; treat them like
            # unparsable SMILES.
            mols.append(None)

    # Use numpy boolean masks: an empty Python list would be interpreted by
    # pandas as "select no columns" rather than "select no rows".
    is_valid = np.array([mol is not None for mol in mols], dtype=bool)
    target_data = target_data[is_valid]

    weights = np.array(
        [ExactMolWt(mol) for mol in mols if mol is not None], dtype=float)
    target_data = target_data[(weights >= 100) & (weights <= 600)]
    return target_data
def canonicalize(mol_list, showprogress=False):
    """
    Return the unique molecules from a list of RDKit molecules.

    ``None`` entries are skipped. The rest are converted to canonical SMILES,
    deduplicated as strings, and parsed back into molecules. With
    ``showprogress`` truthy a message is printed and both passes are wrapped
    in ``tqdm``.
    """
    if showprogress:
        print('Canonicalising mols')
        source = tqdm(mol_list)
    else:
        source = mol_list

    smi_list = [MolToSmiles(mol) for mol in source if mol is not None]
    unique_smiles = list(set(smi_list))

    pending = tqdm(unique_smiles) if showprogress else unique_smiles
    return [MolFromSmiles(smi) for smi in pending]
def smiles_to_bits(smiles, nBits):
    """Convert SMILES strings into a DataFrame of Morgan fingerprint bits.

    Each string is parsed and fingerprinted with
    ``AllChem.GetMorganFingerprintAsBitVect`` (radius 2, ``nBits`` bits). Every
    fingerprint is copied into an ``int8`` numpy array with
    ``DataStructs.ConvertToNumpyArray`` and the arrays become the rows of the
    returned DataFrame.
    """
    parsed = [MolFromSmiles(s) for s in smiles]
    bit_vectors = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits)
        for mol in parsed
    ]

    rows = []
    for bit_vector in bit_vectors:
        row = np.zeros((1, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(bit_vector, row)
        rows.append(row)

    return pd.DataFrame(rows)
def find_ptn_from(self, node, ptn_smiles):
    """Find a pattern match that is adjacent to ``node`` but excludes it.

    The pattern is parsed from ``ptn_smiles`` and matched against
    ``self.mol``. Matches containing ``node`` are ignored. The first remaining
    match with at least one atom joined to ``node`` by an edge of ``self.G``
    is returned.

    Returns:
        The matching tuple of atom indices, or None if there is none.
    """
    pattern = MolFromSmiles(ptn_smiles)
    for atom_ids in self.mol.GetSubstructMatches(pattern):
        if node in atom_ids:
            continue
        if any(self.G.has_edge(node, other) for other in atom_ids):
            return atom_ids
    return None
def test_remove_stereo():
    """Check ``Filters.remove_stereo`` on a SMILES and an InChI input.

    Also checks the SMILES produced after ``Filters.commute_inchi`` is applied
    to the stereo-free InChI-derived molecule.
    """
    chiral_lactate = MolFromSmiles('C[C@@H](C(=O)[O-])O')
    flattened = Filters.remove_stereo(chiral_lactate)
    assert MolToSmiles(flattened) == 'CC(O)C(=O)[O-]'

    from_inchi = MolFromInchi(
        'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    )
    flattened = Filters.remove_stereo(from_inchi)
    expected_flat = 'OC1=NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(O)=Nc2ccccc21'
    assert MolToSmiles(flattened) == expected_flat

    # The InChI round trip is expected to change the tautomeric form.
    commuted = Filters.commute_inchi(flattened)
    expected_commuted = 'O=C1NC(C2=CNC3=C2C=C(O)C=C3)=CC1=C1C(=O)NC2=CC=CC=C21'
    assert MolToSmiles(commuted) == expected_commuted