def get_SMILES_objects(mols):
    """Convert RDKit molecule(s) into canonical SMILES.

    :param mols: a single ``rdkit.Chem.rdchem.Mol`` or a list of them
    :return: a list of canonical SMILES strings for a list input, a single
        canonical SMILES string for a single molecule, and ``None`` for any
        other input type
    """
    if isinstance(mols, list):
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of list.
        return [CanonSmiles(MolToSmiles(mol)) for mol in mols]
    # The molecule class is recognised through its printed type name, so the
    # class itself does not have to be in scope here.
    if str(type(mols)) == "<class 'rdkit.Chem.rdchem.Mol'>":
        return CanonSmiles(MolToSmiles(mols))
    return None
def _canonicalize_smiles(smiles: str) -> None | str: """Attempt to canonicalize a **smiles** string.""" try: return CanonSmiles(smiles) except Exception as ex: warn = RuntimeWarning(f"Failed to canonicalize {smiles!r}") warn.__cause__ = ex warnings.warn(warn) return None
def compare_mollists(smiles, reference, canonicalize=True):
    """ return the molecules of ``smiles`` that do not occur in ``reference``

    The start / end tokens ``^`` and ``$`` and surrounding whitespace are removed from every string first.

    :param smiles: {list} SMILES strings to look up in ``reference``
    :param reference: {list} SMILES strings of the reference molecules
    :param canonicalize: {bool} whether both lists are canonicalized before the comparison; strings for which
        ``MolFromSmiles`` returns nothing are dropped in that case
    :return: {list} unique molecules of ``smiles`` that are missing in ``reference``, as SMILES strings
    """
    def _strip_tokens(entries):
        return [entry.replace('^', '').replace('$', '').strip() for entry in entries]

    def _unique(entries):
        if not canonicalize:
            return set(entries)
        return {CanonSmiles(entry, 1) for entry in entries if MolFromSmiles(entry)}

    smiles = _strip_tokens(smiles)
    reference = _strip_tokens(reference)
    candidates = _unique(smiles)
    known = _unique(reference)
    return [candidate for candidate in candidates if candidate not in known]
def is_valid_mol(smiles, return_smiles=False): """ function to check a generated SMILES string for validity """ try: m = CanonSmiles(smiles.strip(), 1) except: m = None if return_smiles: return m is not None, m else: return m is not None
def process(s, q):
    """Preprocess a batch of SMILES strings and put the canonical forms on ``q``.

    The batch is passed through ``keep_longest`` and ``harmonize_sc`` and every
    resulting string is canonicalized with ``CanonSmiles`` using the
    ``stereochem`` setting taken from the enclosing scope.

    :param s: batch of SMILES strings, handed to ``keep_longest``
    :param q: object with a ``put`` method that receives one list holding, per
        preprocessed string, the canonical SMILES or ``None`` if it could not
        be canonicalized
    """
    smls = harmonize_sc(keep_longest(s))
    mols = []
    for smi in smls:
        try:
            mols.append(CanonSmiles(smi, stereochem))
        except Exception:
            # A placeholder keeps the output aligned with the preprocessed
            # input. Only ``Exception`` is caught so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            print("Error! Can not process SMILES string %s" % smi)
            mols.append(None)
    q.put(mols)
def is_valid_mol(smiles, return_smiles=False): """ function to check a generated SMILES string for validity :param smiles: {str} SMILES string to be checked :param return_smiles: {bool} whether the checked valid SMILES string should be returned :return: {bool} validity """ try: m = CanonSmiles(smiles.replace('^', '').replace('$', '').strip(), 1) except: m = None if return_smiles: return m is not None, m else: return m is not None
def test_pass(self, df: pd.DataFrame, column_levels: int, column_padding: Hashable) -> None:
    """Check the output of ``sanitize_smiles_df`` for an input that should pass."""
    sanitized = sanitize_smiles_df(df, column_levels, column_padding)

    # The frame and both of its axes are compared by identity against the
    # input (``invert=True`` presumably negates the check, i.e. "is not").
    assertion.is_(sanitized, df, invert=True)
    assertion.is_(sanitized.columns, df.columns, invert=True)
    assertion.is_(sanitized.index, df.index, invert=True)

    assertion.eq(len(sanitized.columns.levels), column_levels)

    # Canonicalizing the index once more must not change it.
    canonical_index = [CanonSmiles(smiles) for smiles in sanitized.index]
    np.testing.assert_array_equal(canonical_index, sanitized.index)

    # Levels beyond those of the input must consist of the padding value.
    if isinstance(df.columns, pd.MultiIndex):
        offset = len(df.columns.levels)
    else:
        offset = 1
    for padded_level in sanitized.columns.levels[offset:]:
        np.testing.assert_array_equal(padded_level, column_padding)
def decorate_scaffold(scaffold, sidechains, num=10):
    """ Decorate a given scaffold containing marked attachment points ([*]) randomly with the given side chains

    :param scaffold: {str} smiles string of a scaffold with attachment points marked as [*]
    :param sidechains: {str} point-separated side chains as smiles strings
    :param num: {int} number of unique molecules to generate
    :return: {list} up to ``num`` strings of the form ``<side chains>.<scaffold>>><product>``; fewer are returned
        once 50 attempts in total gave an invalid or an already generated product
    """
    # Shift the ring labels of the side chains above the highest ring label of the scaffold, so that an inserted
    # side chain cannot close a ring that is still open in the scaffold.
    # NOTE: every digit of ``sidechains`` is treated as a ring label, also digits inside brackets
    # (isotopes, H counts, charges); label 0 is mapped onto the highest scaffold label.
    ascii_digits = '0123456789'
    scaffold_labels = [char for char in scaffold if char in ascii_digits]
    if scaffold_labels:
        ring_scaff = int(max(scaffold_labels))  # highest ring label used in the scaffold
        relabel = {}
        for digit in set(sidechains).intersection(ascii_digits):
            shifted = ring_scaff + int(digit)
            # ring labels above 9 have to be written as %nn in SMILES
            relabel[ord(digit)] = str(shifted) if shifted < 10 else '%' + str(shifted)
        # one pass over the string, so that a label that was already shifted is never shifted again
        sidechains = sidechains.translate(relabel)

    # do the decoration
    mols = []
    seen = set()  # canonical products generated so far
    template = scaffold.replace('[*]', '*')
    schns = sidechains.split('.')
    invalcntr = 0
    while len(mols) < num and invalcntr < 50:
        scaff = template
        while '*' in scaff:
            # fill the attachment points from left to right, one random side chain each
            scaff = scaff.replace('*', np.random.choice(schns, replace=False), 1)
        canonical = CanonSmiles(scaff) if is_valid_mol(scaff) else None
        if canonical is not None and canonical not in seen:
            seen.add(canonical)
            reaction = sidechains + "." + scaffold + ">>" + canonical
            print(reaction)
            mols.append(reaction)
        else:
            # invalid products and duplicates both count towards the attempt limit
            invalcntr += 1
    return mols
def __init__(
    self,
    filter_name: str,
    met_data_name: str,
    met_data_path: str,
    possible_adducts: List[str],
    mass_tolerance: float,
    rt_predictor: RandomForestRegressor = None,
    rt_threshold: float = None,
    rt_important_features: List[str] = None,
) -> None:
    """Load metabolomics data into a MetabolomicsDataset object.

    Parameters
    ----------
    filter_name : str
        Stored as ``self._filter_name``.
    met_data_name : str
        Name given to the ``MetabolomicsDataset``.
    met_data_path : str
        CSV file with the peaks. The columns "Predicted Structure (smiles)",
        "Peak ID", "Retention Time", "Aggregate M/Z" and "Polarity" are
        read. When empty, no file is read and the dataset holds no peaks.
    possible_adducts : List[str]
        Adducts handed to the ``MetabolomicsDataset``.
    mass_tolerance : float
        Tolerance handed to the ``MetabolomicsDataset`` and to
        ``enumerate_possible_masses``.
    rt_predictor : RandomForestRegressor, optional
        Retention time predictor.
    rt_threshold : float, optional
        Retention time threshold.
    rt_important_features : List[str], optional
        Stored as ``self.rt_important_features``.
    """
    self._filter_name = filter_name
    self.met_data_name = met_data_name

    self.rt_predictor = rt_predictor
    self.rt_threshold = rt_threshold
    self.rt_important_features = rt_important_features

    # Retention time filtering needs both a predictor and a threshold.
    if self.rt_predictor and self.rt_threshold:
        self.filter_by_rt = True
        self.fp_calculator = Calculator(descriptors, ignore_3D=False)
    else:
        self.filter_by_rt = False
        self.fp_calculator = None

    if met_data_path:
        self.met_df = pd.read_csv(met_data_path).fillna("")
    else:
        self.met_df = None

    self.possible_adducts = possible_adducts
    self.mass_tolerance = mass_tolerance

    self.metabolomics_dataset = MetabolomicsDataset(
        name=self.met_data_name,
        adducts=self.possible_adducts,
        tolerance=self.mass_tolerance,
    )
    self.metabolomics_dataset.known_peaks = []
    self.metabolomics_dataset.unknown_peaks = []

    # Load Metabolomics peaks; without a data file there is nothing to load.
    if self.met_df is not None:
        for _, row in self.met_df.iterrows():
            smiles = row["Predicted Structure (smiles)"]
            if smiles:
                mol = MolFromSmiles(CanonSmiles(smiles))
                mol = neutralise_charges(mol)
                inchi_key = MolToInchiKey(mol)
            else:
                # Missing structures were turned into "" by fillna above.
                inchi_key = None

            peak = Peak(
                name=row["Peak ID"],
                r_time=row["Retention Time"],
                mz=row["Aggregate M/Z"],
                charge=row["Polarity"],
                inchi_key=inchi_key,
            )

            # A peak counts as known when a structure was given for it.
            if inchi_key:
                self.metabolomics_dataset.known_peaks.append(peak)
            else:
                self.metabolomics_dataset.unknown_peaks.append(peak)

    # Calculate possible peak masses, they get saved to object
    self.metabolomics_dataset.enumerate_possible_masses(self.mass_tolerance)
R_cnn_20 = calcLRPConvStride(l_embed, [d[143], d[144]], d_20, 20) LRPCheck(" Conv20:", R_cnn_20, np.sum(d_20), verbose) R_cnn = R_cnn_1 + R_cnn_2 + R_cnn_3 + R_cnn_4 + R_cnn_5 + R_cnn_6 + \ R_cnn_7 + R_cnn_8 + R_cnn_9 + R_cnn_10 + R_cnn_15 + R_cnn_20 LRPCheck("Deconvolution:", R_cnn, l_out, verbose) scores = np.sum(R_cnn, axis=1) return y_real[0], scores, np.sum(l_out) - np.sum(R_cnn) # Main Code smiles = CanonSmiles(smiles, useChiral=0) mol = MolFromSmiles(smiles) mw = Descriptors.ExactMolWt(mol) atoms = {a.GetIdx(): a.GetSmarts() for a in mol.GetAtoms()} impacts = np.zeros(len(atoms), dtype='float') print("Predicting %i atoms..." % (len(atoms))) vals = [] for idx, a in tqdm(atoms.items()): val, scores, _ = calcQSAR(mol, idx, mw, verbose=False) vals.append(val) impacts[idx] = scores[0] res = np.mean(vals) std = np.std(vals)
def compute_unique_smiles(self, interp_df, embedding_funct, scaled_radius=None):
    """
    Identify duplicate SMILES and distort the embeddings of generated duplicates.

    The input df must have the columns 'SMILES', 'embeddings' and
    'embeddings_dim'. The flag that separates input SMILES (False) from
    generated SMILES (True) is read by position from the column at
    position 3, with the row label used as the row position.
    NOTE(review): this presumably is the 'Generated' column and assumes a
    default integer index - confirm against the callers.

    This function does not make any assumptions about order of embeddings.
    Instead it simply orders the df by SMILES to identify the duplicates.

    The 'SMILES' and 'embeddings' columns of ``interp_df`` are overwritten.
    At most five rounds of jitter are applied for duplicates and another
    five for SMILES that do not give a molecule.

    :param interp_df: dataframe as described above
    :param embedding_funct: callable that maps the list of embeddings to the
        values stored in the 'SMILES' column
    :param scaled_radius: passed on to ``self._compute_radius``
    :return: the dataframe without the temporary 'ROMol' column
    """
    distance = self._compute_radius(scaled_radius)
    embeddings = interp_df['embeddings']
    embeddings_dim = interp_df['embeddings_dim']

    # Canonicalize once so that different spellings of the same molecule are
    # found as duplicates. The column is assigned as a whole because rows
    # yielded by iterrows() are copies and writing to them is lost.
    canonical = []
    for smile_string in interp_df['SMILES']:
        try:
            canonical.append(CanonSmiles(smile_string))
        except Exception:
            # If a SMILES cannot be canonicalized, just use the original
            canonical.append(smile_string)
    interp_df['SMILES'] = canonical

    for _ in range(5):
        smiles = interp_df['SMILES'].sort_values()
        duplicates = set()
        # After sorting, equal SMILES are neighbours.
        for idx in range(0, smiles.shape[0] - 1):
            if smiles.iat[idx] == smiles.iat[idx + 1]:
                duplicates.add(smiles.index[idx])
                duplicates.add(smiles.index[idx + 1])

        if not duplicates:
            break

        for dup_idx in duplicates:
            if interp_df.iat[dup_idx, 3]:
                # add jitter to generated molecules only
                distorted = self.addjitter(
                    embeddings[dup_idx],
                    distance,
                    cnt=1,
                    shape=embeddings_dim[dup_idx])
                embeddings[dup_idx] = distorted[0]
        interp_df['SMILES'] = embedding_funct(embeddings.to_list())
        interp_df['embeddings'] = embeddings

    # Ensure all generated molecules are valid.
    for _ in range(5):
        PandasTools.AddMoleculeColumnToFrame(interp_df, 'SMILES')
        invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

        if invalid_mol_df.empty:
            break

        for idx in invalid_mol_df.index.to_list():
            embeddings[idx] = self.addjitter(
                embeddings[idx],
                distance,
                cnt=1,
                shape=embeddings_dim[idx])[0]
        interp_df['SMILES'] = embedding_funct(embeddings.to_list())
        interp_df['embeddings'] = embeddings

    # Cleanup
    if 'ROMol' in interp_df.columns:
        interp_df = interp_df.drop('ROMol', axis=1)

    return interp_df