Esempio n. 1
0
def get_SMILES_objects(mols):
    """Convert one RDKit molecule, or a list of them, to canonical SMILES.

    :param mols: a single ``rdkit.Chem.rdchem.Mol`` or a list of molecules
    :return: a canonical SMILES string for a single molecule, a list of
        canonical SMILES strings for a list (an empty list stays empty), or
        ``None`` when ``mols`` is neither a list nor a ``Mol``
    """
    # isinstance() instead of an exact type comparison, so that list
    # subclasses are handled like plain lists.
    if isinstance(mols, list):
        return [CanonSmiles(MolToSmiles(mol)) for mol in mols]
    # The molecule class is recognised by its printed type, which keeps this
    # function independent of ``Mol`` being imported in the module.
    if str(type(mols)) == "<class 'rdkit.Chem.rdchem.Mol'>":
        return CanonSmiles(MolToSmiles(mols))
    # Unsupported input: make the historical "no result" outcome explicit.
    return None
Esempio n. 2
0
def _canonicalize_smiles(smiles: str) -> None | str:
    """Attempt to canonicalize a **smiles** string."""
    try:
        return CanonSmiles(smiles)
    except Exception as ex:
        warn = RuntimeWarning(f"Failed to canonicalize {smiles!r}")
        warn.__cause__ = ex
        warnings.warn(warn)
        return None
Esempio n. 3
0
def compare_mollists(smiles, reference, canonicalize=True):
    """ return the molecules of ``smiles`` that do not occur in ``reference``

    The characters ``^`` and ``$`` as well as surrounding whitespace are removed from every string before comparing.

    :param smiles: {list} SMILES strings to be checked against ``reference``
    :param reference: {list} SMILES strings of the reference molecules
    :param canonicalize: {bool} whether to canonicalize the SMILES before comparison; in that case strings for which
        ``MolFromSmiles`` returns a falsy result are dropped
    :return: {list} unique SMILES strings from ``smiles`` that are absent from ``reference``
    """
    def _cleaned(strings):
        # remove the '^' / '$' characters and the surrounding whitespace
        return [s.replace('^', '').replace('$', '').strip() for s in strings]

    query = _cleaned(smiles)
    known = _cleaned(reference)
    if not canonicalize:
        mols, refs = set(query), set(known)
    else:
        mols = {CanonSmiles(s, 1) for s in query if MolFromSmiles(s)}
        refs = {CanonSmiles(s, 1) for s in known if MolFromSmiles(s)}
    return [m for m in mols if m not in refs]
Esempio n. 4
0
def is_valid_mol(smiles, return_smiles=False):
    """ function to check a generated SMILES string for validity """
    try:
        m = CanonSmiles(smiles.strip(), 1)
    except:
        m = None
    if return_smiles:
        return m is not None, m
    else:
        return m is not None
Esempio n. 5
0
 def process(s, q):
     """Canonicalize a batch of SMILES strings and hand the result to ``q``.

     ``s`` is passed through ``keep_longest`` and ``harmonize_sc``; every
     resulting string is then canonicalized with ``CanonSmiles`` using the
     ``stereochem`` value of the enclosing scope. Strings that cannot be
     processed are reported on stdout and represented by ``None``.

     :param s: input handed to ``keep_longest``
     :param q: object with a ``put`` method that receives the result list
     """
     smls = keep_longest(s)
     smls = harmonize_sc(smls)
     mols = list()
     # A dedicated loop name avoids shadowing the parameter ``s``.
     for smi in smls:
         try:
             mols.append(CanonSmiles(smi, stereochem))
         except Exception:
             # Best effort: keep the position in the list, but do not swallow
             # KeyboardInterrupt / SystemExit the way a bare ``except`` would.
             print("Error! Can not process SMILES string %s" % smi)
             mols.append(None)
     q.put(mols)
Esempio n. 6
0
def is_valid_mol(smiles, return_smiles=False):
    """ function to check a generated SMILES string for validity

    :param smiles: {str} SMILES string to be checked
    :param return_smiles: {bool} whether the checked valid SMILES string should be returned
    :return: {bool} validity
    """
    try:
        m = CanonSmiles(smiles.replace('^', '').replace('$', '').strip(), 1)
    except:
        m = None
    if return_smiles:
        return m is not None, m
    else:
        return m is not None
Esempio n. 7
0
    def test_pass(self, df: pd.DataFrame, column_levels: int,
                  column_padding: Hashable) -> None:
        """Check that ``sanitize_smiles_df`` runs and returns the expected frame."""
        result = sanitize_smiles_df(df, column_levels, column_padding)

        # Identity checks with ``invert=True``: presumably the frame and both
        # of its axes are expected to be new objects, not the inputs.
        assertion.is_(result, df, invert=True)
        assertion.is_(result.columns, df.columns, invert=True)
        assertion.is_(result.index, df.index, invert=True)

        # The column index must have the requested number of levels.
        assertion.eq(len(result.columns.levels), column_levels)

        # Canonicalizing the index entries again must leave them unchanged.
        canonical_index = [CanonSmiles(smiles) for smiles in result.index]
        np.testing.assert_array_equal(canonical_index, result.index)

        # Levels beyond those of the input columns must equal the padding.
        if isinstance(df.columns, pd.MultiIndex):
            first_padded = len(df.columns.levels)
        else:
            first_padded = 1
        for level in result.columns.levels[first_padded:]:
            np.testing.assert_array_equal(level, column_padding)
0
def decorate_scaffold(scaffold, sidechains, num=10):
    """ Decorate a given scaffold containing marked attachment points ([*]) randomly with the given side chains

    :param scaffold: {str} smiles string of a scaffold with attachment points marked as [*]
    :param sidechains: {str} point-separated side chains as smiles strings
    :param num: {int} number of unique molecules to generate
    :return: {list} up to ``num`` strings of the form ``<sidechains>.<scaffold>>><product>`` with unique canonical
        products; fewer are returned once 50 attempts were invalid or duplicates
    """
    # shift the ring digits of the side chains past the highest one used in the scaffold, so they cannot be confused
    try:
        ring_scaff = int(max(filter(str.isdigit, scaffold)))  # highest ring number in scaffold (ValueError if none)
        shifted = dict()
        for r in set(filter(str.isdigit, sidechains)):  # ring digits used in the side chains
            label = str(ring_scaff + int(r))
            # SMILES ring closure numbers above 9 must be written as %nn
            shifted[r] = label if len(label) == 1 else '%' + label
        # translate all digits in one pass: chained str.replace calls would re-map already shifted digits
        sidechains = sidechains.translate(str.maketrans(shifted))
    except ValueError:
        pass  # no usable ring digit in the scaffold, nothing to adapt

    # do the decoration
    mols = list()
    seen = set()  # canonical products accepted so far
    tmp = scaffold.replace('[*]', '*')
    schns = sidechains.split('.')
    invalcntr = 0  # counts invalid and duplicate attempts
    while len(mols) < num and invalcntr < 50:
        scaff = tmp
        while '*' in scaff:
            # fill the first open attachment point with a randomly drawn side chain
            scaff = scaff.replace('*', np.random.choice(schns, replace=False), 1)
        canon = CanonSmiles(scaff) if is_valid_mol(scaff) else None
        # uniqueness is judged on the canonical product, which is what ends up in the output string
        if canon is not None and canon not in seen:
            seen.add(canon)
            print(sidechains + "." + scaffold + ">>" + canon)
            mols.append(sidechains + "." + scaffold + ">>" + canon)
        else:
            invalcntr += 1
    return mols
Esempio n. 9
0
    def __init__(
        self,
        filter_name: str,
        met_data_name: str,
        met_data_path: str,
        possible_adducts: List[str],
        mass_tolerance: float,
        rt_predictor: RandomForestRegressor = None,
        rt_threshold: float = None,
        rt_important_features: List[str] = None,
    ) -> None:
        """Load metabolomics data into a MetabolomicsDataset object.

        Parameters
        ----------
        filter_name : str
            Stored as ``self._filter_name``.
        met_data_name : str
            Name given to the created ``MetabolomicsDataset``.
        met_data_path : str
            Path of a CSV file read with ``pandas.read_csv``; missing values
            are replaced by ``""``. The columns "Predicted Structure (smiles)",
            "Peak ID", "Retention Time", "Aggregate M/Z" and "Polarity" are
            read. If falsy, no file is read and no peaks are loaded.
        possible_adducts : List[str]
            Passed to ``MetabolomicsDataset`` as ``adducts``.
        mass_tolerance : float
            Passed to ``MetabolomicsDataset`` as ``tolerance`` and to
            ``enumerate_possible_masses``.
        rt_predictor : RandomForestRegressor, optional
            Retention time filtering is switched on only when both this and
            ``rt_threshold`` are truthy.
        rt_threshold : float, optional
            See ``rt_predictor``.
        rt_important_features : List[str], optional
            Stored as ``self.rt_important_features``.
        """

        self._filter_name = filter_name
        self.met_data_name = met_data_name

        self.rt_predictor = rt_predictor
        self.rt_threshold = rt_threshold
        self.rt_important_features = rt_important_features

        if self.rt_predictor and self.rt_threshold:
            self.filter_by_rt = True
            self.fp_calculator = Calculator(descriptors, ignore_3D=False)
        else:
            self.filter_by_rt = False
            self.fp_calculator = None

        if met_data_path:
            self.met_df = pd.read_csv(met_data_path).fillna("")
        else:
            self.met_df = None

        self.possible_adducts = possible_adducts
        self.mass_tolerance = mass_tolerance

        self.metabolomics_dataset = MetabolomicsDataset(
            name=self.met_data_name,
            adducts=self.possible_adducts,
            tolerance=self.mass_tolerance,
        )
        self.metabolomics_dataset.known_peaks = []
        self.metabolomics_dataset.unknown_peaks = []

        # Load Metabolomics peaks. Without a data file there is no frame to
        # iterate; calling ``iterrows`` on ``None`` would raise AttributeError.
        if self.met_df is not None:
            for _, row in self.met_df.iterrows():

                smiles = row["Predicted Structure (smiles)"]
                if smiles:
                    smiles = CanonSmiles(smiles)

                    mol = MolFromSmiles(smiles)
                    mol = neutralise_charges(mol)
                    inchi_key = MolToInchiKey(mol)
                else:
                    # Empty cell (NaN was replaced by "" above): no structure
                    inchi_key = None

                peak = Peak(
                    name=row["Peak ID"],
                    r_time=row["Retention Time"],
                    mz=row["Aggregate M/Z"],
                    charge=row["Polarity"],
                    inchi_key=inchi_key,
                )

                # Peaks with a structure are "known", the others "unknown"
                if inchi_key:
                    self.metabolomics_dataset.known_peaks.append(peak)
                else:
                    self.metabolomics_dataset.unknown_peaks.append(peak)

        # Calculate possible peak masses, they get saved to object
        self.metabolomics_dataset.enumerate_possible_masses(self.mass_tolerance)
Esempio n. 10
0
    R_cnn_20 = calcLRPConvStride(l_embed, [d[143], d[144]], d_20, 20)
    LRPCheck("  Conv20:", R_cnn_20, np.sum(d_20), verbose)

    R_cnn = R_cnn_1 + R_cnn_2 + R_cnn_3 + R_cnn_4 + R_cnn_5 + R_cnn_6 + \
            R_cnn_7 + R_cnn_8 + R_cnn_9 + R_cnn_10 + R_cnn_15 + R_cnn_20

    LRPCheck("Deconvolution:", R_cnn, l_out, verbose)

    scores = np.sum(R_cnn, axis=1)

    return y_real[0], scores, np.sum(l_out) - np.sum(R_cnn)


# Main Code
# Script body: expects `smiles` to be defined earlier (it is read before it is
# reassigned below) and collects one value and one score per atom of that
# molecule via calcQSAR.
# Re-canonicalize the input with useChiral=0 (presumably this drops
# stereochemistry -- confirm against the RDKit documentation).
smiles = CanonSmiles(smiles, useChiral=0)
mol = MolFromSmiles(smiles)
mw = Descriptors.ExactMolWt(mol)
# Atom index -> SMARTS string of the atom; only the indices are used below.
atoms = {a.GetIdx(): a.GetSmarts() for a in mol.GetAtoms()}
# One slot per atom, filled in the loop with the first score of that atom.
impacts = np.zeros(len(atoms), dtype='float')

print("Predicting %i atoms..." % (len(atoms)))
vals = []
for idx, a in tqdm(atoms.items()):
    # The third value returned by calcQSAR is not needed here.
    val, scores, _ = calcQSAR(mol, idx, mw, verbose=False)
    vals.append(val)
    impacts[idx] = scores[0]

# Mean and standard deviation over the values collected for all atoms.
res = np.mean(vals)
std = np.std(vals)
Esempio n. 11
0
    def compute_unique_smiles(self,
                              interp_df,
                              embedding_funct,
                              scaled_radius=None):
        """
        Identify duplicate SMILES and distort the embeddings behind them.

        The input df must have the columns 'SMILES', 'embeddings' and
        'embeddings_dim'. It is documented to carry 'SMILES' and 'Generated'
        at 0th and 1st position, where 'Generated' must contain booleans that
        classify SMILES into input SMILES (False) and generated SMILES (True).
        NOTE(review): the code reads that flag from column position 3, not 1
        -- confirm the expected column layout.

        This function does not make any assumptions about order of embeddings.
        Instead it simply orders the df by SMILES to identify the duplicates.

        Steps:
        1. Canonicalize the 'SMILES' column; strings that cannot be
           canonicalized are kept as they are.
        2. Up to 5 times: add jitter to the embeddings of duplicated, flagged
           rows and regenerate all SMILES with ``embedding_funct``.
        3. Up to 5 times: add jitter to the embeddings of rows whose SMILES
           give no molecule and regenerate all SMILES.

        :param interp_df: DataFrame as described above; it is modified in place
        :param embedding_funct: callable that receives the list of embeddings
            and returns the new content of the 'SMILES' column
        :param scaled_radius: forwarded to ``self._compute_radius``
        :return: the DataFrame without the temporary 'ROMol' column
        """

        distance = self._compute_radius(scaled_radius)
        embeddings = interp_df['embeddings']
        embeddings_dim = interp_df['embeddings_dim']

        def _canonical_or_original(smile_string):
            # If a SMILES cannot be canonicalized, just use the original
            try:
                return CanonSmiles(smile_string)
            except Exception:
                return smile_string

        # Assign the whole column at once: rows yielded by iterrows() are
        # copies, so writing to them would not change the DataFrame.
        interp_df['SMILES'] = [
            _canonical_or_original(smile_string)
            for smile_string in interp_df['SMILES']
        ]

        for _ in range(5):
            # Equal strings are adjacent after sorting
            smiles = interp_df['SMILES'].sort_values()
            duplicates = set()
            for idx in range(smiles.shape[0] - 1):
                if smiles.iat[idx] == smiles.iat[idx + 1]:
                    duplicates.add(smiles.index[idx])
                    duplicates.add(smiles.index[idx + 1])

            if not duplicates:
                break

            for dup_idx in duplicates:
                # NOTE(review): ``iat`` is positional while ``dup_idx`` is an
                # index label; the two only agree for a default integer index.
                if interp_df.iat[dup_idx, 3]:
                    # add jitter to generated molecules only
                    distored = self.addjitter(
                        embeddings[dup_idx],
                        distance,
                        cnt=1,
                        shape=embeddings_dim[dup_idx])
                    embeddings[dup_idx] = distored[0]
            interp_df['SMILES'] = embedding_funct(embeddings.to_list())
            interp_df['embeddings'] = embeddings

        # Ensure all generated molecules are valid.
        for _ in range(5):
            PandasTools.AddMoleculeColumnToFrame(interp_df, 'SMILES')
            invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

            if invalid_mol_df.empty:
                break

            for idx in invalid_mol_df.index.to_list():
                embeddings[idx] = self.addjitter(
                    embeddings[idx],
                    distance,
                    cnt=1,
                    shape=embeddings_dim[idx])[0]
            interp_df['SMILES'] = embedding_funct(embeddings.to_list())
            interp_df['embeddings'] = embeddings

        # Cleanup
        if 'ROMol' in interp_df.columns:
            interp_df = interp_df.drop('ROMol', axis=1)

        return interp_df