Exemple #1
0
 def test_ccbmlib_descriptors(self):
     """Check that ccbmlib fingerprints can be requested via the prefix.

     Every name below is requested as ``"ccbmlib:<name>"``. The resulting
     Descriptor must report itself as initialized and carry the bare name
     (without the prefix) as its label.
     """
     ether = MolFromSmiles("CCOCC")
     ccbmlib_names = (
         "atom_pairs",
         "hashed_atom_pairs",
         "avalon",
         "maccs_keys",
         "morgan",
         "hashed_morgan",
         "rdkit_fingerprint",
         "torsions",
         "hashed_torsions",
     )
     for name in ccbmlib_names:
         fingerprint = Descriptor()
         fingerprint.make_fingerprint(
             molecule_graph=ether,
             fingerprint_type="ccbmlib:" + name,
         )
         self.assertTrue(fingerprint.check_init(),
                         "Expected Descriptor object to be initialized")
         self.assertEqual(
             fingerprint.label_, name,
             "Expected label of descriptor initialized with "
             "{} to match the fingerprint".format(name))
Exemple #2
0
 def test_nonexistent_mordred_descriptors(self):
     """Check that unknown Mordred descriptor names are rejected.

     Both an empty name and a made-up name, requested with the
     ``"mordred:"`` prefix, must raise MordredCalculatorError.
     """
     methane = MolFromSmiles("C")
     for bad_name in ("", "ReallyInvalidDescriptorName"):
         requested_type = "mordred:" + bad_name
         fingerprint = Descriptor()
         with self.assertRaises(MordredCalculatorError):
             fingerprint.make_fingerprint(molecule_graph=methane,
                                          fingerprint_type=requested_type)
Exemple #3
0
    def test_descriptor_empty_init(self):
        """Check that a Descriptor built without a value is uninitialized."""
        empty_descriptor = Descriptor()
        is_ready = empty_descriptor.check_init()
        self.assertFalse(is_ready,
                         "Expected Descriptor object to be uninitialized")
Exemple #4
0
 def test_bad_descriptors_padelpy_descriptors(self):
     """Check that invalid padelpy descriptor names raise RuntimeError.

     An empty name and a made-up name are each requested with the
     ``"padelpy:"`` prefix and a ``timeout`` fingerprint parameter of 2.
     """
     methane = MolFromSmiles("C")
     for bad_name in ("", "ReallyInvalidDescriptorName"):
         requested_type = "padelpy:" + bad_name
         fingerprint = Descriptor()
         with self.assertRaises(RuntimeError):
             fingerprint.make_fingerprint(molecule_graph=methane,
                                          fingerprint_type=requested_type,
                                          fingerprint_params={'timeout': 2})
Exemple #5
0
 def test_mordred_descriptors(self):
     """Check that Mordred descriptors can be requested via the prefix.

     Each name is requested as ``"mordred:<name>"``. The result must be
     initialized, be labelled with the bare name, convert to a numpy array
     and raise ValueError when converted with ``to_rdkit()``.
     """
     ligand = MolFromSmiles(
         "CC(C)C1=CC(=C(C(=C1)C(C)C)C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C)C")
     for name in ("MW", "LogEE_Dt", "BalabanJ"):
         mordred_desc = Descriptor()
         mordred_desc.make_fingerprint(
             molecule_graph=ligand,
             fingerprint_type="mordred:" + name,
         )
         self.assertTrue(mordred_desc.check_init(),
                         "Expected Descriptor object to be initialized")
         self.assertEqual(
             mordred_desc.label_, name,
             "Expected label of descriptor initialized with "
             "{} to match the fingerprint".format(name))
         as_array = mordred_desc.to_numpy()
         self.assertIsInstance(as_array, np.ndarray,
                               "Expected numpy.ndarray from to_numpy()")
         # The test expects no RDKit representation for these descriptors.
         with self.assertRaises(ValueError):
             mordred_desc.to_rdkit()
Exemple #6
0
    def test_descriptor_arbitrary_numpy_init(self):
        """Check a Descriptor constructed directly from a numpy array.

        The object must be initialized, be labelled "arbitrary", return the
        same values from ``to_numpy()`` and raise ValueError from
        ``to_rdkit()``.
        """
        init_vector = np.array([1, 2, 3])
        from_array = Descriptor(value=init_vector)
        self.assertTrue(from_array.check_init(),
                        "Expected Descriptor object to be initialized")
        self.assertEqual(
            from_array.label_, "arbitrary",
            "Expected label of descriptor initialized with "
            'arbitrary vector to be "arbitrary"')
        self.assertIsInstance(
            from_array.to_numpy(),
            np.ndarray,
            "Expected numpy.ndarray from to_numpy()",
        )
        values_match = (from_array.to_numpy() == init_vector).all()
        self.assertTrue(values_match,
                        "Expected descriptor value to match init value")
        with self.assertRaises(ValueError):
            from_array.to_rdkit()
Exemple #7
0
    def test_descriptor_make_fingerprint(self):
        """Check fingerprint creation for every entry of SUPPORTED_FPRINTS.

        For each fingerprint type the Descriptor must be initialized, be
        labelled with the fingerprint type, convert to a numpy array and
        convert to an ExplicitBitVect through ``to_rdkit()``.
        """
        propane = MolFromSmiles("CCC")
        for fprint in SUPPORTED_FPRINTS:
            made = Descriptor()
            made.make_fingerprint(
                molecule_graph=propane,
                fingerprint_type=fprint,
            )
            self.assertTrue(made.check_init(),
                            "Expected Descriptor object to be initialized")
            self.assertEqual(
                made.label_, fprint,
                "Expected label of descriptor initialized with "
                "fingerprint to match the fingerprint")
            as_array = made.to_numpy()
            self.assertIsInstance(as_array, np.ndarray,
                                  "Expected numpy.ndarray from to_numpy()")
            as_bit_vect = made.to_rdkit()
            self.assertIsInstance(
                as_bit_vect, ExplicitBitVect,
                "Expected to_rdkit() to return "
                "ExplicitBitVect representation "
                f"of {fprint} fingerprint")
Exemple #8
0
 def test_exptl_descriptors(self):
     """Check that the experimental descriptors can be created.

     Each listed fingerprint type is requested without a prefix. The
     Descriptor must be initialized and labelled with the requested type.
     """
     ether = MolFromSmiles("CCOCC")
     experimental_types = (
         "maccs_keys",
         "atom-pair_fingerprint",
         "torsion_fingerprint",
     )
     for fprint_type in experimental_types:
         fingerprint = Descriptor()
         fingerprint.make_fingerprint(
             molecule_graph=ether,
             fingerprint_type=fprint_type,
         )
         self.assertTrue(fingerprint.check_init(),
                         "Expected Descriptor object to be initialized")
         self.assertEqual(
             fingerprint.label_, fprint_type,
             "Expected label of descriptor initialized with "
             "{} to match the fingerprint".format(fprint_type))
 def _check_abcd(true_vals, arr1, arr2):
     """Compare computed a, b, c, d values for two arrays with expectations.

     Both arrays are wrapped in Descriptor objects relabelled as
     'arbitrary_fingerprint' and passed to ``similarity_measure._get_abcd``.
     ``true_vals`` is indexed by the names 'a'..'d'; the computed result is
     indexed by position 0..3.

     NOTE(review): ``self`` and ``similarity_measure`` are not parameters,
     so this helper presumably lives inside a test method that provides
     them - confirm against the enclosing scope.
     """
     fingerprints = []
     for arr in (arr1, arr2):
         wrapped = Descriptor(arr)
         wrapped.label_ = 'arbitrary_fingerprint'
         fingerprints.append(wrapped)
     abcd_calc = similarity_measure._get_abcd(fingerprints[0],
                                              fingerprints[1])
     for position, var in enumerate('abcd'):
         expected = true_vals[var]
         calculated = abcd_calc[position]
         self.assertEqual(
             expected, calculated,
             f'Expected true {var} to match calculated val '
             f'for arrays {arr1}, {arr2}')
Exemple #10
0
 def test_padelpy_descriptors(self):
     """Check that PadelPy descriptors can be requested via the prefix.

     Each name is requested as ``"padelpy:<name>"``. The result must be
     initialized, be labelled with the bare name, convert to a numpy array
     and raise ValueError when converted with ``to_rdkit()``.
     """
     ether = MolFromSmiles("CCOCC")
     for name in ("MATS7e", "Ti", "ATSC6p"):
         padel_desc = Descriptor()
         padel_desc.make_fingerprint(
             molecule_graph=ether,
             fingerprint_type="padelpy:" + name,
         )
         self.assertTrue(padel_desc.check_init(),
                         "Expected Descriptor object to be initialized")
         self.assertEqual(
             padel_desc.label_, name,
             "Expected label of descriptor initialized with "
             "{} to match the fingerprint".format(name))
         as_array = padel_desc.to_numpy()
         self.assertIsInstance(as_array, np.ndarray,
                               "Expected numpy.ndarray from to_numpy()")
         # The test expects no RDKit representation for these descriptors.
         with self.assertRaises(ValueError):
             padel_desc.to_rdkit()
Exemple #11
0
import numpy as np
import pandas as pd
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdmolfiles import MolToPDBFile
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE, Isomap, SpectralEmbedding
from sklearn.preprocessing import StandardScaler

from AIMSim.chemical_datastructures import Molecule, MoleculeSet
from AIMSim.ops import Descriptor, SimilarityMeasure
from AIMSim.exceptions import NotInitializedError, InvalidConfigurationError


SUPPORTED_SIMILARITIES = SimilarityMeasure.get_supported_metrics()

SUPPORTED_FPRINTS = Descriptor.get_supported_fprints()


class TestMoleculeSet(unittest.TestCase):
    test_smarts = [
        "[CH3:1][S:2][c:3]1[cH:4][cH:5][c:6]([B:7]([OH:8])[OH:9])[cH:10][cH:11]1",
        "[NH:1]1[CH2:2][CH2:3][O:4][CH2:5][CH2:6]1.[O:7]=[S:8]=[O:9]",
    ]

    test_smiles = [
        "CCCCCCC",
        "CCCC",
        "CCC",
        "CO",
        "CN",
        "C1=CC=CC=C1",
Exemple #12
0
    def __call__(
        self,
        molecule_set_configs,
        fingerprint_type=None,
        fingerprint_params=None,
        similarity_measure=None,
        subsample_subset_size=0.01,
        optim_algo='max_min',
        show_top=0,
        only_metric=True,
    ):
        """
        Calculate the correlation in the properties of molecules in set
        and their nearest and furthest neighbors using different
        fingerprints / similarity measure choices. Choose the best fingerprint
        and similarity measure pair (called measure choice for brevity)
        based on an optimization strategy.

        Side effects: progress messages are printed; if ``self.log_fpath``
        is not None all trials are written to it as JSON; if ``show_top``
        is positive a bar chart of the top performers is plotted.

        Args:
            molecule_set_configs (dict): All configurations (except
                fingerprint_type, fingerprint_params and similarity_measure)
                needed to form the moleculeSet. The keys
                'molecule_database_src' and 'molecule_database_src_type'
                are required; 'is_verbose' (default False) and 'n_threads'
                (default 1) are optional.
            fingerprint_type (str): Label to indicate which fingerprint to
                use. If supplied, fingerprint is fixed and optimization
                carried out over similarity measures. Use None to indicate
                that optimization needs to be carried out over fingerprints.
                Default is None.
            fingerprint_params (dict): Hyper-parameters for fingerprints.
                Passed to the MoleculeSet constructor. If None is passed,
                set to empty dictionary before passing to MoleculeSet.
                Ignored (reset to an empty dictionary) when
                fingerprint_type is None.
            similarity_measure (str): Label to indicate which similarity
                measure to use. If supplied, similarity measure is fixed
                and optimization carried out over fingerprints.
                Use None to indicate that optimization needs to be carried
                out over similarity measures. Default is None.
            subsample_subset_size (float): Fraction of molecule_set to
                subsample. This is separate from the sample_ratio parameter
                used when creating a moleculeSet since it is recommended
                to have a more aggressive subsampling strategy for this task
                due to the combinatorial explosion of looking at multiple
                fingerprints and similarity measures. Default is 0.01.
            optim_algo (str): Label to indicate the optimization algorithm
                chosen. Options are:
                'max': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar).
                'min': The measure choice which minimizes the absolute value
                    of property correlation
                    between furthest neighbors (most dissimilar).
                'max_min': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar)
                    and minimizes the absolute value of property correlation
                    between furthest neighbors (most dissimilar).
                    This is the default.
            show_top (int): Number of top performing measures to show in plot.
                If 0, no plots are generated and the top performer is returned.
            only_metric (bool): If True only similarity measures satisfying
                the metricity property
                (i.e. can be converted to distance metrics) are selected.

        Returns:
            (NamedTuple): Top performer with fields:
                fingerprint_type (str): Label for fingerprint type
                similarity_measure (str): Label for similarity measure
                nearest_neighbor_correlation (float): Correlation of property
                    of molecule and its nearest neighbor.
                furthest_neighbor_correlation (float): Correlation of property
                    of molecule and its furthest neighbor.
                score_ (float): Overall score based on optimization strategy.
                    More is better.

        Raises:
            InvalidConfigurationError: If optim_algo is not one of
                'max_min', 'max' or 'min'.
            IndexError: If no fingerprint / similarity measure combination
                could be evaluated (every candidate was skipped).
            KeyError: If show_top > 0 and ``self.plot_settings`` has no
                'colors' entry.

        """
        # Fail fast on an unknown strategy instead of discovering it only
        # after the first (expensive) MoleculeSet has been built.
        if optim_algo not in ('max_min', 'max', 'min'):
            raise InvalidConfigurationError(f'{optim_algo} '
                                            f'not implemented')
        print(f'Using subsample size {subsample_subset_size} for '
              f'measure search')
        trial_ = namedtuple('trial_', [
            'fingerprint_type', 'similarity_measure',
            'nearest_neighbor_correlation', 'furthest_neighbor_correlation',
            'score_'
        ])
        if fingerprint_type is None:
            all_fingerprint_types = Descriptor.get_supported_fprints()
            # The same hyper-parameters cannot be assumed valid for every
            # fingerprint type, so they are dropped when searching over them.
            fingerprint_params = None
        else:
            all_fingerprint_types = [fingerprint_type]
        if similarity_measure is None:
            if only_metric:
                print('Only trying measures with valid distance metrics')
            all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
        else:
            all_similarity_measures = [similarity_measure]
        is_verbose = molecule_set_configs.get("is_verbose", False)
        if fingerprint_params is None:
            fingerprint_params = {}

        all_scores = []
        # Loop variables are named differently from the parameters so the
        # caller-supplied values are not silently overwritten.
        for measure in all_similarity_measures:
            if only_metric and not SimilarityMeasure(
                    metric=measure).is_distance_metric():
                continue
            if is_verbose:
                print(f'Trying {measure} similarity')
            for fprint in all_fingerprint_types:
                if is_verbose:
                    print(f'Trying {fprint} fingerprint')
                try:
                    molecule_set = MoleculeSet(
                        molecule_database_src=molecule_set_configs[
                            'molecule_database_src'],
                        molecule_database_src_type=molecule_set_configs[
                            'molecule_database_src_type'],
                        similarity_measure=measure,
                        fingerprint_type=fprint,
                        fingerprint_params=fingerprint_params,
                        is_verbose=is_verbose,
                        n_threads=molecule_set_configs.get('n_threads', 1),
                        sampling_ratio=subsample_subset_size)
                except (InvalidConfigurationError, ValueError) as e:
                    # Incompatible pairings are skipped, not fatal.
                    if is_verbose:
                        print(
                            f'Could not try {fprint} with '
                            f'similarity measure {measure} due to '
                            f'{e}')
                    continue
                # The second element of each result is not used here.
                nearest_corr, _ = (
                    self.prop_var_w_similarity
                    .get_property_correlations_in_most_similar(molecule_set))
                furthest_corr, _ = (
                    self.prop_var_w_similarity
                    .get_property_correlations_in_most_dissimilar(
                        molecule_set))
                if optim_algo == 'max_min':
                    score_ = nearest_corr - abs(furthest_corr)
                elif optim_algo == 'max':
                    score_ = nearest_corr
                else:  # 'min' (optim_algo was validated above)
                    score_ = -abs(furthest_corr)
                all_scores.append(
                    trial_(fingerprint_type=fprint,
                           similarity_measure=measure,
                           nearest_neighbor_correlation=nearest_corr,
                           furthest_neighbor_correlation=furthest_corr,
                           score_=score_))

        # Best score first.
        all_scores.sort(key=lambda trial: trial.score_, reverse=True)
        if self.log_fpath is not None:
            print('Writing to ', self.log_fpath)
            log_data = [trial._asdict() for trial in all_scores]
            with open(self.log_fpath, "w") as fp:
                json.dump(log_data, fp)

        if show_top > 0:
            top_performers = all_scores[:show_top]
            top_scores = [trial.score_ for trial in top_performers]
            all_nearest_neighbor_correlations = [
                trial.nearest_neighbor_correlation
                for trial in top_performers
            ]
            all_furthest_neighbor_correlations = [
                trial.furthest_neighbor_correlation
                for trial in top_performers
            ]
            all_measures = [
                Descriptor.shorten_label(trial.fingerprint_type) + '\n' +
                trial.similarity_measure
                for trial in top_performers
            ]
            bar_heights = np.array([
                top_scores, all_nearest_neighbor_correlations,
                all_furthest_neighbor_correlations
            ])
            # Work on a copy: popping from self.plot_settings directly would
            # remove 'colors' for good and make the next call raise KeyError.
            plot_settings = dict(self.plot_settings)
            colors = plot_settings.pop('colors')
            plot_multiple_barchart(x=list(range(len(top_performers))),
                                   heights=bar_heights,
                                   legend_labels=[
                                       'Overall scores',
                                       'Nearest neighbor property '
                                       'correlation',
                                       'Furthest neighbor property '
                                       'correlations'
                                   ],
                                   colors=colors,
                                   xtick_labels=all_measures,
                                   ylabel='Value',
                                   xlabel='Measure',
                                   **plot_settings)

        return all_scores[0]
Exemple #13
0
    def test_topological_fprint_min_path_lesser_than_atoms(self):
        """Check min_path validation of the topological fingerprint.

        For min_path values 1, 2 and 3, molecules written with one, two
        and three atoms in SMILES are fingerprinted. The test expects
        InvalidConfigurationError exactly when the number of atoms does
        not exceed min_path, and no such error otherwise.
        """
        molecules_by_size = [
            (1, [MolFromSmiles(smiles)
                 for smiles in ['C', 'O', 'N', 'P']]),
            (2, [MolFromSmiles(smiles)
                 for smiles in ['CC', 'CO', 'CN', 'CP']]),
            (3, [MolFromSmiles(smiles)
                 for smiles in ['CCC', 'COO', 'CCN', 'CCP']]),
        ]

        def build_fingerprint(mol, min_path):
            # Construction and fingerprinting together, so that both happen
            # inside the assertRaises / try block of the caller.
            fprint = Descriptor()
            fprint.make_fingerprint(
                molecule_graph=mol,
                fingerprint_type='topological_fingerprint',
                fingerprint_params={'min_path': min_path})

        for min_path in (1, 2, 3):
            for n_atoms, mols in molecules_by_size:
                should_raise = n_atoms <= min_path
                for mol in mols:
                    if should_raise:
                        with self.assertRaises(InvalidConfigurationError):
                            build_fingerprint(mol, min_path)
                        continue
                    try:
                        build_fingerprint(mol, min_path)
                    except InvalidConfigurationError:
                        self.fail("Did not expect Descriptor to raise "
                                  "InvalidConfigurationError")
Exemple #14
0
    def test_fingerprint_folding(self):
        """Exercise get_folded_fprint on hand-made vectors.

        A vector labelled 'arbitrary' must raise ValueError when folded.
        Vectors labelled 'arbitrary_fingerprint' must raise
        InvalidConfigurationError for the listed invalid target lengths and
        return the listed folded vectors for the valid ones.
        """

        def labelled_descriptor(label, bits):
            made = Descriptor()
            made.label_ = label
            made.numpy_ = np.array(bits)
            return made

        # Not labelled as a fingerprint: folding is expected to be refused.
        plain = labelled_descriptor('arbitrary',
                                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
        with self.assertRaises(ValueError):
            plain.get_folded_fprint(fold_to_length=2)

        # Each entry: (bits, invalid target lengths,
        #              [(valid target length, expected folded bits), ...])
        folding_cases = [
            ([1, 0, 1, 0, 1, 0], (4, 10),
             [(3, [1, 1, 1])]),
            ([1, 0, 1, 0, 0, 0, 0, 0], (3, 10),
             [(4, [1, 0, 1, 0]), (2, [1, 0])]),
            ([0, 0, 0, 0, 0, 0, 0, 0], (3, 10),
             [(4, [0, 0, 0, 0]), (2, [0, 0])]),
            ([1, 1, 1, 1, 1, 1, 1, 1], (3, 10),
             [(4, [1, 1, 1, 1]), (2, [1, 1])]),
        ]
        for bits, invalid_lengths, valid_folds in folding_cases:
            fprint = labelled_descriptor('arbitrary_fingerprint', bits)
            for bad_length in invalid_lengths:
                with self.assertRaises(InvalidConfigurationError):
                    fprint.get_folded_fprint(fold_to_length=bad_length)
            for good_length, expected_bits in valid_folds:
                folded = fprint.get_folded_fprint(fold_to_length=good_length)
                self.assertTrue((folded == np.array(expected_bits)).all())