def test_ccbmlib_descriptors(self):
    """Check that ccbmlib fingerprints can be requested by prefix.

    Every name is handed to ``make_fingerprint`` as ``"ccbmlib:<name>"``.
    Afterwards the descriptor must report itself as initialized and its
    label must equal the bare name (without the ``ccbmlib:`` prefix).
    """
    ether = MolFromSmiles("CCOCC")
    ccbmlib_names = (
        "atom_pairs",
        "hashed_atom_pairs",
        "avalon",
        "maccs_keys",
        "morgan",
        "hashed_morgan",
        "rdkit_fingerprint",
        "torsions",
        "hashed_torsions",
    )
    for name in ccbmlib_names:
        fingerprint = Descriptor()
        fingerprint.make_fingerprint(
            molecule_graph=ether,
            fingerprint_type=f"ccbmlib:{name}",
        )

        self.assertTrue(
            fingerprint.check_init(),
            "Expected Descriptor object to be initialized",
        )

        label_msg = ("Expected label of descriptor initialized with "
                     "{} to match the fingerprint").format(name)
        self.assertEqual(fingerprint.label_, name, label_msg)
def test_nonexistent_mordred_descriptors(self):
    """Check that unknown Mordred descriptor names are rejected.

    Both an empty name and a made-up name, requested through the
    ``mordred:`` prefix, are expected to make ``make_fingerprint`` raise
    ``MordredCalculatorError``.
    """
    methane = MolFromSmiles("C")
    invalid_names = ("", "ReallyInvalidDescriptorName")
    for name in invalid_names:
        fingerprint = Descriptor()
        requested_type = "mordred:" + name
        with self.assertRaises(MordredCalculatorError):
            fingerprint.make_fingerprint(
                molecule_graph=methane,
                fingerprint_type=requested_type,
            )
def test_descriptor_empty_init(self):
    """Check that a Descriptor built without arguments is uninitialized."""
    blank = Descriptor()
    is_initialized = blank.check_init()
    self.assertFalse(
        is_initialized,
        "Expected Descriptor object to be uninitialized",
    )
def test_bad_descriptors_padelpy_descriptors(self):
    """Check that invalid padelpy descriptor names raise RuntimeError.

    An empty name and a made-up name are requested through the
    ``padelpy:`` prefix with a ``timeout`` of 2 passed as a fingerprint
    parameter.
    """
    methane = MolFromSmiles("C")
    invalid_names = ("", "ReallyInvalidDescriptorName")
    for name in invalid_names:
        fingerprint = Descriptor()
        requested_type = "padelpy:" + name
        with self.assertRaises(RuntimeError):
            fingerprint.make_fingerprint(
                molecule_graph=methane,
                fingerprint_type=requested_type,
                fingerprint_params={'timeout': 2},
            )
def test_mordred_descriptors(self):
    """Check that Mordred descriptors can be requested by prefix.

    For each name requested as ``"mordred:<name>"`` the descriptor must be
    initialized, be labelled with the bare name, convert to a numpy array
    and refuse conversion to an RDKit object with ``ValueError``.
    """
    ligand_smiles = (
        "CC(C)C1=CC(=C(C(=C1)C(C)C)C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C)C"
    )
    ligand = MolFromSmiles(ligand_smiles)
    mordred_names = ("MW", "LogEE_Dt", "BalabanJ")

    for name in mordred_names:
        fingerprint = Descriptor()
        fingerprint.make_fingerprint(
            molecule_graph=ligand,
            fingerprint_type=f"mordred:{name}",
        )

        self.assertTrue(
            fingerprint.check_init(),
            "Expected Descriptor object to be initialized",
        )

        label_msg = ("Expected label of descriptor initialized with "
                     "{} to match the fingerprint").format(name)
        self.assertEqual(fingerprint.label_, name, label_msg)

        as_array = fingerprint.to_numpy()
        self.assertIsInstance(
            as_array,
            np.ndarray,
            "Expected numpy.ndarray from to_numpy()",
        )

        with self.assertRaises(ValueError):
            fingerprint.to_rdkit()
def test_descriptor_arbitrary_numpy_init(self):
    """Check a Descriptor constructed directly from a numpy vector.

    The descriptor must be initialized, be labelled ``"arbitrary"``,
    return the same values from ``to_numpy()`` and raise ``ValueError``
    from ``to_rdkit()``.
    """
    init_vector = np.array([1, 2, 3])
    wrapped = Descriptor(value=init_vector)

    self.assertTrue(
        wrapped.check_init(),
        "Expected Descriptor object to be initialized",
    )

    self.assertEqual(
        wrapped.label_,
        "arbitrary",
        "Expected label of descriptor initialized with "
        'arbitrary vector to be "arbitrary"',
    )

    self.assertIsInstance(
        wrapped.to_numpy(),
        np.ndarray,
        "Expected numpy.ndarray from to_numpy()",
    )

    elementwise_match = wrapped.to_numpy() == init_vector
    self.assertTrue(
        elementwise_match.all(),
        "Expected descriptor value to match init value",
    )

    with self.assertRaises(ValueError):
        wrapped.to_rdkit()
def test_descriptor_make_fingerprint(self):
    """Check fingerprint creation for every entry of SUPPORTED_FPRINTS.

    Using propane as the molecule graph, each fingerprint type must give
    an initialized descriptor labelled with that type, convertible both to
    a numpy array and to an RDKit ``ExplicitBitVect``.
    """
    propane = MolFromSmiles("CCC")
    for fprint in SUPPORTED_FPRINTS:
        fingerprint = Descriptor()
        fingerprint.make_fingerprint(
            molecule_graph=propane,
            fingerprint_type=fprint,
        )

        self.assertTrue(
            fingerprint.check_init(),
            "Expected Descriptor object to be initialized",
        )

        self.assertEqual(
            fingerprint.label_,
            fprint,
            "Expected label of descriptor initialized with "
            "fingerprint to match the fingerprint",
        )

        as_array = fingerprint.to_numpy()
        self.assertIsInstance(
            as_array,
            np.ndarray,
            "Expected numpy.ndarray from to_numpy()",
        )

        as_bitvect = fingerprint.to_rdkit()
        rdkit_msg = ("Expected to_rdkit() to return "
                     "ExplicitBitVect representation "
                     f"of {fprint} fingerprint")
        self.assertIsInstance(as_bitvect, ExplicitBitVect, rdkit_msg)
def test_exptl_descriptors(self):
    """Check that the experimental fingerprint types can be created.

    Each type is requested without a prefix; the descriptor must end up
    initialized and labelled with the requested type.
    """
    ether = MolFromSmiles("CCOCC")
    experimental_types = (
        "maccs_keys",
        "atom-pair_fingerprint",
        "torsion_fingerprint",
    )
    for fprint_type in experimental_types:
        fingerprint = Descriptor()
        fingerprint.make_fingerprint(
            molecule_graph=ether,
            fingerprint_type=fprint_type,
        )

        self.assertTrue(
            fingerprint.check_init(),
            "Expected Descriptor object to be initialized",
        )

        label_msg = ("Expected label of descriptor initialized with "
                     "{} to match the fingerprint").format(fprint_type)
        self.assertEqual(fingerprint.label_, fprint_type, label_msg)
def _check_abcd(true_vals, arr1, arr2):
    """Compare expected a/b/c/d values against ``_get_abcd`` for two arrays.

    Both arrays are wrapped in Descriptor objects labelled
    ``'arbitrary_fingerprint'``. ``true_vals`` is indexed by the keys
    ``'a'``, ``'b'``, ``'c'`` and ``'d'``; the calculated result is indexed
    by position 0..3 in that same order.

    Relies on ``self`` and ``similarity_measure`` from the enclosing scope.
    """
    descriptors = []
    for arr in (arr1, arr2):
        wrapped = Descriptor(arr)
        wrapped.label_ = 'arbitrary_fingerprint'
        descriptors.append(wrapped)
    first, second = descriptors

    calculated = similarity_measure._get_abcd(first, second)
    for position, name in enumerate('abcd'):
        mismatch_msg = (f'Expected true {name} to match calculated val '
                        f'for arrays {arr1}, {arr2}')
        self.assertEqual(true_vals[name], calculated[position], mismatch_msg)
def test_padelpy_descriptors(self):
    """Check that PadelPy descriptors can be requested by prefix.

    For each name requested as ``"padelpy:<name>"`` the descriptor must be
    initialized, be labelled with the bare name, convert to a numpy array
    and refuse conversion to an RDKit object with ``ValueError``.
    """
    ether = MolFromSmiles("CCOCC")
    padel_names = ("MATS7e", "Ti", "ATSC6p")

    for name in padel_names:
        fingerprint = Descriptor()
        fingerprint.make_fingerprint(
            molecule_graph=ether,
            fingerprint_type=f"padelpy:{name}",
        )

        self.assertTrue(
            fingerprint.check_init(),
            "Expected Descriptor object to be initialized",
        )

        label_msg = ("Expected label of descriptor initialized with "
                     "{} to match the fingerprint").format(name)
        self.assertEqual(fingerprint.label_, name, label_msg)

        as_array = fingerprint.to_numpy()
        self.assertIsInstance(
            as_array,
            np.ndarray,
            "Expected numpy.ndarray from to_numpy()",
        )

        with self.assertRaises(ValueError):
            fingerprint.to_rdkit()
import numpy as np import pandas as pd from rdkit.Chem import MolFromSmiles from rdkit.Chem.rdmolfiles import MolToPDBFile from sklearn.decomposition import PCA from sklearn.manifold import MDS, TSNE, Isomap, SpectralEmbedding from sklearn.preprocessing import StandardScaler from AIMSim.chemical_datastructures import Molecule, MoleculeSet from AIMSim.ops import Descriptor, SimilarityMeasure from AIMSim.exceptions import NotInitializedError, InvalidConfigurationError SUPPORTED_SIMILARITIES = SimilarityMeasure.get_supported_metrics() SUPPORTED_FPRINTS = Descriptor.get_supported_fprints() class TestMoleculeSet(unittest.TestCase): test_smarts = [ "[CH3:1][S:2][c:3]1[cH:4][cH:5][c:6]([B:7]([OH:8])[OH:9])[cH:10][cH:11]1", "[NH:1]1[CH2:2][CH2:3][O:4][CH2:5][CH2:6]1.[O:7]=[S:8]=[O:9]", ] test_smiles = [ "CCCCCCC", "CCCC", "CCC", "CO", "CN", "C1=CC=CC=C1",
def __call__(
    self,
    molecule_set_configs,
    fingerprint_type=None,
    fingerprint_params=None,
    similarity_measure=None,
    subsample_subset_size=0.01,
    optim_algo='max_min',
    show_top=0,
    only_metric=True,
):
    """
    Calculate the correlation in the properties of molecules in set
    and their nearest and furthest neighbors using different
    fingerprints / similarity measure choices. Choose the best
    fingerprint and similarity measure pair (called measure choice for
    brevity) based on an optimization strategy.

    Args:
        molecule_set_configs (dict): All configurations (except
            fingerprint_type, fingerprint_params and similarity_measure)
            needed to form the moleculeSet. The keys
            'molecule_database_src' and 'molecule_database_src_type' are
            required; 'is_verbose' (default False) and 'n_threads'
            (default 1) are optional.
        fingerprint_type (str): Label to indicate which fingerprint to
            use. If supplied, fingerprint is fixed and optimization
            carried out over similarity measures. Use None to indicate
            that optimization needs to be carried out over fingerprints.
            Default is None.
        fingerprint_params (dict): Hyper-parameters for fingerprints.
            Passed to the MoleculeSet constructor. If None is passed,
            set to empty dictionary before passing to MoleculeSet.
            Ignored (reset to an empty dictionary) when fingerprint_type
            is None, since several fingerprints are then tried.
        similarity_measure (str): Label to indicate which similarity
            measure to use. If supplied, similarity measure is fixed
            and optimization carried out over fingerprints. Use None to
            indicate that optimization needs to be carried out over
            similarity measures. Default is None.
        subsample_subset_size (float): Fraction of molecule_set to
            subsample. This is separate from the sample_ratio parameter
            used when creating a moleculeSet since it is recommended
            to have a more aggressive subsampling strategy for this task
            due to the combinatorial explosion of looking at multiple
            fingerprints and similarity measures. Default is 0.01.
        optim_algo (str): Label to indicate the optimization algorithm
            chosen. Options are:
            'max': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar).
            'min': The measure choice which minimizes the absolute value
                of property correlation between furthest neighbors
                (most dissimilar).
            'max_min': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar)
                and minimizes the absolute value of property correlation
                between furthest neighbors (most dissimilar).
                This is the default.
        show_top (int): Number of top performing measures to show in
            plot. If 0, no plots are generated and the top performer
            is returned.
        only_metric (bool): If True only similarity measures satisfying
            the metricity property (i.e. can be converted to distance
            metrics) are selected. This filter is also applied to an
            explicitly supplied similarity_measure.

    Returns:
        (NamedTuple): Top performer with fields:
            fingerprint_type (str): Label for fingerprint type
            similarity_measure (str): Label for similarity measure
            nearest_neighbor_correlation (float): Correlation of
                property of molecule and its nearest neighbor.
            furthest_neighbor_correlation (float): Correlation of
                property of molecule and its furthest neighbor.
            score_ (float): Overall score based on optimization
                strategy. More is better.

    Raises:
        InvalidConfigurationError: If optim_algo is not one of 'max',
            'min' or 'max_min' (detected when the first trial is scored).
        IndexError: If no fingerprint / similarity measure combination
            could be evaluated, so there is no top performer to return.

    """
    print(f'Using subsample size {subsample_subset_size} for '
          f'measure search')
    trial_ = namedtuple('trial_', [
        'fingerprint_type',
        'similarity_measure',
        'nearest_neighbor_correlation',
        'furthest_neighbor_correlation',
        'score_',
    ])

    if fingerprint_type is None:
        all_fingerprint_types = Descriptor.get_supported_fprints()
        # Parameters are fingerprint specific, so they cannot be shared
        # across all the fingerprints being tried.
        fingerprint_params = None
    else:
        all_fingerprint_types = [fingerprint_type]

    if similarity_measure is None:
        if only_metric:
            print('Only trying measures with valid distance metrics')
        all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
    else:
        all_similarity_measures = [similarity_measure]

    is_verbose = molecule_set_configs.get("is_verbose", False)
    all_scores = []
    if fingerprint_params is None:
        fingerprint_params = {}

    for similarity_measure in all_similarity_measures:
        if only_metric and not SimilarityMeasure(
                metric=similarity_measure).is_distance_metric():
            continue
        if is_verbose:
            print(f'Trying {similarity_measure} similarity')
        for fingerprint_type in all_fingerprint_types:
            if is_verbose:
                print(f'Trying {fingerprint_type} fingerprint')
            try:
                molecule_set = MoleculeSet(
                    molecule_database_src=molecule_set_configs[
                        'molecule_database_src'],
                    molecule_database_src_type=molecule_set_configs[
                        'molecule_database_src_type'],
                    similarity_measure=similarity_measure,
                    fingerprint_type=fingerprint_type,
                    fingerprint_params=fingerprint_params,
                    is_verbose=is_verbose,
                    n_threads=molecule_set_configs.get('n_threads', 1),
                    sampling_ratio=subsample_subset_size)
            except (InvalidConfigurationError, ValueError) as e:
                # Incompatible fingerprint / measure pairs are skipped
                # rather than aborting the whole search.
                if is_verbose:
                    print(
                        f'Could not try {fingerprint_type} with '
                        f'similarity measure {similarity_measure} due to '
                        f'{e}')
                continue

            # Only the correlation coefficients are scored; the p-values
            # returned alongside them are not used.
            nearest_corr, _ = (
                self.prop_var_w_similarity
                .get_property_correlations_in_most_similar(molecule_set))
            furthest_corr, _ = (
                self.prop_var_w_similarity
                .get_property_correlations_in_most_dissimilar(molecule_set))

            if optim_algo == 'max_min':
                score_ = nearest_corr - abs(furthest_corr)
            elif optim_algo == 'max':
                score_ = nearest_corr
            elif optim_algo == 'min':
                score_ = -abs(furthest_corr)
            else:
                raise InvalidConfigurationError(f'{optim_algo} '
                                                f'not implemented')
            all_scores.append(
                trial_(fingerprint_type=fingerprint_type,
                       similarity_measure=similarity_measure,
                       nearest_neighbor_correlation=nearest_corr,
                       furthest_neighbor_correlation=furthest_corr,
                       score_=score_))

    # Best (highest) score first.
    all_scores.sort(key=lambda trial: trial.score_, reverse=True)

    if self.log_fpath is not None:
        print('Writing to ', self.log_fpath)
        log_data = [trial._asdict() for trial in all_scores]
        with open(self.log_fpath, "w") as fp:
            json.dump(log_data, fp)

    if show_top > 0:
        top_performers = all_scores[:show_top]
        all_nearest_neighbor_correlations = [
            trial.nearest_neighbor_correlation for trial in top_performers
        ]
        all_furthest_neighbor_correlations = [
            trial.furthest_neighbor_correlation for trial in top_performers
        ]
        top_scores = [trial.score_ for trial in top_performers]
        all_measures = [
            Descriptor.shorten_label(trial.fingerprint_type)
            + '\n'
            + trial.similarity_measure
            for trial in top_performers
        ]
        bar_heights = np.array([
            top_scores,
            all_nearest_neighbor_correlations,
            all_furthest_neighbor_correlations,
        ])
        # Work on a copy: popping 'colors' from self.plot_settings itself
        # would remove the key from the instance and make every later
        # call with show_top > 0 fail with a KeyError.
        plot_settings = dict(self.plot_settings)
        colors = plot_settings.pop('colors')
        plot_multiple_barchart(x=list(range(len(top_performers))),
                               heights=bar_heights,
                               legend_labels=[
                                   'Overall scores',
                                   'Nearest neighbor property '
                                   'correlation',
                                   'Furthest neighbor property '
                                   'correlations'
                               ],
                               colors=colors,
                               xtick_labels=all_measures,
                               ylabel='Value',
                               xlabel='Measure',
                               **plot_settings)

    return all_scores[0]
def test_topological_fprint_min_path_lesser_than_atoms(self):
    """Check which molecule sizes accept a given topological min_path.

    Expectations exercised, per ``min_path`` value:
        1: single-atom molecules raise InvalidConfigurationError;
           two- and three-atom molecules do not.
        2: single- and two-atom molecules raise; three-atom molecules
           do not.
        3: all three groups raise.
    """
    fprint_type = 'topological_fingerprint'

    def _parse_all(smiles_strings):
        return [MolFromSmiles(smiles) for smiles in smiles_strings]

    def _fingerprint(descriptor, mol, min_path):
        descriptor.make_fingerprint(
            molecule_graph=mol,
            fingerprint_type=fprint_type,
            fingerprint_params={'min_path': min_path})

    def _expect_rejected(mols, min_path):
        for mol in mols:
            with self.assertRaises(InvalidConfigurationError):
                _fingerprint(Descriptor(), mol, min_path)

    def _expect_accepted(mols, min_path):
        for mol in mols:
            descriptor = Descriptor()
            try:
                _fingerprint(descriptor, mol, min_path)
            except InvalidConfigurationError:
                self.fail("Did not expect Descriptor to raise "
                          "InvalidConfigurationError")

    atomic_mols = _parse_all(['C', 'O', 'N', 'P'])
    diatomic_mols = _parse_all(['CC', 'CO', 'CN', 'CP'])
    triatomic_mols = _parse_all(['CCC', 'COO', 'CCN', 'CCP'])
    mol_groups = (atomic_mols, diatomic_mols, triatomic_mols)

    # One row per min_path; the checks line up with mol_groups
    # (atomic, diatomic, triatomic) and run in that order.
    expectations = (
        (1, (_expect_rejected, _expect_accepted, _expect_accepted)),
        (2, (_expect_rejected, _expect_rejected, _expect_accepted)),
        (3, (_expect_rejected, _expect_rejected, _expect_rejected)),
    )
    for min_path, checks in expectations:
        for check, mols in zip(checks, mol_groups):
            check(mols, min_path)
def test_fingerprint_folding(self):
    """Exercise ``get_folded_fprint`` on hand-built descriptor vectors.

    Descriptors are assembled by assigning ``label_`` and ``numpy_``
    directly. A vector labelled 'arbitrary' is expected to raise
    ValueError on folding; vectors labelled 'arbitrary_fingerprint' are
    expected to raise InvalidConfigurationError for the listed invalid
    lengths and to fold to the listed vectors otherwise.
    """

    def _make_descriptor(label, bits):
        desc = Descriptor()
        desc.label_ = label
        desc.numpy_ = np.array(bits)
        return desc

    def _check_folding(bits, invalid_lengths, expected_folds):
        desc = _make_descriptor('arbitrary_fingerprint', bits)
        for length in invalid_lengths:
            with self.assertRaises(InvalidConfigurationError):
                desc.get_folded_fprint(fold_to_length=length)
        for length, expected_bits in expected_folds:
            folded = desc.get_folded_fprint(fold_to_length=length)
            self.assertTrue((folded == np.array(expected_bits)).all())

    # Case 1: a descriptor labelled 'arbitrary' cannot be folded
    unfoldable = _make_descriptor('arbitrary',
                                  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    with self.assertRaises(ValueError):
        unfoldable.get_folded_fprint(fold_to_length=2)

    # Case 2: length 6 vector, single fold to length 3
    _check_folding(
        bits=[1, 0, 1, 0, 1, 0],
        invalid_lengths=(4, 10),
        expected_folds=((3, [1, 1, 1]),),
    )

    # Case 3: length 8 vector with set bits only in the first half
    _check_folding(
        bits=[1, 0, 1, 0, 0, 0, 0, 0],
        invalid_lengths=(3, 10),
        expected_folds=((4, [1, 0, 1, 0]), (2, [1, 0])),
    )

    # Case 4: length 8 vector of all zeros
    _check_folding(
        bits=[0, 0, 0, 0, 0, 0, 0, 0],
        invalid_lengths=(3, 10),
        expected_folds=((4, [0, 0, 0, 0]), (2, [0, 0])),
    )

    # Case 5: length 8 vector of all ones
    _check_folding(
        bits=[1, 1, 1, 1, 1, 1, 1, 1],
        invalid_lengths=(3, 10),
        expected_folds=((4, [1, 1, 1, 1]), (2, [1, 1])),
    )