def test_all_supported_measures(self):
    """
    Check that every label returned by
    SimilarityMeasure.get_supported_metrics() can be used to construct a
    SimilarityMeasure without a ValueError being raised.
    """
    for metric_label in SimilarityMeasure.get_supported_metrics():
        try:
            SimilarityMeasure(metric=metric_label)
        except ValueError:
            failure_message = (
                f'Did not expect {metric_label} similarity metric to '
                f'raise ValueError')
            self.fail(failure_message)
def test_molecule_graph_similar_to_itself_morgan_dice(self):
    """
    Check that two Molecule objects built from the same SMILES string
    have a dice similarity of exactly 1 between their morgan fingerprints.

    """
    smiles = "CCO"
    descriptor_label = "morgan_fingerprint"

    def _molecule_from_smiles():
        # Builds an empty Molecule and fills it from the SMILES string.
        mol = Molecule()
        mol._set_molecule_from_smiles(smiles)
        return mol

    # Both molecules are created before either descriptor is set.
    reference = _molecule_from_smiles()
    duplicate = _molecule_from_smiles()
    for mol in (reference, duplicate):
        mol.set_descriptor(fingerprint_type=descriptor_label)

    measure = SimilarityMeasure(metric="dice")
    observed = reference.get_similarity_to(duplicate,
                                           similarity_measure=measure)
    self.assertEqual(
        observed,
        1.0,
        "Expected dice similarity to be 1 when comparing "
        "molecule graph to itself",
    )
def test_get_abcd(self):
    """
    Check SimilarityMeasure._get_abcd against hand-computed values.

    Each case is (first array, second array, expected values keyed by
    'a', 'b', 'c', 'd'). The result of _get_abcd is indexed positionally
    in that same a, b, c, d order.
    """
    measure = SimilarityMeasure('tanimoto')
    cases = (
        # Case 1
        ([1, 1, 1, 1, 1], [0, 0, 0, 0, 0],
         {'a': 0, 'b': 5, 'c': 0, 'd': 0}),
        # Case 2 (arrays of unequal length)
        ([1, 1, 1, 0], [0, 1],
         {'a': 1, 'b': 1, 'c': 0, 'd': 0}),
        # Case 3
        ([1, 0, 1, 0], [1, 0, 1, 0],
         {'a': 2, 'b': 0, 'c': 0, 'd': 2}),
        # Case 4
        ([0, 1, 0, 1], [1, 0, 1, 0],
         {'a': 0, 'b': 2, 'c': 2, 'd': 0}),
        # Case 5
        ([1, 0, 0, 1, 1], [1, 0, 1, 0, 0],
         {'a': 1, 'b': 2, 'c': 1, 'd': 1}),
    )
    for first_bits, second_bits, expected in cases:
        descriptors = []
        for bits in (first_bits, second_bits):
            descriptor = Descriptor(bits)
            descriptor.label_ = 'arbitrary_fingerprint'
            descriptors.append(descriptor)
        computed = measure._get_abcd(descriptors[0], descriptors[1])
        for position, key in enumerate('abcd'):
            self.assertEqual(
                expected[key],
                computed[position],
                f'Expected true {key} to match calculated val '
                f'for arrays {first_bits}, {second_bits}')
def test_mol_mol_similarity_w_morgan_tanimoto(self):
    """
    Check that the tanimoto similarity between the morgan fingerprints
    of two different Molecules lies within the closed interval [0, 1].

    """
    descriptor_label = "morgan_fingerprint"

    def _fingerprinted(smiles):
        # Builds a Molecule from SMILES and attaches its descriptor.
        mol = Molecule(mol_smiles=smiles)
        mol.set_descriptor(fingerprint_type=descriptor_label)
        return mol

    first, second = [_fingerprinted(smiles)
                     for smiles in ("CCCCCCCCC", "CCCCCCCCCCC")]
    measure = SimilarityMeasure(metric="tanimoto")
    observed = first.get_similarity_to(second, similarity_measure=measure)
    self.assertGreaterEqual(observed, 0.0,
                            "Expected tanimoto similarity to be >= 0.")
    self.assertLessEqual(observed, 1.0,
                         "Expected tanimoto similarity to be <= 1.")
import unittest import numpy as np import pandas as pd from rdkit.Chem import MolFromSmiles from rdkit.Chem.rdmolfiles import MolToPDBFile from sklearn.decomposition import PCA from sklearn.manifold import MDS, TSNE, Isomap, SpectralEmbedding from sklearn.preprocessing import StandardScaler from AIMSim.chemical_datastructures import Molecule, MoleculeSet from AIMSim.ops import Descriptor, SimilarityMeasure from AIMSim.exceptions import NotInitializedError, InvalidConfigurationError SUPPORTED_SIMILARITIES = SimilarityMeasure.get_supported_metrics() SUPPORTED_FPRINTS = Descriptor.get_supported_fprints() class TestMoleculeSet(unittest.TestCase): test_smarts = [ "[CH3:1][S:2][c:3]1[cH:4][cH:5][c:6]([B:7]([OH:8])[OH:9])[cH:10][cH:11]1", "[NH:1]1[CH2:2][CH2:3][O:4][CH2:5][CH2:6]1.[O:7]=[S:8]=[O:9]", ] test_smiles = [ "CCCCCCC", "CCCC", "CCC", "CO",
def __call__(
        self,
        molecule_set_configs,
        fingerprint_type=None,
        fingerprint_params=None,
        similarity_measure=None,
        subsample_subset_size=0.01,
        optim_algo='max_min',
        show_top=0,
        only_metric=True,
):
    """
    Calculate the correlation in the properties of molecules in set
    and their nearest and furthest neighbors using different
    fingerprints / similarity measure choices. Choose the best fingerprint
    and similarity measure pair (called measure choice for brevity)
    based on an optimization strategy.

    Args:
        molecule_set_configs (dict): Configurations needed to form the
            MoleculeSet. The keys 'molecule_database_src' and
            'molecule_database_src_type' are required; 'is_verbose'
            (default False) and 'n_threads' (default 1) are optional.
        fingerprint_type (str): Label to indicate which fingerprint to
            use. If supplied, fingerprint is fixed and optimization
            carried out over similarity measures. Use None to indicate
            that optimization needs to be carried out over fingerprints.
            Default is None.
        fingerprint_params (dict): Hyper-parameters for fingerprints.
            Passed to the MoleculeSet constructor. If None is passed,
            an empty dictionary is used. Ignored (replaced by an empty
            dictionary) when fingerprint_type is None.
        similarity_measure (str): Label to indicate which similarity
            measure to use. If supplied, similarity measure is fixed
            and optimization carried out over fingerprints. Use None to
            indicate that optimization needs to be carried out over
            similarity measures. Default is None.
        subsample_subset_size (float): Fraction of molecule_set to
            subsample. This is separate from the sample_ratio parameter
            used when creating a moleculeSet since it is recommended
            to have a more aggressive subsampling strategy for this task
            due to the combinatorial explosion of looking at multiple
            fingerprints and similarity measures. Default is 0.01.
        optim_algo (str): Label to indicate the optimization algorithm
            chosen. Options are:
            'max': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar).
            'min': The measure choice which minimizes the absolute value
                of property correlation
                between furthest neighbors (most dissimilar).
            'max_min': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar)
                and minimizes the absolute value of property correlation
                between furthest neighbors (most dissimilar).
                This is the default.
        show_top (int): Number of top performing measures to show in plot.
            If 0, no plots are generated and the top performer is
            returned.
        only_metric (bool): If True only similarity measures satisfying
            the metricity property
            (i.e. can be converted to distance metrics) are selected.

    Returns:
        (NamedTuple): Top performer with fields:
            fingerprint_type (str): Label for fingerprint type
            similarity_measure (str): Label for similarity measure
            nearest_neighbor_correlation (float): Correlation of property
                of molecule and its nearest neighbor.
            furthest_neighbor_correlation (float): Correlation of property
                of molecule and its furthest neighbor.
            score_ (float): Overall score based on optimization strategy.
                More is better.

    Raises:
        InvalidConfigurationError: If optim_algo is not one of 'max',
            'min' or 'max_min'.
        IndexError: If no fingerprint / similarity measure combination
            could be evaluated (all were skipped or failed).
        KeyError: If show_top > 0 and self.plot_settings has no 'colors'
            entry.

    """
    # Reject an unknown strategy before any MoleculeSet is built, instead
    # of after the first (expensive) trial has already been computed.
    if optim_algo not in ('max_min', 'max', 'min'):
        raise InvalidConfigurationError(f'{optim_algo} '
                                        f'not implemented')
    print(f'Using subsample size {subsample_subset_size} for '
          f'measure search')
    trial_ = namedtuple('trial_', [
        'fingerprint_type',
        'similarity_measure',
        'nearest_neighbor_correlation',
        'furthest_neighbor_correlation',
        'score_'
    ])
    if fingerprint_type is None:
        all_fingerprint_types = Descriptor.get_supported_fprints()
        # Caller-supplied params are dropped when searching over
        # fingerprints; they are replaced by an empty dict below.
        fingerprint_params = None
    else:
        all_fingerprint_types = [fingerprint_type]
    if similarity_measure is None:
        if only_metric:
            print('Only trying measures with valid distance metrics')
        all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
    else:
        all_similarity_measures = [similarity_measure]
    is_verbose = molecule_set_configs.get("is_verbose", False)
    if fingerprint_params is None:
        fingerprint_params = {}

    correlator = self.prop_var_w_similarity
    all_scores = []
    for measure_label in all_similarity_measures:
        if only_metric and not SimilarityMeasure(
                metric=measure_label).is_distance_metric():
            continue
        if is_verbose:
            print(f'Trying {measure_label} similarity')
        for fprint_label in all_fingerprint_types:
            if is_verbose:
                print(f'Trying {fprint_label} fingerprint')
            try:
                molecule_set = MoleculeSet(
                    molecule_database_src=molecule_set_configs[
                        'molecule_database_src'],
                    molecule_database_src_type=molecule_set_configs[
                        'molecule_database_src_type'],
                    similarity_measure=measure_label,
                    fingerprint_type=fprint_label,
                    fingerprint_params=fingerprint_params,
                    is_verbose=is_verbose,
                    n_threads=molecule_set_configs.get('n_threads', 1),
                    sampling_ratio=subsample_subset_size)
            except (InvalidConfigurationError, ValueError) as e:
                # A combination that cannot be built is skipped, not fatal.
                if is_verbose:
                    print(
                        f'Could not try {fprint_label} with '
                        f'similarity measure {measure_label} due to '
                        f'{e}')
                continue
            # Both calls return a pair; only the first element
            # (the correlation) is used for scoring.
            nearest_corr, _ = \
                correlator.get_property_correlations_in_most_similar(
                    molecule_set)
            furthest_corr, _ = \
                correlator.get_property_correlations_in_most_dissimilar(
                    molecule_set)
            if optim_algo == 'max_min':
                score_ = nearest_corr - abs(furthest_corr)
            elif optim_algo == 'max':
                score_ = nearest_corr
            else:  # 'min', guaranteed by the validation above
                score_ = -abs(furthest_corr)
            all_scores.append(
                trial_(fingerprint_type=fprint_label,
                       similarity_measure=measure_label,
                       nearest_neighbor_correlation=nearest_corr,
                       furthest_neighbor_correlation=furthest_corr,
                       score_=score_))
    # Best score first.
    all_scores.sort(key=lambda trial: trial.score_, reverse=True)
    if self.log_fpath is not None:
        print('Writing to ', self.log_fpath)
        log_data = [trial._asdict() for trial in all_scores]
        with open(self.log_fpath, "w") as fp:
            json.dump(log_data, fp)
    if not all_scores:
        # Same exception type as indexing the empty list would give,
        # but with an explanation of what went wrong.
        raise IndexError('No fingerprint / similarity measure combination '
                         'could be evaluated')
    if show_top > 0:
        top_performers = all_scores[:show_top]
        all_measures = [
            Descriptor.shorten_label(trial.fingerprint_type)
            + '\n'
            + trial.similarity_measure
            for trial in top_performers
        ]
        bar_heights = np.array([
            [trial.score_ for trial in top_performers],
            [trial.nearest_neighbor_correlation
             for trial in top_performers],
            [trial.furthest_neighbor_correlation
             for trial in top_performers],
        ])
        # Work on a copy so that 'colors' stays in self.plot_settings and
        # a later call on this instance can plot again.
        plot_settings = dict(self.plot_settings)
        colors = plot_settings.pop('colors')
        plot_multiple_barchart(x=list(range(len(top_performers))),
                               heights=bar_heights,
                               legend_labels=[
                                   'Overall scores',
                                   'Nearest neighbor property '
                                   'correlation',
                                   'Furthest neighbor property '
                                   'correlations'
                               ],
                               colors=colors,
                               xtick_labels=all_measures,
                               ylabel='Value',
                               xlabel='Measure',
                               **plot_settings)

    return all_scores[0]