def test_spectral_embedding_transform(self):
    """
    Test the unsupervised transformation of molecules in
    MoleculeSet using SpectralEmbedding.

    Random descriptors are written to a temporary CSV file and loaded
    into a MoleculeSet. The output of get_transformed_descriptors()
    with method_="spectral_embedding" is compared against the same
    descriptors passed through StandardScaler and SpectralEmbedding
    directly.
    """
    n_features = 20
    features = np.random.normal(size=(len(self.test_smiles), n_features))
    csv_fpath = self.smiles_seq_to_xl_or_csv(
        ftype="csv", feature_arr=features)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        features = StandardScaler().fit_transform(features)
        features = SpectralEmbedding().fit_transform(features)
        error_matrix = features - \
            molecule_set.get_transformed_descriptors(
                method_="spectral_embedding")
        error_threshold = 1e-6
        # NOTE(review): this only bounds the smallest signed difference,
        # not the largest absolute deviation. Consider asserting on
        # np.abs(error_matrix).max() once the embedding is confirmed to
        # be reproducible between the two code paths.
        self.assertLessEqual(
            error_matrix.min(),
            error_threshold,
            "Expected transformed molecular descriptors to be "
            "equal to SpectralEmbedding decomposed features",
        )
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_pca_transform(self):
    """
    Test the unsupervised transformation of molecules in
    MoleculeSet using Principal Component Analysis.

    Random descriptors are written to a temporary CSV file and loaded
    into a MoleculeSet. The output of get_transformed_descriptors()
    (called with its default arguments) is compared against the same
    descriptors passed through StandardScaler and a two-component PCA
    directly.
    """
    n_features = 20
    features = np.random.normal(size=(len(self.test_smiles), n_features))
    csv_fpath = self.smiles_seq_to_xl_or_csv(
        ftype="csv", feature_arr=features)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        features = StandardScaler().fit_transform(features)
        features = PCA(n_components=2).fit_transform(features)
        error_matrix = features - molecule_set.get_transformed_descriptors()
        error_threshold = 1e-6
        # NOTE(review): this only bounds the smallest signed difference,
        # not the largest absolute deviation. Consider asserting on
        # np.abs(error_matrix).max() after confirming both code paths
        # produce identical components.
        self.assertLessEqual(
            error_matrix.min(),
            error_threshold,
            "Expected transformed molecular descriptors to be "
            "equal to PCA decomposed features",
        )
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_get_most_similar_pairs(self):
    """
    Test that all combinations of fingerprint_type and similarity measure
    works with the MoleculeSet.get_most_similar_pairs() method.

    For every combination the return value must be a list whose
    elements are tuples.
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    try:
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                molecule_pairs = molecule_set.get_most_similar_pairs()
                self.assertIsInstance(
                    molecule_pairs,
                    list,
                    "Expected get_most_similar_pairs() to return list",
                )
                for pair in molecule_pairs:
                    self.assertIsInstance(
                        pair,
                        tuple,
                        "Expected elements of list "
                        "returned by get_most_similar_pairs()"
                        " to be tuples",
                    )
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_subsample_molecule_database_from_csv(self):
    """
    Test to randomly subsample a molecule database loaded from a
    CSV file.

    The subsampled database must contain
    int(sampling_ratio * number of smiles) Molecule objects.
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    try:
        sampling_ratio = 0.5
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            sampling_ratio=sampling_ratio,
            is_verbose=True,
        )
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            int(sampling_ratio * len(self.test_smiles)),
            "Expected the size of database to be equal to number "
            "of smiles * sampling_ratio",
        )
        # The index is not needed here, so iterate the members directly.
        for molecule in molecule_set.molecule_database:
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_similarity_measure_limits(self):
    """
    Test that the similarity between every pair of molecules in the
    set lies in [0, 1] for all combinations of fingerprint_type and
    similarity measure.
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    try:
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                for mol1 in molecule_set.molecule_database:
                    for mol2 in molecule_set.molecule_database:
                        similarity_ = mol1.get_similarity_to(
                            mol2, molecule_set.similarity_measure)
                        self.assertGreaterEqual(
                            similarity_, 0.,
                            "Expected similarity value "
                            "to be >= 0."
                            f"using similarity measure:"
                            f" {similarity_measure}, "
                            f"descriptor: {descriptor}"
                            f", for molecules "
                            f"{mol1.mol_text}, "
                            f"{mol2.mol_text}")
                        self.assertLessEqual(
                            similarity_, 1.,
                            "Expected similarity value to "
                            "be <= 1."
                            f"using similarity measure: "
                            f"{similarity_measure}, "
                            f"descriptor: {descriptor}, "
                            f"for molecule {mol1.mol_text}, "
                            f"{mol2.mol_text}")
    finally:
        # Remove the file that was actually created (the path returned
        # by the helper) rather than a hard-coded file name, and do so
        # even when an assertion fails.
        remove(csv_fpath)
def test_subsample_molecule_database_from_pdb_dir(self):
    """
    Test to randomly subsample a molecule database loaded from a
    directory of pdb files.

    The subsampled database must contain
    int(sampling_ratio * number of smiles) Molecule objects.
    """
    dir_path = self.smiles_seq_to_pdb_dir(self.test_smiles)
    try:
        sampling_ratio = 0.5
        molecule_set = MoleculeSet(
            molecule_database_src=dir_path,
            molecule_database_src_type="directory",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
            sampling_ratio=sampling_ratio,
        )
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from dir",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            int(sampling_ratio * len(self.test_smiles)),
            "Expected the size of subsampled database to be "
            "equal to number of files in dir * sampling_ratio",
        )
        # The index is not needed here, so iterate the members directly.
        for molecule in molecule_set.molecule_database:
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to "
                "be Molecule object",
            )
        print(f"Test complete. Deleting directory {dir_path}...")
    finally:
        # Delete the temporary directory even when an assertion fails.
        rmtree(dir_path)
def test_get_molecule_most_similar_to(self):
    """
    Test CompareTargetMolecule.get_hits_similar_to() for all
    combinations of fingerprint_type and similarity measure.

    Each SMILES string in TEST_SMILES is paired with a molecule of the
    set. The hit returned for the SMILES string must have a similarity
    to the paired molecule equal to the maximum of
    MoleculeSet.compare_against_molecule(), and the reported
    similarity must lie in [0, 1].
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    try:
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                for mol_smile, mol in zip(TEST_SMILES,
                                          molecule_set.molecule_database):
                    compare_task = CompareTargetMolecule(
                        target_molecule_smiles=mol_smile)
                    # Exactly one hit and one similarity are expected;
                    # the unpacking fails otherwise.
                    [closest_mol], [similarity] = compare_task.\
                        get_hits_similar_to(molecule_set)
                    mol_similarities = \
                        molecule_set.compare_against_molecule(mol)
                    self.assertEqual(
                        np.max(mol_similarities),
                        mol.get_similarity_to(
                            molecule_set.molecule_database[closest_mol],
                            molecule_set.similarity_measure),
                        f"Expected closest mol to have maximum "
                        f"similarity to target molecule "
                        f"using similarity measure: "
                        f"{similarity_measure}, "
                        f"descriptor: "
                        f"{descriptor}, "
                        f"for molecule {mol.mol_text}",
                    )
                    self.assertGreaterEqual(
                        similarity, 0.,
                        "Expected similarity value to "
                        "be >= 0."
                        f"using similarity measure: "
                        f"{similarity_measure}, "
                        f"descriptor: {descriptor}, "
                        f"for molecule {mol.mol_text}")
                    self.assertLessEqual(
                        similarity, 1.,
                        "Expected similarity value to "
                        "be <= 1."
                        f"using similarity measure: "
                        f"{similarity_measure}, "
                        f"descriptor: {descriptor}, "
                        f"for molecule {mol.mol_text}")
    finally:
        # The temporary file was previously never deleted by this test;
        # remove it whether or not the assertions pass.
        remove(csv_fpath)
def test_set_molecule_database_w_descriptor_property_from_csv(self):
    """
    Test to create MoleculeSet object by reading molecule database
    containing arbitrary molecular descriptors and molecular responses
    from a CSV file.

    Every molecule must carry the SMILES string, the property value and
    the descriptor vector that were written to the file.
    """
    properties = np.random.normal(size=len(self.test_smiles))
    n_features = 20
    features = np.random.normal(size=(len(self.test_smiles), n_features))
    csv_fpath = self.smiles_seq_to_xl_or_csv(
        ftype="csv", property_seq=properties, feature_arr=features
    )
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from "
            "csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of smiles in csv file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in csv",
            )
            self.assertAlmostEqual(
                molecule.mol_property_val,
                properties[idx],
                places=7,
                msg="Expected mol_property_val of"
                    "Molecule object "
                    "to be set to value in csv file",
            )
            # The comparison must actually be evaluated: the previous
            # `(a == b).all` passed the bound method itself, which is
            # always truthy. A tolerance-based comparison is used since
            # the values go through a text file, mirroring the
            # assertAlmostEqual check on the property above.
            self.assertTrue(
                np.allclose(molecule.descriptor.to_numpy(),
                            features[idx]),
                "Expected descriptor value to be same as the "
                "vector used to initialize descriptor",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_molecule_set_sim_getters(self):
    """Get the properties for most and least similar molecule pairs.

    Each molecule is given its position in self.test_smiles as its
    property, so the reference properties returned by the getters can
    be compared against fixed expected lists.
    """
    properties = np.array(list(range(len(self.test_smiles))))
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                             property_seq=properties)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        _, ref_props = molecule_set.get_property_of_most_dissimilar()
        self.assertListEqual(ref_props, [5, 5, 5, 5, 5, 0, 7, 0])
        _, ref_props = molecule_set.get_property_of_most_similar()
        self.assertListEqual(ref_props, [1, 2, 1, 4, 3, 6, 5, 3])
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_molecule_set_getters(self):
    """Retrieve names and properties of mols using MoleculeSet.

    The names must equal self.test_smiles and the properties must match
    the random values written to the temporary CSV file.
    """
    properties = np.random.normal(size=len(self.test_smiles))
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                             property_seq=properties)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertListEqual(
            self.test_smiles,
            molecule_set.get_mol_names().tolist())
        retrieved = molecule_set.get_mol_properties().tolist()
        for expected, actual in zip(properties.tolist(), retrieved):
            self.assertAlmostEqual(expected, actual)
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_invalid_transform_error(self):
    """Using an invalid or unimplemented dimensionality reduction method
    should throw an error.

    Only the call to get_transformed_descriptors() is exercised. No
    reference embedding is computed because the call is expected to
    raise InvalidConfigurationError before returning anything to
    compare against.
    """
    n_features = 20
    features = np.random.normal(size=(len(self.test_smiles), n_features))
    csv_fpath = self.smiles_seq_to_xl_or_csv(
        ftype="csv", feature_arr=features)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        with self.assertRaises(InvalidConfigurationError):
            molecule_set.get_transformed_descriptors(
                method_="not a real method")
    finally:
        # Delete the temporary file even when the assertion fails.
        remove(csv_fpath)
def test_clustering_fingerprints(self):
    """
    Test the clustering of molecules featurized by their fingerprints.

    For every combination of fingerprint_type and similarity measure:
    cluster labels are unavailable before clustering; measures that are
    distance metrics cluster into at most n_clusters labels using
    "kmedoids" (continuous measures) or "complete_linkage" (all others);
    measures that are not distance metrics raise
    InvalidConfigurationError.
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    n_clusters = 3
    try:
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=True,
                )
                with self.assertRaises(NotInitializedError):
                    molecule_set.get_cluster_labels()
                measure = molecule_set.similarity_measure
                if not measure.is_distance_metric():
                    with self.assertRaises(InvalidConfigurationError):
                        molecule_set.cluster(n_clusters=n_clusters)
                    continue
                molecule_set.cluster(n_clusters=n_clusters)
                self.assertLessEqual(
                    len(set(molecule_set.get_cluster_labels())),
                    n_clusters,
                    "Expected number of cluster labels to be "
                    "less than equal to number of clusters",
                )
                if molecule_set.similarity_measure.type_ == "continuous":
                    self.assertEqual(
                        str(molecule_set.clusters_),
                        "kmedoids",
                        f"Expected kmedoids clustering for "
                        f"similarity: {similarity_measure}",
                    )
                else:
                    self.assertEqual(
                        str(molecule_set.clusters_),
                        "complete_linkage",
                        f"Expected complete_linkage clustering"
                        f"for similarity: {similarity_measure}",
                    )
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_set_molecule_database_w_property_from_excel(self):
    """
    Test to create MoleculeSet object by reading molecule database
    and molecular responses from an Excel file.

    Every molecule must carry the SMILES string and the property value
    that were written to the file.
    """
    properties = np.random.normal(size=len(self.test_smiles))
    xl_fpath = self.smiles_seq_to_xl_or_csv(ftype="excel",
                                            property_seq=properties)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=xl_fpath,
            molecule_database_src_type="excel",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from excel file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of smiles in excel file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in excel",
            )
            self.assertAlmostEqual(
                molecule.mol_property_val,
                properties[idx],
                places=7,
                msg="Expected mol_property_val of"
                    "Molecule object "
                    "to be set to value in excel file",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {xl_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(xl_fpath)
def test_set_molecule_database_from_smarts_file(self):
    """
    Test to create MoleculeSet object by reading molecule database
    from a SMILES file containing SMARTS strings.

    Every molecule must carry the corresponding entry of
    self.test_smarts as its text and have no property value.
    """
    text_fpath = self.smarts_seq_to_smiles_file()
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=text_fpath,
            molecule_database_src_type="text",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from text",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smarts),
            "Expected the size of database to be equal to number "
            "of smiles in text file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smarts[idx],
                "Expected mol_text attribute of Molecule object to be smiles",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object "
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to "
                "be Molecule object",
            )
        print(f"Test complete. Deleting file {text_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(text_fpath)
def test_set_molecule_database_from_csv(self):
    """
    Test to create MoleculeSet object by reading molecule database
    from a CSV file that holds no molecular responses.

    Every molecule must carry the corresponding SMILES string as its
    text and have no property value.
    """
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from "
            "csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of smiles",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in csv",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object"
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_set_molecule_database_from_pdb_dir(self):
    """
    Test to create MoleculeSet object by reading molecule database
    from a directory of pdb files.

    Every molecule's text must be one of self.test_smiles (membership
    only; the order is not checked) and its property value must be
    None.
    """
    dir_path = self.smiles_seq_to_pdb_dir(self.test_smiles)
    try:
        molecule_set = MoleculeSet(
            molecule_database_src=dir_path,
            molecule_database_src_type="directory",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from dir",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of files in dir",
        )
        for molecule in molecule_set.molecule_database:
            self.assertIn(
                molecule.mol_text,
                self.test_smiles,
                "Expected molecule text to be a smiles string",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object"
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to "
                "be Molecule object",
            )
        print(f"Test complete. Deleting directory {dir_path}...")
    finally:
        # Delete the temporary directory even when an assertion fails.
        rmtree(dir_path)
def test_set_molecule_database_fingerprint_from_csv(self):
    """
    Verify that a TypeError is raised if no similarity_measure
    is specified when instantiating a MoleculeSet object.

    The check is repeated for every fingerprint in SUPPORTED_FPRINTS.
    """
    properties = np.random.normal(size=len(self.test_smiles))
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                             property_seq=properties)
    try:
        for descriptor in SUPPORTED_FPRINTS:
            with self.assertRaises(TypeError):
                MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    is_verbose=False,
                )
        print(f"Test complete. Deleting file {csv_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def test_set_molecule_database_w_fingerprint_similarity_from_csv(self):
    """
    Test all combinations of fingerprints and similarity measures with
    the MoleculeSet class.

    For every combination the molecule database, the descriptor of each
    molecule and the similarity matrix must be set.
    """
    properties = np.random.normal(size=len(self.test_smiles))
    csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                             property_seq=properties)
    try:
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                self.assertFalse(
                    molecule_set.is_verbose,
                    "Expected is_verbose to be False"
                )
                self.assertIsNotNone(
                    molecule_set.molecule_database,
                    "Expected molecule_database to be set from csv file",
                )
                for molecule in molecule_set.molecule_database:
                    self.assertTrue(
                        molecule.descriptor.check_init(),
                        "Expected descriptor to be set",
                    )
                self.assertIsNotNone(
                    molecule_set.similarity_matrix,
                    "Expected similarity_matrix to be set",
                )
        print(f"Test complete. Deleting file {csv_fpath}...")
    finally:
        # Delete the temporary file even when an assertion fails.
        remove(csv_fpath)
def __call__(
        self,
        molecule_set_configs,
        fingerprint_type=None,
        fingerprint_params=None,
        similarity_measure=None,
        subsample_subset_size=0.01,
        optim_algo='max_min',
        show_top=0,
        only_metric=True,
):
    """
    Calculate the correlation in the properties of molecules in set
    and their nearest and furthest neighbors using different
    fingerprints / similarity measure choices. Choose the best fingerprint
    and similarity measure pair (called measure choice for brevity)
    based on an optimization strategy.

    Args:
        molecule_set_configs (dict): All configurations (except
            fingerprint_type, fingerprint_params and similarity_measure)
            needed to form the moleculeSet. The keys
            'molecule_database_src' and 'molecule_database_src_type' are
            required; 'is_verbose' (default False) and 'n_threads'
            (default 1) are optional.
        fingerprint_type (str): Label to indicate which fingerprint to
            use. If supplied, fingerprint is fixed and optimization
            carried out over similarity measures. Use None to indicate
            that optimization needs to be carried out over fingerprints.
            Default is None.
        fingerprint_params (dict): Hyper-parameters for fingerprints.
            Passed to the MoleculeSet constructor. If None is passed,
            set to empty dictionary before passing to MoleculeSet.
            Ignored (reset to an empty dictionary) when fingerprint_type
            is None.
        similarity_measure (str): Label to indicate which similarity
            measure to use. If supplied, similarity measure is fixed
            and optimization carried out over fingerprints.
            Use None to indicate that optimization needs to be carried
            out over similarity measures. Default is None.
        subsample_subset_size (float): Fraction of molecule_set to
            subsample. This is separate from the sampling_ratio
            parameter used when creating a moleculeSet since it is
            recommended to have a more aggressive subsampling strategy
            for this task due to the combinatorial explosion of looking
            at multiple fingerprints and similarity measures.
            Default is 0.01.
        optim_algo (str): Label to indicate the optimization algorithm
            chosen. Options are:
            'max': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar).
            'min': The measure choice which minimizes the absolute value
                of property correlation
                between furthest neighbors (most dissimilar).
            'max_min': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar)
                and minimizes the absolute value of property correlation
                between furthest neighbors (most dissimilar).
                This is the default.
        show_top (int): Number of top performing measures to show in plot.
            If 0, no plots are generated and the top performer is
            returned.
        only_metric (bool): If True only similarity measures satisfying
            the metricity property
            (i.e. can be converted to distance metrics) are selected.

    Returns:
        (NamedTuple): Top performer with fields:
            fingerprint_type (str): Label for fingerprint type
            similarity_measure (str): Label for similarity measure
            nearest_neighbor_correlation (float): Correlation of property
                of molecule and its nearest neighbor.
            furthest_neighbor_correlation (float): Correlation of
                property of molecule and its furthest neighbor.
            score_ (float): Overall score based on optimization strategy.
                More is better.

    Raises:
        InvalidConfigurationError: If optim_algo is not one of
            'max_min', 'max' or 'min'.
        IndexError: If no measure choice could be scored, so there is no
            top performer to return.

    """
    print(f'Using subsample size {subsample_subset_size} for '
          f'measure search')
    # Reject an unknown strategy before any (expensive) MoleculeSet is
    # built instead of after the first trial has been evaluated.
    if optim_algo not in ('max_min', 'max', 'min'):
        raise InvalidConfigurationError(f'{optim_algo} '
                                        f'not implemented')
    trial_ = namedtuple('trial_', [
        'fingerprint_type',
        'similarity_measure',
        'nearest_neighbor_correlation',
        'furthest_neighbor_correlation',
        'score_',
    ])
    if fingerprint_type is None:
        all_fingerprint_types = Descriptor.get_supported_fprints()
        # Parameters given for one fingerprint cannot be shared across
        # all candidate fingerprints, so they are dropped.
        fingerprint_params = None
    else:
        all_fingerprint_types = [fingerprint_type]
    if similarity_measure is None:
        if only_metric:
            print('Only trying measures with valid distance metrics')
        all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
    else:
        all_similarity_measures = [similarity_measure]
    is_verbose = molecule_set_configs.get("is_verbose", False)
    if fingerprint_params is None:
        fingerprint_params = {}

    all_scores = []
    for measure in all_similarity_measures:
        if only_metric and not SimilarityMeasure(
                metric=measure).is_distance_metric():
            continue
        if is_verbose:
            print(f'Trying {measure} similarity')
        for fprint in all_fingerprint_types:
            if is_verbose:
                print(f'Trying {fprint} fingerprint')
            try:
                molecule_set = MoleculeSet(
                    molecule_database_src=molecule_set_configs[
                        'molecule_database_src'],
                    molecule_database_src_type=molecule_set_configs[
                        'molecule_database_src_type'],
                    similarity_measure=measure,
                    fingerprint_type=fprint,
                    fingerprint_params=fingerprint_params,
                    is_verbose=is_verbose,
                    n_threads=molecule_set_configs.get('n_threads', 1),
                    sampling_ratio=subsample_subset_size)
            except (InvalidConfigurationError, ValueError) as e:
                # Combinations that cannot be built are skipped, not
                # fatal: the search continues with the next fingerprint.
                if is_verbose:
                    print(
                        f'Could not try {fprint} with '
                        f'similarity measure {measure} due to '
                        f'{e}')
                continue
            # Only the correlations are scored; p-values are discarded.
            nearest_corr, _ = self.prop_var_w_similarity. \
                get_property_correlations_in_most_similar(
                    molecule_set)
            furthest_corr, _ = self.prop_var_w_similarity. \
                get_property_correlations_in_most_dissimilar(
                    molecule_set)
            if optim_algo == 'max_min':
                score_ = nearest_corr - abs(furthest_corr)
            elif optim_algo == 'max':
                score_ = nearest_corr
            else:
                # 'min': the only remaining validated option.
                score_ = -abs(furthest_corr)
            all_scores.append(
                trial_(fingerprint_type=fprint,
                       similarity_measure=measure,
                       nearest_neighbor_correlation=nearest_corr,
                       furthest_neighbor_correlation=furthest_corr,
                       score_=score_))
    # Best score first; score_ is the last field of the namedtuple.
    all_scores.sort(key=lambda x: x[-1], reverse=True)
    if self.log_fpath is not None:
        print('Writing to ', self.log_fpath)
        log_data = [trial._asdict() for trial in all_scores]
        with open(self.log_fpath, "w") as fp:
            json.dump(log_data, fp)
    if show_top > 0:
        top_performers = all_scores[:show_top]
        top_scores = [trial.score_ for trial in top_performers]
        all_nearest_neighbor_correlations = [
            trial.nearest_neighbor_correlation
            for trial in top_performers
        ]
        all_furthest_neighbor_correlations = [
            trial.furthest_neighbor_correlation
            for trial in top_performers
        ]
        all_measures = [
            Descriptor.shorten_label(trial.fingerprint_type)
            + '\n' + trial.similarity_measure
            for trial in top_performers
        ]
        bar_heights = np.array([
            top_scores,
            all_nearest_neighbor_correlations,
            all_furthest_neighbor_correlations,
        ])
        # Work on a copy so that 'colors' stays in self.plot_settings
        # and a later call that plots does not fail with KeyError.
        plot_settings = dict(self.plot_settings)
        colors = plot_settings.pop('colors')
        plot_multiple_barchart(x=list(range(len(top_performers))),
                               heights=bar_heights,
                               legend_labels=[
                                   'Overall scores',
                                   'Nearest neighbor property '
                                   'correlation',
                                   'Furthest neighbor property '
                                   'correlations'
                               ],
                               colors=colors,
                               xtick_labels=all_measures,
                               ylabel='Value',
                               xlabel='Measure',
                               **plot_settings)
    return all_scores[0]
def _initialize_molecule_set(self, molecule_set_configs):
    """Initialize molecule_set attribute to a MoleculeSet object
    based on parameters in the config file.

    If either the similarity measure or the fingerprint type is set to
    'determine' (the default for both), a MeasureSearch is run first and
    its best measure choice is used to build the MoleculeSet.

    Args:
        molecule_set_configs (dict): Configurations for initializing
            the MoleculeSet object. Keys read:
            'molecule_database' (required),
            'molecule_database_source_type' (required),
            'is_verbose' (default False),
            'n_workers' (default 1),
            'similarity_measure' (default 'determine'),
            'fingerprint_type' (default 'determine'),
            'fingerprint_params' (default {}),
            'measure_id_subsample' (default 0.05, measure search only),
            'only_valid_dist' (default True, measure search only),
            'sampling_ratio' (default 1.).

    Raises:
        InvalidConfigurationError: If 'molecule_database' or
            'molecule_database_source_type' is missing or None.

    """
    molecule_database_src = molecule_set_configs.get(
        "molecule_database", None)
    database_src_type = molecule_set_configs.get(
        "molecule_database_source_type", None)
    if molecule_database_src is None or database_src_type is None:
        print("molecule_database fields not set in config file")
        print(f"molecule_database: {molecule_database_src}")
        print(f"molecule_database_source_type: {database_src_type}")
        raise InvalidConfigurationError(
            "molecule_database fields not set in config file")

    is_verbose = molecule_set_configs.get("is_verbose", False)
    n_threads = molecule_set_configs.get("n_workers", 1)
    similarity_measure = molecule_set_configs.get("similarity_measure",
                                                  'determine')
    fingerprint_type = molecule_set_configs.get('fingerprint_type',
                                                'determine')
    fingerprint_params = molecule_set_configs.get('fingerprint_params', {})

    if similarity_measure == 'determine' or fingerprint_type == 'determine':
        subsample_subset_size = molecule_set_configs.get(
            'measure_id_subsample', 0.05)
        # Read unconditionally so the value is always bound for the
        # search call below, whichever of the two is being determined.
        only_valid_dist = molecule_set_configs.get('only_valid_dist', True)
        if is_verbose:
            print('Determining best fingerprint_type / similarity_measure')
        measure_search = MeasureSearch(correlation_type='pearson')
        # None tells the search to optimize over that choice.
        if similarity_measure == 'determine':
            similarity_measure = None
        if fingerprint_type == 'determine':
            fingerprint_type = None
            fingerprint_params = {}
        measure_search_molset_configs = {
            'molecule_database_src': molecule_database_src,
            'molecule_database_src_type': database_src_type,
            'is_verbose': is_verbose,
            'n_threads': n_threads,
        }
        best_measure = measure_search(
            molecule_set_configs=measure_search_molset_configs,
            similarity_measure=similarity_measure,
            fingerprint_type=fingerprint_type,
            fingerprint_params=fingerprint_params,
            subsample_subset_size=subsample_subset_size,
            show_top=5,
            only_metric=only_valid_dist)
        similarity_measure = best_measure.similarity_measure
        fingerprint_type = best_measure.fingerprint_type
        print(f'Chosen measure: {fingerprint_type} '
              f'and {similarity_measure}.')

    sampling_ratio = molecule_set_configs.get("sampling_ratio", 1.)
    print(f'Choosing sampling ratio of {sampling_ratio} for tasks')
    self.molecule_set = MoleculeSet(
        molecule_database_src=molecule_database_src,
        molecule_database_src_type=database_src_type,
        similarity_measure=similarity_measure,
        fingerprint_type=fingerprint_type,
        fingerprint_params=fingerprint_params,
        is_verbose=is_verbose,
        n_threads=n_threads,
        sampling_ratio=sampling_ratio,
    )