Ejemplo n.º 1
0
    def test_spectral_embedding_transform(self):
        """
        Test the unsupervised transformation of molecules in
        MoleculeSet using spectral embedding.

        Random descriptors are written to a CSV file and loaded into a
        MoleculeSet. The output of
        get_transformed_descriptors(method_="spectral_embedding") is then
        compared with StandardScaler followed by SpectralEmbedding applied
        to the same descriptor array.

        """
        n_features = 20
        features = np.random.normal(size=(len(self.test_smiles), n_features))
        csv_fpath = self.smiles_seq_to_xl_or_csv(
            ftype="csv", feature_arr=features)
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        features = StandardScaler().fit_transform(features)
        features = SpectralEmbedding().fit_transform(features)
        error_matrix = features - \
            molecule_set.get_transformed_descriptors(method_="spectral_embedding")
        error_threshold = 1e-6
        # NOTE(review): this only bounds the smallest signed difference, so
        # it is a weak check. Tightening it to the largest absolute
        # difference presumably needs both embeddings to be seeded
        # identically -- confirm before changing.
        self.assertLessEqual(
            error_matrix.min(),
            error_threshold,
            "Expected transformed molecular descriptors to be "
            "equal to SpectralEmbedding decomposed features",
        )
Ejemplo n.º 2
0
    def test_pca_transform(self):
        """
        Test the unsupervised transformation of molecules in
        MoleculeSet using Principal Component Analysis.

        Random descriptors are written to a CSV file and loaded into a
        MoleculeSet. The output of get_transformed_descriptors() is then
        compared element-wise with StandardScaler followed by a
        two-component PCA applied to the same descriptor array.

        """
        n_features = 20
        features = np.random.normal(size=(len(self.test_smiles), n_features))
        csv_fpath = self.smiles_seq_to_xl_or_csv(
            ftype="csv", feature_arr=features)
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        features = StandardScaler().fit_transform(features)
        features = PCA(n_components=2).fit_transform(features)
        error_matrix = features - molecule_set.get_transformed_descriptors()
        error_threshold = 1e-6
        # Compare the largest absolute deviation. Checking the minimum of
        # the signed differences would pass whenever any single entry is
        # negative, regardless of how far apart the two results are.
        self.assertLessEqual(
            np.abs(error_matrix).max(),
            error_threshold,
            "Expected transformed molecular descriptors to be "
            "equal to PCA decomposed features",
        )
Ejemplo n.º 3
0
    def test_get_most_similar_pairs(self):
        """
        Test that all combinations of fingerprint_type and similarity measure
        work with the MoleculeSet.get_most_similar_pairs() method.

        For every combination the return value must be a list whose elements
        are tuples.

        """
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
        # Registered right away so the temporary file is deleted even when
        # one of the combinations fails.
        self.addCleanup(remove, csv_fpath)
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                molecule_pairs = molecule_set.get_most_similar_pairs()
                # Name the failing combination in the message; the loop
                # otherwise gives no hint which one broke.
                combination = (f" (descriptor: {descriptor}, "
                               f"similarity measure: {similarity_measure})")
                self.assertIsInstance(
                    molecule_pairs,
                    list,
                    "Expected get_most_similar_pairs() to return list"
                    + combination,
                )
                for pair in molecule_pairs:
                    self.assertIsInstance(
                        pair,
                        tuple,
                        "Expected elements of list "
                        "returned by get_most_similar_pairs()"
                        " to be tuples" + combination,
                    )
Ejemplo n.º 4
0
    def test_subsample_molecule_database_from_csv(self):
        """
        Test to randomly subsample a molecule database loaded from a
        CSV file.

        The MoleculeSet is built with sampling_ratio=0.5 and must contain
        int(sampling_ratio * number of test smiles) Molecule objects.

        """
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        sampling_ratio = 0.5
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            sampling_ratio=sampling_ratio,
            is_verbose=True,
        )
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            int(sampling_ratio * len(self.test_smiles)),
            "Expected the size of database to be equal to number "
            "of smiles * sampling_ratio",
        )
        # The position of a molecule is irrelevant here, so iterate directly
        # instead of binding an unused index that shadows the builtin id().
        for molecule in molecule_set.molecule_database:
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
Ejemplo n.º 5
0
 def test_similarity_measure_limits(self):
     """
     Test that pairwise similarities stay within the interval [0, 1].

     For every combination of fingerprint type and similarity measure, the
     similarity between each ordered pair of molecules in the set
     (including a molecule with itself) must be >= 0 and <= 1.

     """
     csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
     # Delete the path the helper actually returned rather than a hard-coded
     # file name, and do so even when an assertion below fails.
     self.addCleanup(remove, csv_fpath)
     for descriptor in SUPPORTED_FPRINTS:
         for similarity_measure in SUPPORTED_SIMILARITIES:
             molecule_set = MoleculeSet(
                 molecule_database_src=csv_fpath,
                 molecule_database_src_type="csv",
                 fingerprint_type=descriptor,
                 similarity_measure=similarity_measure,
                 is_verbose=False,
             )
             for mol1 in molecule_set.molecule_database:
                 for mol2 in molecule_set.molecule_database:
                     similarity_ = mol1.get_similarity_to(
                         mol2, molecule_set.similarity_measure)
                     # Shared tail of both failure messages; the leading
                     # space keeps it separated from the sentence before it.
                     context = (
                         f" using similarity measure: {similarity_measure}, "
                         f"descriptor: {descriptor}, "
                         f"for molecules {mol1.mol_text}, {mol2.mol_text}"
                     )
                     self.assertGreaterEqual(
                         similarity_, 0.,
                         "Expected similarity value to be >= 0." + context)
                     self.assertLessEqual(
                         similarity_, 1.,
                         "Expected similarity value to be <= 1." + context)
Ejemplo n.º 6
0
    def test_subsample_molecule_database_from_pdb_dir(self):
        """
        Test to randomly subsample a molecule database loaded from a
        directory of pdb files.

        The MoleculeSet is built with sampling_ratio=0.5 and must contain
        int(sampling_ratio * number of test smiles) Molecule objects.

        """
        dir_path = self.smiles_seq_to_pdb_dir(self.test_smiles)
        # Registered right away so the temporary directory is deleted even
        # when an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(rmtree, dir_path)
        sampling_ratio = 0.5
        molecule_set = MoleculeSet(
            molecule_database_src=dir_path,
            molecule_database_src_type="directory",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
            sampling_ratio=sampling_ratio,
        )
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from dir",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            int(sampling_ratio * len(self.test_smiles)),
            "Expected the size of subsampled database to be "
            "equal to number of files in dir * sampling_ratio",
        )
        # The position of a molecule is irrelevant here, so iterate directly
        # instead of binding an unused index that shadows the builtin id().
        for molecule in molecule_set.molecule_database:
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to " "be Molecule object",
            )
        print(f"Test complete. Deleting directory {dir_path}...")
Ejemplo n.º 7
0
 def test_get_molecule_most_similar_to(self):
     """
     Test that CompareTargetMolecule reports the most similar molecule.

     For every combination of fingerprint type and similarity measure, each
     test SMILES is used as the target. The hit returned by
     get_hits_similar_to() must have the maximum similarity found by
     MoleculeSet.compare_against_molecule(), and the reported similarity
     must be >= 0 and <= 1.

     """
     # Imported locally so this test does not depend on a module-level import.
     from os import remove

     csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
     # The generated file is deleted when the test finishes, whether it
     # passes or fails.
     self.addCleanup(remove, csv_fpath)
     for descriptor in SUPPORTED_FPRINTS:
         for similarity_measure in SUPPORTED_SIMILARITIES:
             molecule_set = MoleculeSet(
                 molecule_database_src=csv_fpath,
                 molecule_database_src_type="csv",
                 fingerprint_type=descriptor,
                 similarity_measure=similarity_measure,
                 is_verbose=False,
             )
             for mol_smile, mol in zip(TEST_SMILES,
                                       molecule_set.molecule_database):
                 compare_task = CompareTargetMolecule(
                     target_molecule_smiles=mol_smile)
                 [closest_mol], [similarity] = compare_task.\
                     get_hits_similar_to(molecule_set)
                 mol_similarities = molecule_set.compare_against_molecule(
                     mol)
                 # Shared tail of the failure messages; the leading space
                 # keeps it separated from the sentence before it.
                 context = (
                     f" using similarity measure: {similarity_measure}, "
                     f"descriptor: {descriptor}, "
                     f"for molecule {mol.mol_text}"
                 )
                 self.assertEqual(
                     np.max(mol_similarities),
                     mol.get_similarity_to(
                         molecule_set.molecule_database[closest_mol],
                         molecule_set.similarity_measure),
                     "Expected closest mol to have maximum "
                     "similarity to target molecule" + context,
                 )
                 self.assertGreaterEqual(
                     similarity, 0.,
                     "Expected similarity value to be >= 0." + context)
                 self.assertLessEqual(
                     similarity, 1.,
                     "Expected similarity value to be <= 1." + context)
Ejemplo n.º 8
0
    def test_set_molecule_database_w_descriptor_property_from_csv(self):
        """
        Test to create MoleculeSet object by reading molecule database
        containing arbitrary molecular descriptors and molecular responses
        from a CSV file.

        Each loaded molecule must keep its SMILES text, its property value
        and its descriptor vector as written to the file.

        """
        properties = np.random.normal(size=len(self.test_smiles))
        n_features = 20
        features = np.random.normal(size=(len(self.test_smiles), n_features))
        csv_fpath = self.smiles_seq_to_xl_or_csv(
            ftype="csv", property_seq=properties, feature_arr=features
        )
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of smiles in csv file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in csv",
            )
            self.assertAlmostEqual(
                molecule.mol_property_val,
                properties[idx],
                places=7,
                msg="Expected mol_property_val of "
                "Molecule object "
                "to be set to value in csv file",
            )
            # Actually evaluate the comparison: a bare ``.all`` attribute is
            # a bound method and therefore always truthy. allclose tolerates
            # rounding from the text round trip through the CSV file.
            self.assertTrue(
                np.allclose(molecule.descriptor.to_numpy(), features[idx]),
                "Expected descriptor value to be same as the "
                "vector used to initialize descriptor",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
Ejemplo n.º 9
0
    def test_molecule_set_sim_getters(self):
        """Get the properties for most and least similar molecule pairs.

        Each molecule's property is set to its own position in
        self.test_smiles, so the reference properties returned by
        get_property_of_most_dissimilar() and get_property_of_most_similar()
        can be compared against fixed expected lists.
        """
        properties = np.arange(len(self.test_smiles))
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                                 property_seq=properties)
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        _, ref_props = molecule_set.get_property_of_most_dissimilar()
        self.assertListEqual(ref_props, [5, 5, 5, 5, 5, 0, 7, 0])

        _, ref_props = molecule_set.get_property_of_most_similar()
        self.assertListEqual(ref_props, [1, 2, 1, 4, 3, 6, 5, 3])
Ejemplo n.º 10
0
    def test_molecule_set_getters(self):
        """Retrieve names and properties of mols using MoleculeSet.

        get_mol_names() must return the test SMILES in order and
        get_mol_properties() must return the property values written to
        the CSV file.
        """
        properties = np.random.normal(size=len(self.test_smiles))
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                                 property_seq=properties)
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )

        self.assertListEqual(
            self.test_smiles, molecule_set.get_mol_names().tolist())

        expected_props = properties.tolist()
        actual_props = molecule_set.get_mol_properties().tolist()
        # zip() stops at the shorter sequence, so check the lengths first;
        # otherwise missing properties would go unnoticed.
        self.assertEqual(len(expected_props), len(actual_props))
        for expected, actual in zip(expected_props, actual_props):
            self.assertAlmostEqual(expected, actual)
Ejemplo n.º 11
0
    def test_invalid_transform_error(self):
        """Using an invalid or unimplemented dimensionality reduction method
        should throw an error.

        A MoleculeSet is built from random descriptors and
        get_transformed_descriptors() is called with an unknown method_
        label, which must raise InvalidConfigurationError.
        """
        n_features = 20
        features = np.random.normal(size=(len(self.test_smiles), n_features))
        csv_fpath = self.smiles_seq_to_xl_or_csv(
            ftype="csv", feature_arr=features)
        # Registered right away so the temporary file is deleted even when
        # the expected exception is not raised.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            similarity_measure="l0_similarity",
            is_verbose=True,
        )
        # No reference embedding is computed: the call is expected to raise
        # before any comparison could take place.
        with self.assertRaises(InvalidConfigurationError):
            molecule_set.get_transformed_descriptors(
                method_="not a real method")
Ejemplo n.º 12
0
    def test_clustering_fingerprints(self):
        """
        Test the clustering of molecules featurized by their fingerprints.

        For every combination of fingerprint type and similarity measure:
        cluster labels must be unavailable before clustering; measures that
        are distance metrics must cluster into at most n_clusters labels,
        using "kmedoids" for continuous measures and "complete_linkage"
        otherwise; measures that are not distance metrics must raise
        InvalidConfigurationError.

        """
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
        # Registered right away so the temporary file is deleted even when
        # one of the combinations fails.
        self.addCleanup(remove, csv_fpath)
        n_clusters = 3
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=True,
                )
                with self.assertRaises(NotInitializedError):
                    molecule_set.get_cluster_labels()
                if molecule_set.similarity_measure.is_distance_metric():
                    molecule_set.cluster(n_clusters=n_clusters)
                    self.assertLessEqual(
                        len(set(molecule_set.get_cluster_labels())),
                        n_clusters,
                        "Expected number of cluster labels to be "
                        "less than equal to number of clusters",
                    )
                    if molecule_set.similarity_measure.type_ == "continuous":
                        self.assertEqual(
                            str(molecule_set.clusters_),
                            "kmedoids",
                            f"Expected kmedoids clustering for "
                            f"similarity: {similarity_measure}",
                        )
                    else:
                        self.assertEqual(
                            str(molecule_set.clusters_),
                            "complete_linkage",
                            f"Expected complete_linkage clustering "
                            f"for similarity: {similarity_measure}",
                        )
                else:
                    with self.assertRaises(InvalidConfigurationError):
                        molecule_set.cluster(n_clusters=n_clusters)
Ejemplo n.º 13
0
    def test_set_molecule_database_w_property_from_excel(self):
        """
        Test to create MoleculeSet object by reading molecule database
        and molecular responses from an Excel file.

        Each loaded molecule must keep its SMILES text and the property
        value written to the file.

        """
        properties = np.random.normal(size=len(self.test_smiles))
        xl_fpath = self.smiles_seq_to_xl_or_csv(ftype="excel",
                                                property_seq=properties)
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, xl_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=xl_fpath,
            molecule_database_src_type="excel",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from excel file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number "
            "of smiles in excel file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in excel",
            )
            self.assertAlmostEqual(
                molecule.mol_property_val,
                properties[idx],
                places=7,
                msg="Expected mol_property_val of "
                "Molecule object "
                "to be set to value in excel file",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        # Reported once after the loop, not once per molecule.
        print(f"Test complete. Deleting file {xl_fpath}...")
Ejemplo n.º 14
0
    def test_set_molecule_database_from_smarts_file(self):
        """
        Test to create MoleculeSet object by reading molecule database
        from a SMILES file containing SMARTS strings.

        Each loaded molecule must keep its SMARTS text and have no property
        value.

        """
        text_fpath = self.smarts_seq_to_smiles_file()
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, text_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=text_fpath,
            molecule_database_src_type="text",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from text",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smarts),
            "Expected the size of database to be equal to number "
            "of smiles in text file",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smarts[idx],
                "Expected mol_text attribute of Molecule object to be smiles",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object "
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to " "be Molecule object",
            )
        print(f"Test complete. Deleting file {text_fpath}...")
Ejemplo n.º 15
0
    def test_set_molecule_database_from_csv(self):
        """
        Test to create MoleculeSet object by reading molecule database
        from a CSV file that contains no molecular responses.

        Each loaded molecule must keep its SMILES text and have no property
        value.

        """
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv")
        # Registered right away so the temporary file is deleted even when
        # an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(remove, csv_fpath)
        molecule_set = MoleculeSet(
            molecule_database_src=csv_fpath,
            molecule_database_src_type="csv",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from " "csv file",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number " "of smiles",
        )
        for idx, molecule in enumerate(molecule_set.molecule_database):
            self.assertEqual(
                molecule.mol_text,
                self.test_smiles[idx],
                "Expected mol_text attribute of Molecule object "
                "to be smiles when names not present in csv",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object "
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to be Molecule object",
            )
        print(f"Test complete. Deleting file {csv_fpath}...")
Ejemplo n.º 16
0
    def test_set_molecule_database_from_pdb_dir(self):
        """
        Test to create MoleculeSet object by reading molecule database
        from a directory of pdb files.

        Every loaded molecule's text must be one of the test SMILES (order
        is not checked) and it must have no property value.

        """
        dir_path = self.smiles_seq_to_pdb_dir(self.test_smiles)
        # Registered right away so the temporary directory is deleted even
        # when an assertion (or the MoleculeSet constructor) fails.
        self.addCleanup(rmtree, dir_path)
        molecule_set = MoleculeSet(
            molecule_database_src=dir_path,
            molecule_database_src_type="directory",
            fingerprint_type="morgan_fingerprint",
            similarity_measure="tanimoto",
            is_verbose=True,
        )
        self.assertTrue(molecule_set.is_verbose,
                        "Expected is_verbose to be True")
        self.assertIsNotNone(
            molecule_set.molecule_database,
            "Expected molecule_database to be set from dir",
        )
        self.assertEqual(
            len(molecule_set.molecule_database),
            len(self.test_smiles),
            "Expected the size of database to be equal to number " "of files in dir",
        )
        for molecule in molecule_set.molecule_database:
            self.assertIn(
                molecule.mol_text,
                self.test_smiles,
                "Expected molecule text to be a smiles string",
            )
            self.assertIsNone(
                molecule.mol_property_val,
                "Expected mol_property_val of Molecule object "
                "initialized without property to be None",
            )
            self.assertIsInstance(
                molecule,
                Molecule,
                "Expected member of molecule_set to " "be Molecule object",
            )
        print(f"Test complete. Deleting directory {dir_path}...")
Ejemplo n.º 17
0
    def test_set_molecule_database_fingerprint_from_csv(self):
        """
        Verify that a TypeError is raised if no similarity_measure
        is specified when instantiating a MoleculeSet object.

        The check is repeated for every supported fingerprint type.

        """
        properties = np.random.normal(size=len(self.test_smiles))
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                                 property_seq=properties)
        # Registered right away so the temporary file is deleted even when
        # the expected exception is not raised.
        self.addCleanup(remove, csv_fpath)
        for descriptor in SUPPORTED_FPRINTS:
            with self.assertRaises(TypeError):
                MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    is_verbose=False,
                )

        print(f"Test complete. Deleting file {csv_fpath}...")
Ejemplo n.º 18
0
    def test_set_molecule_database_w_fingerprint_similarity_from_csv(self):
        """
        Test all combinations of fingerprints and similarity measures with the
        MoleculeSet class.

        For every combination the molecule database must be populated, each
        molecule's descriptor must report itself as initialized, and the
        similarity matrix must be set.

        """
        properties = np.random.normal(size=len(self.test_smiles))
        csv_fpath = self.smiles_seq_to_xl_or_csv(ftype="csv",
                                                 property_seq=properties)
        # Registered right away so the temporary file is deleted even when
        # one of the combinations fails.
        self.addCleanup(remove, csv_fpath)
        for descriptor in SUPPORTED_FPRINTS:
            for similarity_measure in SUPPORTED_SIMILARITIES:
                molecule_set = MoleculeSet(
                    molecule_database_src=csv_fpath,
                    molecule_database_src_type="csv",
                    fingerprint_type=descriptor,
                    similarity_measure=similarity_measure,
                    is_verbose=False,
                )
                self.assertFalse(
                    molecule_set.is_verbose, "Expected is_verbose to be False"
                )
                self.assertIsNotNone(
                    molecule_set.molecule_database,
                    "Expected molecule_database to be set from csv file",
                )
                for molecule in molecule_set.molecule_database:
                    self.assertTrue(
                        molecule.descriptor.check_init(),
                        "Expected descriptor to be set",
                    )
                self.assertIsNotNone(
                    molecule_set.similarity_matrix,
                    "Expected similarity_matrix to be set",
                )
        print(f"Test complete. Deleting file {csv_fpath}...")
Ejemplo n.º 19
0
    def __call__(
        self,
        molecule_set_configs,
        fingerprint_type=None,
        fingerprint_params=None,
        similarity_measure=None,
        subsample_subset_size=0.01,
        optim_algo='max_min',
        show_top=0,
        only_metric=True,
    ):
        """
        Calculate the correlation in the properties of molecules in set
        and their nearest and furthest neighbors using different
        fingerprints / similarity measure choices. Choose the best fingerprint
        and similarity measure pair (called measure choice for brevity)
        based on an optimization strategy.

        Args:
            molecule_set_configs (dict): All configurations (except
                fingerprint_type, fingerprint_params and similarity_measure)
                needed to form the moleculeSet.
            fingerprint_type (str): Label to indicate which fingerprint to
                use. If supplied, fingerprint is fixed and optimization
                carried out over similarity measures. Use None to indicate
                that optimization needs to be carried out over fingerprints.
                Default is None.
            fingerprint_params (dict): Hyper-parameters for fingerprints.
                Passed to the MoleculeSet constructor. If None is passed,
                set to empty dictionary before passing to MoleculeSet.
            similarity_measure (str): Label to indicate which similarity
                measure to use. If supplied, similarity measure is fixed
                and optimization carried out over similarity measures.
                Use None to indicate that optimization needs to be carried
                out over fingerprints. Default is None.
            subsample_subset_size (float): Fraction of molecule_set to
                subsample. This is separate from the sample_ratio parameter
                used when creating a moleculeSet since it is recommended
                to have an more aggressive subsampling strategy for this task
                due to the combinatorial explosion of looking at multiple
                fingerprints and similarity measures. Default is 0.01.
            optim_algo (str): Label to indicate the optimization algorithm
                chosen. Options are:
                'max': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar).
                    This is the default.
                'min': The measure choice which minimizes the absolute value
                    of property correlation
                    between furthest neighbors (most dissimilar).
                'max_min': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar)
                    and minimizes he absolute value of property correlation
                    between furthest neighbors (most dissimilar).
                    This is the default.
            show_top (int): Number of top performing measures to show in plot.
                If 0, no plots are generated and the top performer is returned.
            only_metric (bool): If True only similarity measures satisfying
                the metricity property
                (i.e. can be converted to distance metrics) are selected.

        Returns:
            (NamedTuple): Top performer with fields:
                fingerprint_type (str): Label for fingerprint type
               similarity_measure (str): Label for similarity measure
               nearest_neighbor_correlation (float): Correlation of property
                   of molecule and its nearest neighbor.
               furthest_neighbor_correlation (float): Correlation of property
                   of molecule and its furthest neighbor.
               score_ (float): Overall score based on optimization strategy.
                   More is better.

        """
        print(f'Using subsample size {subsample_subset_size} for '
              f'measure search')
        trial_ = namedtuple('trial_', [
            'fingerprint_type', 'similarity_measure',
            'nearest_neighbor_correlation', 'furthest_neighbor_correlation',
            'score_'
        ])
        if fingerprint_type is None:
            all_fingerprint_types = Descriptor.get_supported_fprints()
            fingerprint_params = None
        else:
            all_fingerprint_types = [fingerprint_type]
        if similarity_measure is None:
            if only_metric:
                print('Only trying measures with valid distance metrics')
            all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
        else:
            all_similarity_measures = [similarity_measure]
        is_verbose = molecule_set_configs.get("is_verbose", False)
        all_scores = []
        if fingerprint_params is None:
            fingerprint_params = {}
        for similarity_measure in all_similarity_measures:
            if only_metric and not SimilarityMeasure(
                    metric=similarity_measure).is_distance_metric():
                continue
            if is_verbose:
                print(f'Trying {similarity_measure} similarity')
            for fingerprint_type in all_fingerprint_types:
                if is_verbose:
                    print(f'Trying {fingerprint_type} fingerprint')
                try:
                    molecule_set = MoleculeSet(
                        molecule_database_src=molecule_set_configs[
                            'molecule_database_src'],
                        molecule_database_src_type=molecule_set_configs[
                            'molecule_database_src_type'],
                        similarity_measure=similarity_measure,
                        fingerprint_type=fingerprint_type,
                        fingerprint_params=fingerprint_params,
                        is_verbose=is_verbose,
                        n_threads=molecule_set_configs.get('n_threads', 1),
                        sampling_ratio=subsample_subset_size)
                except (InvalidConfigurationError, ValueError) as e:
                    if is_verbose:
                        print(
                            f'Could not try {fingerprint_type} with '
                            f'similarity measure {similarity_measure} due to '
                            f'{e}')
                    continue
                nearest_corr, nearest_p_val = self.prop_var_w_similarity. \
                    get_property_correlations_in_most_similar(
                        molecule_set)
                furthest_corr, furthest_p_val = self.prop_var_w_similarity. \
                    get_property_correlations_in_most_dissimilar(
                        molecule_set)
                if optim_algo == 'max_min':
                    score_ = nearest_corr - abs(furthest_corr)
                elif optim_algo == 'max':
                    score_ = nearest_corr
                elif optim_algo == 'min':
                    score_ = -abs(furthest_corr)
                else:
                    raise InvalidConfigurationError(f'{optim_algo} '
                                                    f'not implemented')
                all_scores.append(
                    trial_(fingerprint_type=fingerprint_type,
                           similarity_measure=similarity_measure,
                           nearest_neighbor_correlation=nearest_corr,
                           furthest_neighbor_correlation=furthest_corr,
                           score_=score_))
        all_scores.sort(key=lambda x: x[-1], reverse=True)
        if self.log_fpath is not None:
            print('Writing to ', self.log_fpath)
            log_data = [trial._asdict() for trial in all_scores]
            with open(self.log_fpath, "w") as fp:
                json.dump(log_data, fp)

        if show_top > 0:
            top_performers = all_scores[:show_top]
            all_nearest_neighbor_correlations = []
            all_furthest_neighbor_correlations = []
            top_scores = []
            all_measures = []
            for trial in top_performers:
                all_nearest_neighbor_correlations.append(
                    trial.nearest_neighbor_correlation)
                all_furthest_neighbor_correlations.append(
                    trial.furthest_neighbor_correlation)
                top_scores.append(trial.score_)
                all_measures.append(
                    Descriptor.shorten_label(trial.fingerprint_type) + '\n' +
                    trial.similarity_measure)
            bar_heights = np.array([
                top_scores, all_nearest_neighbor_correlations,
                all_furthest_neighbor_correlations
            ])
            colors = self.plot_settings.pop('colors')
            plot_multiple_barchart(x=[_ for _ in range(len(top_performers))],
                                   heights=bar_heights,
                                   legend_labels=[
                                       'Overall scores',
                                       'Nearest neighbor property '
                                       'correlation',
                                       'Furthest neighbor property '
                                       'correlations'
                                   ],
                                   colors=colors,
                                   xtick_labels=all_measures,
                                   ylabel='Value',
                                   xlabel='Measure',
                                   **self.plot_settings)

        return all_scores[0]
Ejemplo n.º 20
0
    def _initialize_molecule_set(self, molecule_set_configs):
        """Initialize molecule_set attribute to a MoleculeSet object
        based on parameters in the config file.

        If the similarity measure and/or the fingerprint type is absent
        from the configuration or set to 'determine', a MeasureSearch is
        run first to choose the missing piece(s), and the chosen values
        are then used to build the final MoleculeSet.

        Args:
            molecule_set_configs (dict): Configurations for initializing
                the MoleculeSet object. Keys read here:
                    'molecule_database' (required),
                    'molecule_database_source_type' (required),
                    'is_verbose' (default False),
                    'n_workers' (default 1),
                    'similarity_measure' (default 'determine'),
                    'fingerprint_type' (default 'determine'),
                    'fingerprint_params' (default {}),
                    'measure_id_subsample' (default 0.05),
                    'only_valid_dist' (default True when the similarity
                        measure is being determined, False otherwise),
                    'sampling_ratio' (default 1.).

        Raises:
            InvalidConfigurationError: If 'molecule_database' or
                'molecule_database_source_type' is not set.
        """
        molecule_database_src = molecule_set_configs.get(
            "molecule_database",
            None,
        )
        database_src_type = molecule_set_configs.get(
            "molecule_database_source_type", None)
        if molecule_database_src is None or database_src_type is None:
            print("molecule_database fields not set in config file")
            print(f"molecule_database: {molecule_database_src}")
            print(f"molecule_database_source_type: {database_src_type}")
            raise InvalidConfigurationError(
                "molecule_database fields not set in config file")
        is_verbose = molecule_set_configs.get("is_verbose", False)
        n_threads = molecule_set_configs.get("n_workers", 1)
        similarity_measure = molecule_set_configs.get("similarity_measure",
                                                      'determine')
        fingerprint_type = molecule_set_configs.get('fingerprint_type',
                                                    'determine')
        fingerprint_params = molecule_set_configs.get('fingerprint_params', {})

        determine_measure = similarity_measure == 'determine'
        determine_fingerprint = fingerprint_type == 'determine'
        if determine_measure or determine_fingerprint:
            subsample_subset_size = molecule_set_configs.get(
                'measure_id_subsample', 0.05)
            if is_verbose:
                print('Determining best fingerprint_type / similarity_measure')
            measure_search = MeasureSearch(correlation_type='pearson')
            # Always bind only_valid_dist: it used to be set only when the
            # similarity measure was being determined, so searching for the
            # fingerprint alone crashed with an UnboundLocalError.
            # When the measure is fixed by the user the metric filter is
            # off by default so that the requested measure is not filtered
            # out of the search.
            only_valid_dist = molecule_set_configs.get(
                'only_valid_dist', determine_measure)
            if determine_measure:
                # None asks the search to try the available measures.
                similarity_measure = None
            if determine_fingerprint:
                # None asks the search to try the supported fingerprints;
                # user-supplied params are dropped since they are tied to
                # one specific fingerprint type.
                fingerprint_type = None
                fingerprint_params = {}
            measure_search_molset_configs = {
                'molecule_database_src': molecule_database_src,
                'molecule_database_src_type': database_src_type,
                'is_verbose': is_verbose,
                'n_threads': n_threads,
            }

            best_measure = measure_search(
                molecule_set_configs=measure_search_molset_configs,
                similarity_measure=similarity_measure,
                fingerprint_type=fingerprint_type,
                fingerprint_params=fingerprint_params,
                subsample_subset_size=subsample_subset_size,
                show_top=5,
                only_metric=only_valid_dist)
            similarity_measure = best_measure.similarity_measure
            fingerprint_type = best_measure.fingerprint_type
            print(f'Chosen measure: {fingerprint_type} '
                  f'and {similarity_measure}.')

        sampling_ratio = molecule_set_configs.get("sampling_ratio", 1.)
        print(f'Choosing sampling ratio of {sampling_ratio} for tasks')
        self.molecule_set = MoleculeSet(
            molecule_database_src=molecule_database_src,
            molecule_database_src_type=database_src_type,
            similarity_measure=similarity_measure,
            fingerprint_type=fingerprint_type,
            fingerprint_params=fingerprint_params,
            is_verbose=is_verbose,
            n_threads=n_threads,
            sampling_ratio=sampling_ratio,
        )