def test_set_molecule_from_smiles(self):
    """Check that a Molecule can be populated from a SMILES string.

    After ``_set_molecule_from_smiles`` is called, ``mol_text`` should
    equal the SMILES string and ``mol_graph`` should be a non-None
    ``rdkit.Chem.rdchem.Mol`` instance.
    """
    smiles = "CC"
    molecule = Molecule()
    molecule._set_molecule_from_smiles(smiles)

    self.assertEqual(
        molecule.mol_text,
        smiles,
        msg="Expected mol_text attribute to be set to smiles string",
    )
    self.assertIsNotNone(
        molecule.mol_graph,
        msg="Expected mol_graph attribute to be set from the smiles",
    )
    self.assertIsInstance(
        molecule.mol_graph,
        rdkit.Chem.rdchem.Mol,
        msg="Expected initialized mol_graph to "
            "be rdkit.Chem.rdchem.Mol object",
    )
def test_missing_smiles(self):
    """Missing smiles strings should raise a LoadingError.

    An empty list is passed in place of a SMILES string. The Molecule is
    built outside the ``assertRaises`` block so that only the
    ``_set_molecule_from_smiles`` call can satisfy the assertion.
    """
    test_molecule = Molecule()
    with self.assertRaises(LoadingError):
        test_molecule._set_molecule_from_smiles([])
def test_invalid_pdb(self):
    """Invalid PDB files should raise a LoadingError.

    An empty file named ``blank.pdb`` is created in the working directory,
    loaded through ``_set_molecule_from_pdb`` and removed again, whether or
    not the assertion succeeds.
    """
    pdb_fpath = "blank.pdb"
    Path(pdb_fpath).touch()
    try:
        test_molecule = Molecule()
        # Only the load call is allowed to satisfy the assertion.
        with self.assertRaises(LoadingError):
            test_molecule._set_molecule_from_pdb(pdb_fpath)
    finally:
        # Do not leave the scratch file behind when the test fails.
        remove(pdb_fpath)
def test_match_fprint_error(self):
    """Matching a fingerprint from a reference without a usable graph
    should raise a ValueError.
    """
    # Build the reference molecule the usual way, with a morgan fingerprint.
    reference = Molecule(mol_smiles="C")
    reference.set_descriptor(fingerprint_type="morgan_fingerprint")
    # Overwrite its molecular graph with an empty array.
    reference.mol_graph = np.array([])

    target = Molecule()
    with self.assertRaises(ValueError):
        target.match_fingerprint_from(reference)
def test_molecule_created_w_attributes(self):
    """Check a Molecule built from a name, a scalar property value and a
    list-valued descriptor.

    The name and property value should be stored as given, and the
    descriptor should be exposed as a numpy array labelled "arbitrary".
    """
    molecule = Molecule(
        mol_text="test_molecule",
        mol_property_val=42,
        mol_descriptor_val=[1, 2, 3],
    )

    self.assertEqual(
        molecule.mol_text,
        "test_molecule",
        msg="Expected mol_text attribute to be set.",
    )
    self.assertEqual(
        molecule.mol_property_val,
        42,
        msg="Expected mol_property_val to be set.",
    )
    self.assertIsInstance(
        molecule.descriptor.to_numpy(),
        np.ndarray,
        msg="Expected descriptor.to_numpy()to be np.ndarray",
    )
    values_match = np.all(
        molecule.descriptor.to_numpy() == np.array([1, 2, 3]))
    self.assertTrue(
        values_match,
        msg="Expected descriptor.to_numpy() to be array[1, 2, 3]",
    )
    self.assertEqual(
        molecule.descriptor.label_,
        "arbitrary",
        msg="Expected descriptor.label to be arbitrary since "
            "it was initialized by list/array",
    )
def _extract_configs(self):
    """Copy task settings from ``self.configs`` onto the instance.

    Reads the following keys from ``self.configs``:

    * ``target_molecule_smiles`` / ``target_molecule_src``: used to build
      ``self.target_molecule``; the SMILES entry takes precedence.
    * ``log_file_path`` (default None): stored as ``self.log_fpath``; its
      parent directory is created if the path has one.
    * ``similarity_plot_settings`` (default ``{}``): stored as
      ``self.plot_settings``.
    * ``n_hits`` (default 1): stored as ``self.n_hits``.
    * ``draw_molecules`` (default False): stored as ``self.draw_molecules``.

    Raises:
        IOError: If neither target molecule key holds a truthy value.
    """
    target_molecule_smiles = self.configs.get("target_molecule_smiles")
    target_molecule_src = self.configs.get("target_molecule_src")
    if target_molecule_smiles:
        self.target_molecule = Molecule(mol_smiles=target_molecule_smiles)
    elif target_molecule_src:
        self.target_molecule = Molecule(mol_src=target_molecule_src)
    else:
        raise IOError("Target molecule source is not specified")

    self.log_fpath = self.configs.get("log_file_path", None)
    if self.log_fpath is not None:
        log_dir = dirname(self.log_fpath)
        # dirname() returns "" for a bare filename (log in the current
        # directory); makedirs("") would raise FileNotFoundError.
        if log_dir:
            makedirs(log_dir, exist_ok=True)

    self.plot_settings = self.configs.get("similarity_plot_settings", {})
    self.n_hits = self.configs.get("n_hits", 1)
    self.draw_molecules = self.configs.get("draw_molecules", False)
def test_mol_mol_similarity_w_morgan_tanimoto(self):
    """Check that the tanimoto similarity between the morgan fingerprints
    of two different Molecules lies within [0, 1].
    """
    prepared = []
    for smiles in ("CCCCCCCCC", "CCCCCCCCCCC"):
        mol = Molecule(mol_smiles=smiles)
        mol.set_descriptor(fingerprint_type="morgan_fingerprint")
        prepared.append(mol)
    first_mol, second_mol = prepared

    measure = SimilarityMeasure(metric="tanimoto")
    similarity = first_mol.get_similarity_to(
        second_mol, similarity_measure=measure)

    self.assertGreaterEqual(
        similarity,
        0.0,
        msg="Expected tanimoto similarity to be >= 0.",
    )
    self.assertLessEqual(
        similarity,
        1.0,
        msg="Expected tanimoto similarity to be <= 1.",
    )
def is_present(self, target_molecule):
    """Determine whether a target molecule is present in the molecule set.

    Every molecule in ``self.molecule_database`` is compared with the
    target using ``Molecule.is_same``; the search stops at the first match.

    Args:
        target_molecule (AIMSim.chemical_datastructures.Molecule): Target
            molecule to search.

    Returns:
        (bool): If the molecule is present in the molecule set or not.
            False for an empty molecule set.
    """
    # is_same is called on the class, so no throwaway Molecule instance
    # has to be constructed for every comparison.
    return any(
        Molecule.is_same(set_molecule, target_molecule)
        for set_molecule in self.molecule_database
    )
def test_molecule_draw(self):
    """Check that ``Molecule.draw`` writes an image file.

    The molecule is loaded from a SMILES string, drawn to
    ``<smiles>.png`` in the working directory, and the image is deleted
    once its existence has been asserted.
    """
    smiles = "CC"
    molecule = Molecule()
    molecule._set_molecule_from_smiles(smiles)

    test_image_fpath = f"{smiles}.png"
    molecule.draw(fpath=test_image_fpath)
    image_written = os.path.isfile(test_image_fpath)
    self.assertTrue(image_written)

    try:
        print(f"Deleting {test_image_fpath}")
        remove(test_image_fpath)
    except FileNotFoundError:
        print(f"Could not find {test_image_fpath}")
def test_molecule_created_with_no_attributes(self):
    """Check the state of a Molecule constructed without any arguments.

    ``mol_graph``, ``mol_text`` and ``mol_property_val`` should all be
    None and the descriptor should report itself as not initialized.
    """
    empty_molecule = Molecule()

    none_expectations = (
        ("mol_graph",
         "Expected attribute mol_graph to be None for uninitialized Molecule"),
        ("mol_text",
         "Expected attribute mol_text to be None for uninitialized Molecule"),
        ("mol_property_val",
         "Expected attribute mol_property_val to be None "
         "for uninitialized Molecule"),
    )
    for attribute, failure_message in none_expectations:
        self.assertIsNone(getattr(empty_molecule, attribute), failure_message)

    self.assertFalse(
        empty_molecule.descriptor.check_init(),
        msg="Expected molecule.descriptor to be unitialized "
            "for uninitialized Molecule",
    )
def test_mol_smiles_loadingerror(self):
    """Constructing a Molecule from an unparsable ``mol_smiles`` value
    should raise a LoadingError.
    """
    self.assertRaises(LoadingError, Molecule, mol_smiles="XYZ")
def _get_molecule_database(self, molecule_database_src,
                           molecule_database_src_type):
    """Load molecular database and return it.

    Optionally return features if found in excel / csv file. Entries that
    raise a LoadingError while being converted to a Molecule are skipped
    (and reported when ``self.is_verbose`` is set).

    Args:
        molecule_database_src (str): Source of molecular information.
            Can be a folder or a filepath.
            Folder: all .pdb files in the folder are read.
            Text file: each non-blank line is split on whitespace into a
            SMILES string (column 1) and an optional property value
            (column 2, parsed as float); further columns are ignored.
            Excel / csv file: SMILES strings are read from the
            'feature_smiles' column and optional names from
            'feature_name'. All other columns whose name starts with
            'feature_' are returned as features, and the first column
            whose name starts with 'response_' supplies property values.
        molecule_database_src_type (str): Type of source. Can be
            ['folder', 'directory', 'text', 'excel', 'csv']
            (case-insensitive).

    Returns:
        (list(Molecule), np.ndarray or None)
            Returns a tuple. First element of tuple is the
            molecule_database. Second element is array of features of
            shape (n_rows_in_file, n_features) or None if None found.

    Raises:
        FileNotFoundError: If molecule_database_src_type is not one of the
            recognised source types.
        UserWarning: If no molecule could be loaded from the source.
    """
    if not self.is_verbose:
        RDLogger.DisableLog('rdApp.*')

    src_type = molecule_database_src_type.lower()
    molecule_database = []
    features = None

    if src_type in ["folder", "directory"]:
        if self.is_verbose:
            print(f"Searching for *.pdb files in {molecule_database_src}")
        for molfile in glob(os.path.join(molecule_database_src, "*.pdb")):
            if self.is_verbose:
                print(f"Loading {molfile}")
            try:
                molecule_database.append(Molecule(mol_src=molfile))
            except LoadingError:
                if self.is_verbose:
                    print(f"{molfile} could not be imported. Skipping")

    elif src_type == "text":
        if self.is_verbose:
            print(f"Reading SMILES strings from {molecule_database_src}")
        with open(molecule_database_src, "r") as fp:
            smiles_data = fp.readlines()
        for count, line in enumerate(smiles_data):
            # Assumes that the first column contains the smiles string
            line_fields = line.split()
            if not line_fields:
                # Blank line (e.g. trailing newline at end of file): there
                # is no SMILES column to read, so skip it.
                continue
            smile = line_fields[0]
            mol_property_val = None
            if len(line_fields) > 1:
                mol_property_val = float(line_fields[1])
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({count + 1}/"
                      f"{len(smiles_data)})")
            mol_text = smile
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")

    elif src_type in ["excel", "csv"]:
        if self.is_verbose:
            print(f"Reading molecules from {molecule_database_src}")
        database_df = (pd.read_excel(molecule_database_src,
                                     engine="openpyxl")
                       if src_type == "excel"
                       else pd.read_csv(molecule_database_src))
        # expects feature columns to be prefixed with feature_
        # e.g. feature_smiles
        feature_cols = [
            column for column in database_df.columns
            if column.split("_")[0] == "feature"
        ]
        database_feature_df = database_df[feature_cols]
        mol_names, mol_smiles, responses = None, None, None
        if "feature_name" in feature_cols:
            mol_names = database_feature_df["feature_name"].values.flatten()
            database_feature_df = database_feature_df.drop(
                ["feature_name"], axis=1)
        if "feature_smiles" in feature_cols:
            mol_smiles = database_df["feature_smiles"].values.flatten()
            database_feature_df = database_feature_df.drop(
                ["feature_smiles"], axis=1)
        response_col = [
            column for column in database_df.columns
            if column.split("_")[0] == "response"
        ]
        if len(response_col) > 0:
            # Currently handles one response: use the first response
            # column only. Flattening several columns together would
            # interleave them and misalign values with the molecules.
            responses = database_df[response_col[0]].values.flatten()
        # NOTE(review): when the file has no 'feature_smiles' column,
        # mol_smiles is still None here and the loop below fails with a
        # TypeError -- consider raising a descriptive error instead.
        for mol_id, smile in enumerate(mol_smiles):
            if self.is_verbose:
                print(f"Processing {smile} "
                      f"({mol_id + 1}/"
                      f"{database_df['feature_smiles'].values.size})")
            mol_text = mol_names[mol_id] if mol_names is not None else smile
            mol_property_val = (responses[mol_id]
                                if responses is not None else None)
            try:
                molecule_database.append(
                    Molecule(
                        mol_smiles=smile,
                        mol_text=mol_text,
                        mol_property_val=mol_property_val,
                    ))
            except LoadingError:
                if self.is_verbose:
                    print(f"{smile} could not be imported. Skipping")
        if len(database_feature_df.columns) > 0:
            features = database_feature_df.values

    else:
        raise FileNotFoundError(
            f"{molecule_database_src} could not be found. "
            f"Please enter valid folder name or path of a "
            f"text/excel/csv")

    if len(molecule_database) == 0:
        raise UserWarning("No molecular files found in the location!")
    return molecule_database, features
def test_missing_pdb(self):
    """Missing PDB files should raise a LoadingError.

    The path ``missing.pdb`` is expected not to exist in the working
    directory. The Molecule is built outside the ``assertRaises`` block so
    that only the ``_set_molecule_from_pdb`` call can satisfy the
    assertion.
    """
    test_molecule = Molecule()
    with self.assertRaises(LoadingError):
        test_molecule._set_molecule_from_pdb("missing.pdb")
def test_set_molecule_from_file(self):
    """Test to create Molecule object by reading the contents of a file.

    Case #1: text file holding a SMILES string followed by extra tokens.
    Case #2: PDB file written by RDKit from the same SMILES string.

    Both scratch files are created in the working directory and are
    removed even when an assertion fails.
    """
    test_smiles = "CC"

    # Case 1: text file
    test_text_molecule = Molecule()
    text_fpath = "test_mol_src.txt"
    print(f"Creating file {text_fpath}...")
    with open(text_fpath, "w") as fp:
        fp.write(test_smiles + " garbage vals")
    try:
        test_text_molecule._set_molecule_from_file(text_fpath)
        self.assertEqual(
            test_text_molecule.mol_text,
            test_smiles,
            "Expected mol_text attribute to be set "
            "to smiles string when loading from txt file",
        )
        self.assertIsNotNone(
            test_text_molecule.mol_graph,
            "Expected mol_graph attribute to be set "
            "from the smiles when loading from txt file",
        )
        self.assertIsInstance(
            test_text_molecule.mol_graph,
            rdkit.Chem.rdchem.Mol,
            "Expected initialized mol_graph to "
            "be rdkit.Chem.rdchem.Mol object "
            "when loading from txt file",
        )
        print(f"Test complete. Deleting file {text_fpath}...")
    finally:
        # Clean up regardless of the outcome of the assertions above.
        remove(text_fpath)

    # Case 2: pdb file
    test_pdb_molecule = Molecule()
    test_pdb_filename = "test_mol_src.pdb"
    print(f"Creating file {test_pdb_filename}...")
    test_mol = MolFromSmiles(test_smiles)
    MolToPDBFile(test_mol, test_pdb_filename)
    try:
        test_pdb_molecule._set_molecule_from_file(test_pdb_filename)
        self.assertEqual(
            test_pdb_molecule.mol_text,
            os.path.basename(test_pdb_filename).split('.')[0],
            "Expected mol_text attribute to be set "
            "to name of file when loading from pdb file",
        )
        self.assertIsNotNone(
            test_pdb_molecule.mol_graph,
            "Expected mol_graph attribute to be set "
            "from the smiles when loading from pdb file",
        )
        self.assertIsInstance(
            test_pdb_molecule.mol_graph,
            rdkit.Chem.rdchem.Mol,
            "Expected initialized mol_graph to "
            "be rdkit.Chem.rdchem.Mol object "
            "when loading from pdb file",
        )
        print(f"Test complete. Deleting file {test_pdb_filename}...")
    finally:
        # Clean up regardless of the outcome of the assertions above.
        remove(test_pdb_filename)
def test_get_property_value(self):
    """``get_mol_property_val`` should return the property value the
    Molecule was constructed with.
    """
    expected_value = 10
    molecule = Molecule(mol_text="C", mol_property_val=expected_value)
    retrieved_value = molecule.get_mol_property_val()
    self.assertEqual(retrieved_value, expected_value)
def test_get_name(self):
    """``get_name`` should return the ``mol_text`` the Molecule was
    constructed with.
    """
    molecule = Molecule(mol_text="C")
    retrieved_name = molecule.get_name()
    self.assertEqual(retrieved_name, "C")
def test_is_same(self):
    """``Molecule.is_same`` should report two Molecules constructed with
    the same ``mol_text`` as identical.
    """
    first = Molecule(mol_text="C")
    second = Molecule(mol_text="C")
    identical = Molecule.is_same(first, second)
    self.assertTrue(identical)
def test_invalid_smiles(self):
    """Invalid SMILES strings should raise a LoadingError.

    The Molecule is built outside the ``assertRaises`` block so that only
    the ``_set_molecule_from_smiles`` call can satisfy the assertion.
    """
    test_molecule = Molecule()
    with self.assertRaises(LoadingError):
        test_molecule._set_molecule_from_smiles("XYZ")
def test_mol_src_txt_loadingerror(self):
    """Constructing a Molecule from a ``mol_src`` path that cannot be
    loaded should raise a LoadingError.
    """
    self.assertRaises(LoadingError, Molecule, mol_src='non-existent file.txt')
def test_molecule_graph_similar_to_itself_morgan_dice(self):
    """Check that two Molecules built from the same SMILES string have a
    dice similarity of 1.0 between their morgan fingerprints.
    """
    smiles = "CCO"

    original = Molecule()
    original._set_molecule_from_smiles(smiles)
    duplicate = Molecule()
    duplicate._set_molecule_from_smiles(smiles)

    for molecule in (original, duplicate):
        molecule.set_descriptor(fingerprint_type="morgan_fingerprint")

    dice_measure = SimilarityMeasure(metric="dice")
    dice_similarity = original.get_similarity_to(
        duplicate, similarity_measure=dice_measure)

    self.assertEqual(
        dice_similarity,
        1.0,
        msg="Expected dice similarity to be 1 when comparing "
            "molecule graph to itself",
    )