def test_filter_sets_output_oeb_attribute(tmp_path):
    """Check that `filter` exposes its output path as `filter_output_oeb`."""
    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_always, filtered_oeb)

    # The attribute must equal the exact path handed to `filter`.
    assert pipeline.filter_output_oeb == filtered_oeb
def test_fingerprint_retrieval_raises_exceptions_with_bad_data():
    """Check that `get_fingerprint_from_mol` raises ValueError on bad tags."""
    molecule = oechem.OEMol()

    # Case 1: the molecule carries no fingerprint length tag at all.
    with pytest.raises(ValueError):
        DancePipeline.get_fingerprint_from_mol(molecule)

    # Case 2: a length of 4 is declared, but none of the fingerprint values
    # have been stored on the molecule.
    molecule.SetIntData(DancePipeline.FINGERPRINT_LENGTH_NAME, 4)
    with pytest.raises(ValueError):
        DancePipeline.get_fingerprint_from_mol(molecule)
def test_filters_molecules_with_relevance_function(tmp_path):
    """Check that `filter` keeps only molecules accepted by the relevance
    function (here `_relevant_if_contains_nitrogen`).
    """
    # The SMILES expected to remain in the output after filtering.
    nitrogen_smiles = ["N", "N#N", "C#N"]

    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_if_contains_nitrogen, filtered_oeb)

    assert pipeline.num_molecules == len(nitrogen_smiles)
    expected_smiles = utils.get_list_of_canonical_isomeric_smiles(
        nitrogen_smiles)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb, expected_smiles)
def test_filters_from_smiles_database(tmp_path):
    """Check that a SMILES database passes through an accept-all filter."""
    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_always, filtered_oeb)

    # With `_relevant_always` as the relevance function, every input molecule
    # should appear in the output OEB, and `num_molecules` should match the
    # number of inputs.
    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb,
                                         TEST_CANONICAL_ISOMERIC_SMILES)
def test_select_with_longer_fingerprints(pipeline_test_files):
    """Check sorting and selection when fingerprints have three components."""
    (smiles_file, filter_output_oeb, fingerprint_output_oeb,
     smiles_dataset_file, sorted_by_fingerprint_oeb) = pipeline_test_files
    smiles_file.write_text("\n".join(TEST_SMILES))

    def _padded_fingerprint(mol):
        # Fingerprints are (1, 1, num_atoms): only the last component differs,
        # which exercises sorting on fingerprints longer than one element.
        return (1, 1, mol.NumAtoms())

    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(_padded_fingerprint, fingerprint_output_oeb)
    pipeline.select(3,
                    "SMILES",
                    smiles_dataset_file,
                    sorted_by_fingerprint_oeb,
                    in_memory_sorting_threshold=3)

    # The molecules must come out sorted by fingerprint. Two orderings are
    # accepted; they differ only in the relative order of "N#N" and "C#N".
    sorted_smiles = []
    for mol in utils.get_mols_from_oeb(sorted_by_fingerprint_oeb):
        sorted_smiles.append(oechem.OEMolToSmiles(mol))
    acceptable_orderings = [
        utils.get_list_of_canonical_isomeric_smiles(ordering)
        for ordering in (["N", "N#N", "C#N", "O=C=O"],
                         ["N", "C#N", "N#N", "O=C=O"])
    ]
    assert sorted_smiles in acceptable_orderings

    # The selected dataset must contain exactly these molecules.
    expected_selection = utils.get_list_of_canonical_isomeric_smiles(
        ["N", "O=C=O"])
    utils.assert_smiles_in_file_are_equal(smiles_dataset_file,
                                          expected_selection)
def _pipeline_executed_until_filter(pipeline_test_files):
    """Builds a pipeline and runs it through the filter step.

    The SMILES input file is populated with TEST_SMILES and filtered with
    `_relevant_always`.

    Returns a tuple of (pipeline, smiles_file, filter_output_oeb,
    fingerprint_output_oeb).
    """
    # The last two files are part of the incoming tuple but are not needed
    # here; they are unpacked only to keep the five-element shape explicit.
    (smiles_file, filter_output_oeb, fingerprint_output_oeb,
     _smiles_dataset_file, _sorted_by_fingerprint_oeb) = pipeline_test_files

    smiles_file.write_text("\n".join(TEST_SMILES))
    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)

    return (pipeline, smiles_file, filter_output_oeb, fingerprint_output_oeb)
def test_retrieves_correct_fingerprint_from_mol():
    """Check that a fingerprint stored as molecule data is read back intact."""
    stored_values = [2, 1, 8, 7]

    molecule = oechem.OEMol()
    molecule.SetIntData(DancePipeline.FINGERPRINT_LENGTH_NAME, 4)
    # Each component is stored as double data under "<VALUE_NAME>_<index>".
    for index, value in enumerate(stored_values):
        tag = f"{DancePipeline.FINGERPRINT_VALUE_NAME}_{index}"
        molecule.SetDoubleData(tag, value)

    retrieved = DancePipeline.get_fingerprint_from_mol(molecule)
    assert retrieved == (2, 1, 8, 7)
def test_filters_from_oeb_database(tmp_path):
    """Check that an OEB database passes through an accept-all filter."""
    # Build the input OEB file by writing one molecule per test SMILES.
    input_oeb = tmp_path / "molecules.oeb"
    writer = oechem.oemolostream(str(input_oeb))
    for smiles in TEST_SMILES:
        oechem.OEWriteMolecule(writer, utils.oemol_from_smiles(smiles))
    writer.close()

    filtered_oeb = tmp_path / "filter_output.oeb"
    pipeline = DancePipeline("OEB", input_oeb)
    pipeline.filter(_relevant_always, filtered_oeb)

    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb,
                                         TEST_CANONICAL_ISOMERIC_SMILES)
def test_filters_from_mol2_dir_database(tmp_path):
    """Check that a directory of mol2 files passes through an accept-all
    filter.
    """
    # Write each test molecule to its own "<index>.mol2" file.
    mol2_directory = tmp_path / "molecules"
    mol2_directory.mkdir()
    for index, molecule in enumerate(TEST_OEMOLS):
        writer = oechem.oemolostream(str(mol2_directory / f"{index}.mol2"))
        oechem.OEWriteMolecule(writer, molecule)
        writer.close()

    filtered_oeb = tmp_path / "filter_output.oeb"
    pipeline = DancePipeline("MOL2_DIR", mol2_directory)
    pipeline.filter(_relevant_always, filtered_oeb)

    # The comparison is done on SMILES even though the input was mol2, since
    # there is no direct way here to compare two OEMols for equality.
    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb,
                                         TEST_CANONICAL_ISOMERIC_SMILES)
def test_select_on_larger_dataset(pipeline_test_files):
    """Check sorting and selection on 30 molecules given in random order."""
    (smiles_file, filter_output_oeb, fingerprint_output_oeb,
     smiles_dataset_file, sorted_by_fingerprint_oeb) = pipeline_test_files

    # Repeats are fine because the pipeline does not deduplicate molecules.
    # Shuffling checks that the result does not depend on input ordering.
    input_smiles = ["N", "O=C=O", "C#N"] * 10
    random.shuffle(input_smiles)
    smiles_file.write_text("\n".join(input_smiles))

    def _atom_count_fingerprint(mol):
        # Single-component fingerprint: the number of atoms.
        return (mol.NumAtoms(), )

    # Run every pipeline step.
    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint,
                                fingerprint_output_oeb)
    pipeline.select(10,
                    "SMILES",
                    smiles_dataset_file,
                    sorted_by_fingerprint_oeb,
                    in_memory_sorting_threshold=7)

    expected_sorted = utils.get_list_of_canonical_isomeric_smiles(
        ["N"] * 10 + ["C#N"] * 10 + ["O=C=O"] * 10)
    utils.assert_ordered_smiles_in_oeb_are_equal(sorted_by_fingerprint_oeb,
                                                 expected_sorted)

    expected_selected = utils.get_list_of_canonical_isomeric_smiles(
        ["N", "C#N", "O=C=O"])
    utils.assert_ordered_smiles_in_file_are_equal(smiles_dataset_file,
                                                  expected_selected)
def _pipeline_executed_until_fingerprint(pipeline_test_files):
    """Builds a pipeline and runs it through the assign_fingerprint step.

    The SMILES input file is populated with TEST_SMILES, filtered with
    `_relevant_always`, and each molecule is given a one-element fingerprint
    holding its number of atoms.

    Returns a tuple of (pipeline, smiles_file, filter_output_oeb,
    fingerprint_output_oeb, smiles_dataset_file, sorted_by_fingerprint_oeb).
    """
    (smiles_file, filter_output_oeb, fingerprint_output_oeb,
     smiles_dataset_file, sorted_by_fingerprint_oeb) = pipeline_test_files

    def _atom_count_fingerprint(mol):
        return (mol.NumAtoms(), )

    smiles_file.write_text("\n".join(TEST_SMILES))
    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint,
                                fingerprint_output_oeb)

    return (pipeline, smiles_file, filter_output_oeb, fingerprint_output_oeb,
            smiles_dataset_file, sorted_by_fingerprint_oeb)
def _plot_cdf(axis, values, title, xlabel, num_fingerprints):
    """Prints `values` and draws them on `axis` as a cumulative histogram.

    `values` is used both as the data and as the bin edges, matching the
    original plotting behavior.
    """
    pprint(values)
    axis.set_title(title)
    axis.hist(values, bins=values, cumulative=True)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Molecules")
    # NOTE(review): the fixed ticks only apply when there are exactly 20
    # fingerprints -- presumably tuned for a 20-molecule dataset; confirm.
    if num_fingerprints == 20:
        axis.set_yticks(range(0, 21, 2))
    axis.grid(True)


def main():
    """Parses arguments and prints out the analysis.

    Reads the molecules from the OEB file given by `--dataset-oeb`, extracts
    each molecule's fingerprint, prints the sorted values, plots two
    cumulative histograms (first and second fingerprint components), saves the
    figure to "results/fingerprint_cdf.png", and displays it.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dataset-oeb", default="dataset.oeb")
    args = parser.parse_args()

    # Retrieve the fingerprints from the molecules. argparse stores the
    # `--dataset-oeb` option under `dataset_oeb`; the previous
    # `args.dataset_file` lookup raised AttributeError.
    dataset_stream = oechem.oemolistream(args.dataset_oeb)
    try:
        fingerprints = [
            DancePipeline.get_fingerprint_from_mol(mol)
            for mol in dataset_stream.GetOEMols()
        ]
    finally:
        # Close the stream even if a molecule has a malformed fingerprint.
        dataset_stream.close()

    # Plot CDFs of the fingerprints, display them, and save them to a file.
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))

    # NOTE(review): both plots filter on the second component (i[1]) being
    # greater than -1 -- presumably -1 marks a molecule without a usable
    # central WBO, so both plots cover the same molecules; confirm.
    num_atoms = sorted(i[0] for i in fingerprints if i[1] > -1)
    _plot_cdf(ax[0], num_atoms, "CDF of Molecule Size (Number of atoms)",
              "Number of atoms", len(fingerprints))

    central_wbo = sorted(i[1] for i in fingerprints if i[1] > -1.0)
    # The y-ticks for this plot were previously applied to ax[0] by mistake.
    _plot_cdf(ax[1], central_wbo, "CDF of Central WBO", "Central WBO",
              len(fingerprints))

    fig.savefig("results/fingerprint_cdf.png")
    plt.show()
def test_initial_molecules_not_available():
    """Check that a freshly constructed pipeline has `num_molecules` None."""
    pipeline = DancePipeline("SMILES", "foo.smi")
    molecule_count = pipeline.num_molecules
    assert molecule_count is None
def test_init_raises_exception_with_bad_database_type():
    """Check that an unrecognized database type raises RuntimeError."""
    with pytest.raises(RuntimeError):
        # Only the constructor call matters; the instance is never used, so
        # it is not bound to a name.
        DancePipeline("FOOBAR", "foobar.baz")