def test_select_with_longer_fingerprints(pipeline_test_files):
    """Run the full pipeline with three-element fingerprints and check its outputs.

    Verifies both the fingerprint-sorted OEB file and the SMILES dataset file
    written by ``select``.
    """
    smiles_path, filtered_oeb, fingerprinted_oeb, dataset_path, sorted_oeb = pipeline_test_files
    smiles_path.write_text("\n".join(TEST_SMILES))

    def _padded_fingerprint(mol):
        # Every fingerprint is (1, 1, num_atoms): only the last entry varies,
        # so sorting must take the whole (longer) fingerprint into account.
        return (1, 1, mol.NumAtoms())

    pipeline = DancePipeline("SMILES", smiles_path)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_padded_fingerprint, fingerprinted_oeb)
    pipeline.select(3, "SMILES", dataset_path, sorted_oeb, in_memory_sorting_threshold=3)

    # Collect the SMILES of the molecules in the order they were written out.
    sorted_smiles = []
    for mol in utils.get_mols_from_oeb(sorted_oeb):
        sorted_smiles.append(oechem.OEMolToSmiles(mol))

    canonical = utils.get_list_of_canonical_isomeric_smiles

    # The molecules must be sorted by fingerprint; "N#N" and "C#N" are accepted
    # in either relative order.
    assert (sorted_smiles == canonical(["N", "N#N", "C#N", "O=C=O"])
            or sorted_smiles == canonical(["N", "C#N", "N#N", "O=C=O"]))

    # The dataset file must contain exactly the selected molecules.
    utils.assert_smiles_in_file_are_equal(dataset_path, canonical(["N", "O=C=O"]))
def _pipeline_executed_until_fingerprint(pipeline_test_files):
    """Build a pipeline that has already run its filter and assign_fingerprint steps.

    TEST_SMILES is written to the SMILES input file, every molecule is passed
    through the filter with ``_relevant_always``, and each molecule receives a
    one-element fingerprint holding its atom count.

    Returns the pipeline followed by the five files from ``pipeline_test_files``
    in their original order: SMILES input, filter output, fingerprint output,
    SMILES dataset, and sorted-by-fingerprint output.
    """
    smiles_path, filtered_oeb, fingerprinted_oeb, dataset_path, sorted_oeb = pipeline_test_files
    smiles_path.write_text("\n".join(TEST_SMILES))

    def _atom_count_fingerprint(mol):
        # Single-element tuple: the fingerprint is just the number of atoms.
        return (mol.NumAtoms(), )

    pipeline = DancePipeline("SMILES", smiles_path)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint, fingerprinted_oeb)

    return (
        pipeline,
        smiles_path,
        filtered_oeb,
        fingerprinted_oeb,
        dataset_path,
        sorted_oeb,
    )
def test_select_on_larger_dataset(pipeline_test_files):
    """Run the full pipeline on 30 shuffled molecules and check both outputs.

    The input is ten copies each of three molecules in random order; the test
    checks the exact order of the fingerprint-sorted OEB file and of the SMILES
    dataset file written by ``select``.
    """
    smiles_path, filtered_oeb, fingerprinted_oeb, dataset_path, sorted_oeb = pipeline_test_files

    # Repeated molecules are fine: the pipeline does not filter out duplicates.
    input_smiles = ["N", "O=C=O", "C#N"] * 10
    # Shuffle because the pipeline should work for any molecule ordering.
    random.shuffle(input_smiles)
    smiles_path.write_text("\n".join(input_smiles))

    def _atom_count_fingerprint(mol):
        # Single-element tuple: the fingerprint is just the number of atoms.
        return (mol.NumAtoms(), )

    # Pipeline execution.
    pipeline = DancePipeline("SMILES", smiles_path)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint, fingerprinted_oeb)
    pipeline.select(10, "SMILES", dataset_path, sorted_oeb, in_memory_sorting_threshold=7)

    # Sorted output: all ten copies of each molecule, grouped in this order.
    expected_sorted = utils.get_list_of_canonical_isomeric_smiles(
        [smiles for smiles in ("N", "C#N", "O=C=O") for _ in range(10)])
    utils.assert_ordered_smiles_in_oeb_are_equal(sorted_oeb, expected_sorted)

    # Dataset output: one entry per distinct molecule, in the same order.
    expected_dataset = utils.get_list_of_canonical_isomeric_smiles(["N", "C#N", "O=C=O"])
    utils.assert_ordered_smiles_in_file_are_equal(dataset_path, expected_dataset)