コード例 #1
0
def test_select_with_longer_fingerprints(pipeline_test_files):
    """Run filter, assign_fingerprint and select using three-element fingerprints.

    Verifies both the order of the molecules written to the sorted-by-fingerprint
    OEB file and the SMILES written to the final dataset file.
    """
    (
        smiles_file,
        filter_output_oeb,
        fingerprint_output_oeb,
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
    ) = pipeline_test_files

    def _padded_fingerprint(mol):
        # The fingerprint has the form (1, 1, num_atoms): only the last element
        # varies, so this exercises sorting when the fingerprint is longer than
        # a single element.
        return (1, 1, mol.NumAtoms())

    smiles_file.write_text("\n".join(TEST_SMILES))

    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(_padded_fingerprint, fingerprint_output_oeb)
    pipeline.select(
        3,
        "SMILES",
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
        in_memory_sorting_threshold=3,
    )

    # The molecules must come out sorted by fingerprint. Two orderings are
    # accepted; they differ only in the relative position of N#N and C#N
    # (presumably because both molecules get the same fingerprint).
    sorted_smiles = [
        oechem.OEMolToSmiles(mol)
        for mol in utils.get_mols_from_oeb(sorted_by_fingerprint_oeb)
    ]
    accepted_orderings = (
        ["N", "N#N", "C#N", "O=C=O"],
        ["N", "C#N", "N#N", "O=C=O"],
    )
    assert any(
        sorted_smiles == utils.get_list_of_canonical_isomeric_smiles(ordering)
        for ordering in accepted_orderings
    )

    # The final dataset must contain exactly the expected selection.
    expected_selection = utils.get_list_of_canonical_isomeric_smiles(["N", "O=C=O"])
    utils.assert_smiles_in_file_are_equal(smiles_dataset_file, expected_selection)
コード例 #2
0
def _pipeline_executed_until_fingerprint(pipeline_test_files):
    """Build a pipeline that has already run the filter and assign_fingerprint steps.

    TEST_SMILES is written to the SMILES input file, every step up to and
    including assign_fingerprint is executed, and each molecule receives a
    one-element fingerprint holding its number of atoms.

    Returns:
        A tuple of the pipeline followed by the five associated files, in the
        same order in which they appear in ``pipeline_test_files``.
    """
    (
        smiles_file,
        filter_output_oeb,
        fingerprint_output_oeb,
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
    ) = pipeline_test_files

    def _atom_count_fingerprint(mol):
        # One-element tuple: the atom count is the whole fingerprint.
        return (mol.NumAtoms(), )

    smiles_file.write_text("\n".join(TEST_SMILES))

    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint, fingerprint_output_oeb)

    return (
        pipeline,
        smiles_file,
        filter_output_oeb,
        fingerprint_output_oeb,
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
    )
コード例 #3
0
def test_select_on_larger_dataset(pipeline_test_files):
    """Run the whole pipeline on 30 shuffled molecules and check select()'s outputs.

    The input consists of ten copies each of three SMILES strings, shuffled into
    a random order. The test checks the ordering of the sorted-by-fingerprint OEB
    file and the ordered contents of the final SMILES dataset file.
    """
    (
        smiles_file,
        filter_output_oeb,
        fingerprint_output_oeb,
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
    ) = pipeline_test_files

    # Repeated molecules are fine here: the pipeline does not remove duplicates.
    input_smiles = ["N", "O=C=O", "C#N"] * 10
    # Shuffle so the test does not depend on any particular input ordering.
    random.shuffle(input_smiles)
    smiles_file.write_text("\n".join(input_smiles))

    # Execute every pipeline step.
    pipeline = DancePipeline("SMILES", smiles_file)
    pipeline.filter(_relevant_always, filter_output_oeb)
    pipeline.assign_fingerprint(lambda mol: (mol.NumAtoms(), ), fingerprint_output_oeb)
    pipeline.select(
        10,
        "SMILES",
        smiles_dataset_file,
        sorted_by_fingerprint_oeb,
        in_memory_sorting_threshold=7,
    )

    # Expected order of the distinct molecules in both outputs.
    expected_order = ["N", "C#N", "O=C=O"]

    # The sorted OEB should hold all ten copies of each molecule, grouped in
    # the expected order.
    expected_sorted = utils.get_list_of_canonical_isomeric_smiles(
        [smiles for smiles in expected_order for _ in range(10)])
    utils.assert_ordered_smiles_in_oeb_are_equal(sorted_by_fingerprint_oeb, expected_sorted)

    # The final dataset should list each molecule once, in the same order.
    expected_dataset = utils.get_list_of_canonical_isomeric_smiles(list(expected_order))
    utils.assert_ordered_smiles_in_file_are_equal(smiles_dataset_file, expected_dataset)