Esempio n. 1
0
def test_filter_sets_output_oeb_attribute(tmp_path):
    """After filtering, the pipeline exposes the output OEB path it was given."""
    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_always, filtered_oeb)

    assert pipeline.filter_output_oeb == filtered_oeb
Esempio n. 2
0
def test_fingerprint_retrieval_raises_exceptions_with_bad_data():
    """Fingerprint retrieval raises ValueError when the stored data is incomplete."""
    molecule = oechem.OEMol()

    # Case 1: the molecule carries no fingerprint length tag at all.
    with pytest.raises(ValueError):
        DancePipeline.get_fingerprint_from_mol(molecule)

    # Case 2: a length tag is present, but none of the values were stored.
    molecule.SetIntData(DancePipeline.FINGERPRINT_LENGTH_NAME, 4)
    with pytest.raises(ValueError):
        DancePipeline.get_fingerprint_from_mol(molecule)
Esempio n. 3
0
def test_filters_molecules_with_relevance_function(tmp_path):
    """Only the molecules accepted by the relevance function are kept by the filter."""
    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_if_contains_nitrogen, filtered_oeb)

    # The molecules expected to pass the filter.
    expected_smiles = ["N", "N#N", "C#N"]
    assert pipeline.num_molecules == len(expected_smiles)

    expected_canonical = utils.get_list_of_canonical_isomeric_smiles(expected_smiles)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb, expected_canonical)
Esempio n. 4
0
def test_filters_from_smiles_database(tmp_path):
    """Filtering a SMILES database with an always-relevant function keeps everything."""
    input_smiles = tmp_path / "smiles.smi"
    input_smiles.write_text("\n".join(TEST_SMILES))
    filtered_oeb = tmp_path / "filter_output.oeb"

    pipeline = DancePipeline("SMILES", input_smiles)
    pipeline.filter(_relevant_always, filtered_oeb)

    # Nothing is rejected, so `num_molecules` must equal the input size and the
    # output OEB must hold every molecule that was originally provided.
    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb, TEST_CANONICAL_ISOMERIC_SMILES)
Esempio n. 5
0
def test_select_with_longer_fingerprints(pipeline_test_files):
    """Selection sorts and picks molecules correctly with multi-element fingerprints."""
    smiles_in, filtered_oeb, fingerprinted_oeb, dataset_smiles, sorted_oeb = pipeline_test_files
    smiles_in.write_text("\n".join(TEST_SMILES))

    def _padded_fingerprint(mol):
        # (1, 1, num_atoms): the constant leading entries make the fingerprint
        # longer, so sorting has to look past them to order the molecules.
        return (1, 1, mol.NumAtoms())

    pipeline = DancePipeline("SMILES", smiles_in)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_padded_fingerprint, fingerprinted_oeb)
    pipeline.select(3, "SMILES", dataset_smiles, sorted_oeb, in_memory_sorting_threshold=3)

    # The sorted OEB must list the molecules in fingerprint order. Two orderings
    # are accepted, differing only in the relative order of "N#N" and "C#N"
    # (presumably because they share a fingerprint -- confirm).
    sorted_smiles = []
    for mol in utils.get_mols_from_oeb(sorted_oeb):
        sorted_smiles.append(oechem.OEMolToSmiles(mol))
    first_ordering = utils.get_list_of_canonical_isomeric_smiles(["N", "N#N", "C#N", "O=C=O"])
    assert sorted_smiles == first_ordering or sorted_smiles == \
        utils.get_list_of_canonical_isomeric_smiles(["N", "C#N", "N#N", "O=C=O"])

    # The dataset file must contain exactly the selected molecules.
    expected_selection = utils.get_list_of_canonical_isomeric_smiles(["N", "O=C=O"])
    utils.assert_smiles_in_file_are_equal(dataset_smiles, expected_selection)
Esempio n. 6
0
def _pipeline_executed_until_filter(pipeline_test_files):
    """Builds a pipeline that has already run its filter step.

    Returns the pipeline together with the SMILES input file, the filter output
    OEB, and the (not yet written) fingerprint output OEB.
    """
    # The last two files are part of the fixture tuple but are not needed here.
    smiles_in, filtered_oeb, fingerprinted_oeb, _dataset_smiles, _sorted_oeb = pipeline_test_files
    smiles_in.write_text("\n".join(TEST_SMILES))

    pipeline = DancePipeline("SMILES", smiles_in)
    pipeline.filter(_relevant_always, filtered_oeb)

    return pipeline, smiles_in, filtered_oeb, fingerprinted_oeb
Esempio n. 7
0
def test_retrieves_correct_fingerprint_from_mol():
    """A fingerprint stored as tagged data on a molecule is read back as a tuple."""
    expected = (2, 1, 8, 7)

    # Store the length tag, then one double-valued tag per fingerprint entry.
    molecule = oechem.OEMol()
    molecule.SetIntData(DancePipeline.FINGERPRINT_LENGTH_NAME, len(expected))
    for position, component in enumerate(expected):
        tag = f"{DancePipeline.FINGERPRINT_VALUE_NAME}_{position}"
        molecule.SetDoubleData(tag, component)

    assert DancePipeline.get_fingerprint_from_mol(molecule) == expected
Esempio n. 8
0
def test_filters_from_oeb_database(tmp_path):
    """Filtering an OEB database with an always-relevant function keeps everything."""
    # Populate the input OEB with one molecule per test SMILES string.
    source_oeb = tmp_path / "molecules.oeb"
    writer = oechem.oemolostream(str(source_oeb))
    for smiles in TEST_SMILES:
        oechem.OEWriteMolecule(writer, utils.oemol_from_smiles(smiles))
    writer.close()

    filtered_oeb = tmp_path / "filter_output.oeb"
    pipeline = DancePipeline("OEB", source_oeb)
    pipeline.filter(_relevant_always, filtered_oeb)

    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb, TEST_CANONICAL_ISOMERIC_SMILES)
Esempio n. 9
0
def test_filters_from_mol2_dir_database(tmp_path):
    """Filtering a directory of mol2 files with an always-relevant function keeps everything."""
    # Write each test molecule to its own numbered mol2 file.
    molecules_dir = tmp_path / "molecules"
    molecules_dir.mkdir()
    for index, molecule in enumerate(TEST_OEMOLS):
        writer = oechem.oemolostream(str(molecules_dir / f"{index}.mol2"))
        oechem.OEWriteMolecule(writer, molecule)
        writer.close()

    filtered_oeb = tmp_path / "filter_output.oeb"
    pipeline = DancePipeline("MOL2_DIR", molecules_dir)
    pipeline.filter(_relevant_always, filtered_oeb)

    # The input is a mol2 directory, but the comparison is still done through
    # SMILES because there is no way to compare two OEMols directly.
    assert pipeline.num_molecules == len(TEST_SMILES)
    utils.assert_smiles_in_oeb_are_equal(filtered_oeb, TEST_CANONICAL_ISOMERIC_SMILES)
Esempio n. 10
0
def test_select_on_larger_dataset(pipeline_test_files):
    """Runs the whole pipeline on a 30-molecule input presented in random order."""
    smiles_in, filtered_oeb, fingerprinted_oeb, dataset_smiles, sorted_oeb = pipeline_test_files

    # Repeated molecules are fine: the pipeline does not deduplicate. The
    # shuffle checks that the result does not depend on the input ordering.
    shuffled_smiles = ["N", "O=C=O", "C#N"] * 10
    random.shuffle(shuffled_smiles)
    smiles_in.write_text("\n".join(shuffled_smiles))

    def _atom_count_fingerprint(mol):
        # Single-element fingerprint: the number of atoms in the molecule.
        return (mol.NumAtoms(), )

    # Run every pipeline step.
    pipeline = DancePipeline("SMILES", smiles_in)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint, fingerprinted_oeb)
    pipeline.select(10, "SMILES", dataset_smiles, sorted_oeb, in_memory_sorting_threshold=7)

    # Sorted output: ten copies of each molecule, grouped in this order.
    expected_sorted = [smiles for smiles in ("N", "C#N", "O=C=O") for _ in range(10)]
    utils.assert_ordered_smiles_in_oeb_are_equal(
        sorted_oeb, utils.get_list_of_canonical_isomeric_smiles(expected_sorted))

    # Selected dataset: one of each molecule, in the same order.
    expected_selection = utils.get_list_of_canonical_isomeric_smiles(["N", "C#N", "O=C=O"])
    utils.assert_ordered_smiles_in_file_are_equal(dataset_smiles, expected_selection)
Esempio n. 11
0
def _pipeline_executed_until_fingerprint(pipeline_test_files):
    """Builds a pipeline that has already run its filter and assign_fingerprint steps.

    Each molecule's fingerprint is a one-element tuple holding its number of
    atoms.

    Returns the pipeline followed by the five associated test files.
    """
    smiles_in, filtered_oeb, fingerprinted_oeb, dataset_smiles, sorted_oeb = pipeline_test_files
    smiles_in.write_text("\n".join(TEST_SMILES))

    def _atom_count_fingerprint(mol):
        return (mol.NumAtoms(), )

    pipeline = DancePipeline("SMILES", smiles_in)
    pipeline.filter(_relevant_always, filtered_oeb)
    pipeline.assign_fingerprint(_atom_count_fingerprint, fingerprinted_oeb)

    return pipeline, smiles_in, filtered_oeb, fingerprinted_oeb, dataset_smiles, sorted_oeb
Esempio n. 12
0
def _plot_cdf(axis, values, title, xlabel, num_fingerprints):
    """Draws a cumulative histogram of the sorted ``values`` on ``axis``.

    Args:
        axis: The matplotlib axes to draw on.
        values: Sorted values to plot; they are also used as the bin edges.
        title: Title of the subplot.
        xlabel: Label of the x axis.
        num_fingerprints: Total number of fingerprints in the dataset. When it
            is exactly 20, the y ticks are set to the even numbers 0 through 20.
    """
    axis.set_title(title)
    axis.hist(values, bins=values, cumulative=True)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Molecules")
    if num_fingerprints == 20:
        axis.set_yticks(range(0, 21, 2))
    axis.grid(True)


def main():
    """Parses arguments and prints out the analysis.

    Reads the molecules from the OEB file given by ``--dataset-oeb``, retrieves
    each molecule's fingerprint, and plots cumulative histograms of the first
    two fingerprint entries (labelled as number of atoms and central WBO). The
    sorted values are pretty-printed, and the figure is saved to
    ``results/fingerprint_cdf.png`` and then displayed.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dataset-oeb", default="dataset.oeb")
    args = parser.parse_args()

    # Retrieve the fingerprints from the molecules. argparse stores the value
    # of "--dataset-oeb" under the attribute `dataset_oeb`.
    dataset_stream = oechem.oemolistream(args.dataset_oeb)
    try:
        fingerprints = [
            DancePipeline.get_fingerprint_from_mol(mol)
            for mol in dataset_stream.GetOEMols()
        ]
    finally:
        # Release the stream even if a molecule has a malformed fingerprint.
        dataset_stream.close()

    # Plot CDFs of the fingerprints, display them, and save them to a file.
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))

    # Both series skip fingerprints whose second entry is not greater than -1
    # (presumably a marker for molecules without a valid WBO -- confirm).
    num_atoms = sorted(i[0] for i in fingerprints if i[1] > -1)
    pprint(num_atoms)
    _plot_cdf(ax[0], num_atoms, "CDF of Molecule Size (Number of atoms)",
              "Number of atoms", len(fingerprints))

    central_wbo = sorted(i[1] for i in fingerprints if i[1] > -1.0)
    pprint(central_wbo)
    _plot_cdf(ax[1], central_wbo, "CDF of Central WBO", "Central WBO",
              len(fingerprints))

    fig.savefig("results/fingerprint_cdf.png")
    plt.show()
Esempio n. 13
0
def test_initial_molecules_not_available():
    """A freshly constructed pipeline has `num_molecules` set to None."""
    pipeline = DancePipeline("SMILES", "foo.smi")
    assert pipeline.num_molecules is None
Esempio n. 14
0
def test_init_raises_exception_with_bad_database_type():
    """Constructing a pipeline with the database type "FOOBAR" raises RuntimeError."""
    with pytest.raises(RuntimeError):
        DancePipeline("FOOBAR", "foobar.baz")