Ejemplo n.º 1
0
def test_normalize_molecule():
    """Check that ``normalize_molecule`` rewrites ``[S+2]([O-])([O-])`` as ``S(=O)(=O)``."""

    charge_separated = Molecule.from_smiles("C[S+2]([O-])([O-])C")
    reference = Molecule.from_smiles("CS(=O)(=O)C")

    # Before normalization the two representations must not be considered equal,
    # otherwise the second assertion would prove nothing.
    matches_before, _ = Molecule.are_isomorphic(charge_separated, reference)
    assert not matches_before

    normalized = normalize_molecule(charge_separated)

    matches_after, _ = Molecule.are_isomorphic(normalized, reference)
    assert matches_after
Ejemplo n.º 2
0
def test_record_to_molecule(result, query_function, public_client):
    """Check that a queried record is converted into a matching single-conformer molecule.

    The conformer coordinates (in bohr) must agree with the geometry stored on the
    QC record, and the molecule must be isomorphic with the one described by
    ``result.cmiles``.
    """

    reference_molecule = Molecule.from_mapped_smiles(result.cmiles)

    fetched = query_function(public_client.address, [result])
    assert len(fetched) == 1

    record, molecule = fetched[0]
    assert molecule.n_conformers == 1

    # The accessor for the relevant QC molecule depends on the record type.
    if isinstance(record, ResultRecord):
        reference_qc_molecule = record.get_molecule()
    elif isinstance(record, OptimizationRecord):
        reference_qc_molecule = record.get_final_molecule()
    else:
        raise RuntimeError()

    conformer_in_bohr = molecule.conformers[0].value_in_unit(unit.bohr)
    reference_geometry = reference_qc_molecule.geometry.reshape(
        (molecule.n_atoms, 3))
    assert numpy.allclose(conformer_in_bohr, reference_geometry)

    isomorphism = Molecule.are_isomorphic(molecule, reference_molecule)
    assert isomorphism[0]

    # Repeat the query with outgoing requests mocked: the mocker would raise an
    # exception if the client tried to reach out to the server.
    with requests_mock.Mocker():
        query_function(public_client.address, [result])
Ejemplo n.º 3
0
def map_indexed_smiles(smiles_a: str, smiles_b: str) -> Dict[int, int]:
    """Builds the atom index correspondence between two indexed SMILES patterns.

    Both patterns are parsed into molecules and the atom map reported by the
    isomorphism check between them is returned.

    Args:
        smiles_a: The first indexed SMILES pattern.
        smiles_b: The second indexed SMILES pattern.

    Returns:
        A dictionary whose keys are atom indices in ``smiles_a`` and whose values
        are the indices of the corresponding atoms in ``smiles_b``.

    Examples:

        >>> map_indexed_smiles("[Cl:1][H:2]", "[Cl:2][H:1]")
        {0: 1, 1: 0}
    """

    from openff.toolkit.topology import Molecule

    molecule_a, molecule_b = (
        Molecule.from_mapped_smiles(pattern) for pattern in (smiles_a, smiles_b)
    )

    # ``are_isomorphic`` returns ``(is_isomorphic, atom_map)``; only the map is needed.
    isomorphism = Molecule.are_isomorphic(
        molecule_a, molecule_b, return_atom_map=True
    )

    return isomorphism[1]
def test_attributes_to_openff_molecule():
    """Round trip a molecule through ``MoleculeAttributes`` and back."""

    original: Molecule = Molecule.from_smiles("CC")

    round_tripped = MoleculeAttributes.from_openff_molecule(
        molecule=original
    ).to_openff_molecule()

    isomorphic, atom_map = Molecule.are_isomorphic(
        original, round_tripped, return_atom_map=True
    )
    assert isomorphic is True

    # The round trip must also preserve the atom ordering, i.e. yield the
    # identity mapping.
    identity_map = {index: index for index in range(original.n_atoms)}
    assert atom_map == identity_map
Ejemplo n.º 5
0
def test_generate_conformers_ordering():
    """Check that conformer generation leaves the atom ordering untouched."""

    input_molecule = Molecule.from_smiles("CCCC")

    with_conformer = chemi._generate_conformers(input_molecule, max_confs=1)
    assert with_conformer.n_conformers == 1

    # Every atom index must map onto itself.
    isomorphism = Molecule.are_isomorphic(
        input_molecule, with_conformer, return_atom_map=True
    )
    atom_map = isomorphism[1]

    assert all(source == target for source, target in atom_map.items())
Ejemplo n.º 6
0
def test_xxx_from_networkx(from_function):
    """Check that converting a networkx graph back to a molecule preserves atom order.

    Only conversions which return an OpenFF ``Molecule`` are verified; any other
    return type is accepted without further checks.
    """

    reference = Molecule.from_mapped_smiles("[C:1]([O-:2])(=[O:3])([H:4])")
    rebuilt = from_function(openff_molecule_to_networkx(reference))

    if not isinstance(rebuilt, Molecule):
        return

    are_isomorphic, atom_map = Molecule.are_isomorphic(
        reference, rebuilt, return_atom_map=True
    )

    assert are_isomorphic
    # All four atoms must map onto themselves.
    assert atom_map == dict(zip(range(4), range(4)))
Ejemplo n.º 7
0
def test_cached_query_torsion_drive_results(public_client):
    """Check that torsion drive results are retrieved, converted and cached.

    The returned molecule must carry one conformer per grid id (24 here), each
    matching the final geometry stored on the record, and a repeated query must
    not need to contact the server.
    """

    assert len(_grid_id_cache) == 0

    result = TorsionDriveResult(
        record_id=ObjectId("36633243"),
        cmiles="[H:6][N:5]([H:7])[C:3](=[O:4])[C:1]#[N:2]",
        inchi_key="",
    )
    reference_molecule = Molecule.from_mapped_smiles(result.cmiles)

    records = cached_query_torsion_drive_results(public_client.address, [result])
    assert len(records) == 1

    record, molecule = records[0]
    assert molecule.n_conformers == 24
    assert "grid_ids" in molecule.properties

    # Index the reference geometries by the JSON-serialized form of their grid id,
    # which is the form looked up from ``molecule.properties["grid_ids"]`` below.
    reference_qc_molecules = {}
    for raw_grid_id, qc_molecule in record.get_final_molecules().items():
        reference_qc_molecules[json.dumps(raw_grid_id)] = qc_molecule

    geometry_shape = (molecule.n_atoms, 3)

    for grid_id, conformer in zip(
        molecule.properties["grid_ids"], molecule.conformers
    ):
        reference_geometry = reference_qc_molecules[grid_id].geometry.reshape(
            geometry_shape
        )
        assert numpy.allclose(conformer.value_in_unit(unit.bohr), reference_geometry)

    assert len(molecule.properties["grid_ids"]) == 24

    isomorphism = Molecule.are_isomorphic(molecule, reference_molecule)
    assert isomorphism[0]

    assert len(_grid_id_cache) == 24

    # Repeat the query with outgoing requests mocked: the mocker would raise an
    # exception if the client tried to reach out to the server.
    with requests_mock.Mocker():
        cached_query_torsion_drive_results(public_client.address, [result])
Ejemplo n.º 8
0
def _generate_conformers(
    molecule: Molecule, max_confs: int = 800, rms_threshold: float = 1.0
) -> Molecule:
    """Build a multi-conformer copy of the supplied molecule.

    The input molecule is never modified; all work is done on a deep copy.

    Parameters
    ----------
    molecule
        Molecule for which to generate conformers.
    max_confs
        Maximum number of conformers to request.
    rms_threshold
        RMS cutoff [Angstrom] handed to the toolkit's conformer generation, which
        uses it to discard redundant conformers.

    Returns
    -------
        A new molecule holding the generated conformers (up to ``max_confs``),
        remapped back onto the atom ordering of the input and carrying the
        input's ``"atom_map"`` property when one was present.
    """

    from simtk import unit

    working_copy = copy.deepcopy(molecule)

    # Keep the atom map aside so that a toolkit cannot remove or mangle it.
    saved_atom_map = working_copy.properties.pop("atom_map", None)

    # Generate the conformers on a canonically ordered molecule so that the same
    # conformers are produced for the same molecule regardless of its atom order.
    canonical = working_copy.canonical_order_atoms()

    rms_cutoff = rms_threshold * unit.angstrom
    canonical.generate_conformers(n_conformers=max_confs, rms_cutoff=rms_cutoff)

    # Map the canonical ordering back onto the original ordering.
    isomorphism = Molecule.are_isomorphic(
        canonical, working_copy, return_atom_map=True
    )
    reordered = canonical.remap(isomorphism[1])

    if saved_atom_map is None:
        return reordered

    reordered.properties["atom_map"] = saved_atom_map
    return reordered
Ejemplo n.º 9
0
def normalize_molecule(molecule: "Molecule",
                       check_output: bool = True) -> "Molecule":
    """Applies a set of reaction SMARTS in sequence to an input molecule in order to
    attempt to 'normalize' its structure.

    This involves, for example, converting ``-N(=O)=O`` groups to ``-N(=O)[O-]`` and
    ``-[S+2]([O-])([O-])-`` to ``-S(=O)=O-``. See ``nagl/data/normalizations.json`` for
    a full list of transforms.

    Args:
        molecule: The molecule to normalize.
        check_output: Whether to make sure the normalized molecule is isomorphic with
            the input molecule, ignoring aromaticity, bond order, formal charge, and
            stereochemistry.

    Returns:
        The normalized molecule.

    Raises:
        AssertionError: If ``check_output`` is true and the normalized molecule is
            not isomorphic with the input molecule.
    """

    from openff.toolkit.topology import Molecule
    from openff.toolkit.utils import ToolkitUnavailableException

    reaction_smarts_path = data.get_file_path("normalizations.json")

    with open(reaction_smarts_path) as file:
        reaction_smarts = [entry["smarts"] for entry in json.load(file)]

    try:  # pragma: no cover
        # The ``_oe_normalize_molecule`` code path is currently disabled, so this
        # always falls through to ``_rd_normalize_molecule`` below.
        raise NotImplementedError()
    except (
            ImportError,
            ModuleNotFoundError,
            ToolkitUnavailableException,
            NotImplementedError,
    ):
        normal_molecule = _rd_normalize_molecule(molecule, reaction_smarts)

    if check_output:
        is_isomorphic, _ = Molecule.are_isomorphic(
            molecule,
            normal_molecule,
            aromatic_matching=False,
            formal_charge_matching=False,
            bond_order_matching=False,
            atom_stereochemistry_matching=False,
            bond_stereochemistry_matching=False,
        )

        # Raise explicitly instead of using an ``assert`` statement so that the
        # check is not silently dropped when Python runs with ``-O``.
        if not is_isomorphic:
            raise AssertionError(
                "normalization changed the molecule - this should not happen")

    return normal_molecule
Ejemplo n.º 10
0
def test_select_lowest_energy_forms():
    """Check that only the first of two candidate forms is kept as lowest in energy."""

    candidate_smiles = (
        "[N:1]([H:2])([H:3])[C:4](=[O:5])[H:6]",
        "[N+:1]([H:2])([H:3])=[C:4]([O-:5])[H:6]",
    )
    input_molecules = [
        Molecule.from_mapped_smiles(pattern) for pattern in candidate_smiles
    ]

    # Key each graph by the byte-encoded position of its molecule in the input.
    graphs_by_key = {}
    for position, molecule in enumerate(input_molecules):
        graphs_by_key[str(position).encode()] = openff_molecule_to_networkx(molecule)

    lowest_energy_forms = _select_lowest_energy_forms(graphs_by_key)
    assert len(lowest_energy_forms) == 1

    selected_form = openff_molecule_from_networkx(lowest_energy_forms[b"0"])
    matches_first_input, _ = Molecule.are_isomorphic(selected_form, input_molecules[0])
    assert matches_first_input

    print(lowest_energy_forms)
Ejemplo n.º 11
0
def test_extract_fragment(smiles, atoms, bonds, expected, extract_method):
    """Check that ``extract_method`` yields the expected fragment.

    The test is skipped when the extraction method needs a module that is not
    installed. Bond stereochemistry is ignored in the comparison.
    """

    molecule = Molecule.from_mapped_smiles(smiles)
    # Map each atom index onto its one-based counterpart.
    molecule.properties["atom_map"] = dict(
        enumerate(range(1, molecule.n_atoms + 1))
    )

    try:
        fragment = extract_method(molecule, atoms, bonds)
    except ModuleNotFoundError as error:
        fragment = None
        pytest.skip(str(error))

    expected_fragment = Molecule.from_smiles(expected)

    isomorphism = Molecule.are_isomorphic(
        fragment, expected_fragment, bond_stereochemistry_matching=False
    )
    assert isomorphism[0]
Ejemplo n.º 12
0
def _mol_source_label(mol):
    """Return the ``file:index`` label recording where ``mol`` was loaded from."""
    return f'{mol.properties["original_file"]}:{mol.properties["original_file_index"]}'


def validate_and_assign(loaded_mols,
                        group_name,
                        add,
                        existing_output_mols,
                        name_assignments=None):
    """Validate input molecules, group their conformers and assign output names.

    Each input molecule is round-tripped through a temporary SDF file; molecules
    that fail, that share a SMILES with an existing output molecule, or whose
    conformer duplicates an already accepted one are collected as errors. The
    remaining molecules are grouped by SMILES, with every further molecule of a
    known SMILES added as an extra conformer, and finally split into one
    single-conformer molecule per conformer with a generated name.

    As a side effect this configures the root logger to write to ``log.txt`` and
    prints progress messages.

    Parameters
    ----------
    loaded_mols
        Iterable of molecules to validate. Each must carry the
        ``original_file``, ``original_file_index`` and ``original_name``
        properties; all other properties and the partial charges are removed
        from accepted molecules in place. Each is expected to hold a conformer.
    group_name
        Prefix used for the generated names (``{group_name}-{index:05d}``) and
        stored in the ``group_name`` property.
    add
        Not used by this function.
    existing_output_mols
        Molecules that are already part of the output. When non-empty, each must
        carry a ``molecule_index`` property; numbering of new molecules continues
        after the highest existing index.
    name_assignments
        Optional list to which ``(original name, original file, original file
        index, new name)`` tuples are appended. A new list is created if omitted.

    Returns
    -------
        A tuple ``(success_mols, error_mols, name_assignments)`` where
        ``success_mols`` is the list of renamed single-conformer molecules and
        ``error_mols`` is a list of ``(source label, molecule, reason)`` tuples,
        the reason being either an exception or a message.
    """
    if name_assignments is None:
        name_assignments = []

    logging.basicConfig(filename='log.txt')

    # Only these properties survive validation; everything else is stripped.
    allowed_properties = ('original_file', 'original_file_index',
                          'original_name')

    smiles_to_success_mol = {}
    error_mols = []

    existing_smiles_to_mol = {
        mol.to_smiles(): mol
        for mol in existing_output_mols
    }

    # Handle 3d molecules
    print("Validating input molecules and grouping by connection table")
    logging.info("Validating input molecules and grouping by connection table")
    for mol in tqdm(loaded_mols):
        # Simulate a SDF file roundtrip to check for errors such as undefined stereochemistry
        try:
            with tempfile.NamedTemporaryFile(suffix='.sdf') as of:
                mol.to_file(of.name, file_format='sdf')
                of.seek(0)
                test_loaded_mol = Molecule.from_file(of.name,
                                                     file_format='sdf')
                test_loaded_mol.to_rdkit()
        except Exception as e:
            error_mols.append((_mol_source_label(mol), mol, e))
            continue

        # See whether this graph is already in the existing outputs
        smiles = mol.to_smiles()
        if smiles in existing_smiles_to_mol:
            msg = 'Input molecule graph is already present in output.\n'
            msg += f'{mol.name} from {_mol_source_label(mol)} '
            msg += 'has an equivalent connection table to existing output '
            msg += f'{existing_smiles_to_mol[smiles]}'
            logging.debug(msg)
            error_mols.append((_mol_source_label(mol), mol, msg))
            continue

        # If we've reached here, then the molecule is validated

        # Pop off now-nonessential metadata
        for key in list(mol.properties.keys()):
            if key not in allowed_properties:
                mol.properties.pop(key)
        mol.partial_charges = None

        # If this graph molecule IS already known, add this 3d information as a conformer
        if smiles in smiles_to_success_mol:
            try:
                orig_mol = smiles_to_success_mol[smiles]
                # Stereochemistry matching flags are left at their defaults.
                _, atom_map = Molecule.are_isomorphic(
                    mol,
                    orig_mol,
                    return_atom_map=True,
                    formal_charge_matching=False,
                    aromatic_matching=False,
                )
                reordered_mol = mol.remap(atom_map)
                # Work on a temporary copy of the parent mol for conformer alignment
                # and deduplication, so a failure leaves the stored molecule untouched
                temp_mol = copy.deepcopy(orig_mol)
                temp_mol.add_conformer(reordered_mol.conformers[0])
                temp_mol, _rmslist = align_offmol_conformers(temp_mol)
                # Don't trust rmslist above for deduplication -- It doesn't take into
                # account multiple atom mappings
                rms_cutoff = 0.2  # angstrom
                confs_to_delete = greedy_conf_deduplication(
                    temp_mol, rms_cutoff)
                if len(confs_to_delete) > 0:
                    msg = 'Duplicate molecule conformer input detected.\n'
                    msg += f'{_mol_source_label(mol)} has an RMSD within {rms_cutoff} A '
                    msg += 'to a conformer of the molecule originally loaded from the following file(s) and indices:'
                    msg += _mol_source_label(orig_mol)
                    logging.debug(msg)
                    error_mols.append((_mol_source_label(mol), mol, msg))
                    continue
                for key in allowed_properties:
                    temp_mol.properties[key].append(mol.properties[key])
                smiles_to_success_mol[smiles] = temp_mol
            # This is a catch-all for any unexpected processing errors that are encountered above
            except Exception as e:
                error_mols.append((_mol_source_label(mol), mol, e))

        # If this graph molecule ISN'T already known, then add
        # this representation as a new molecule
        else:
            # Change the metadata into lists so that we can record it for each conformer
            for key in allowed_properties:
                mol.properties[key] = [mol.properties[key]]
            smiles_to_success_mol[smiles] = mol

    # Assign names and write out files
    # Preserve a mapping of input filename/mol index to output name
    success_mols = []
    print("Assigning IDs and preparing molecules for output")
    logging.info("Assigning IDs and preparing molecules for output")
    # Determine the highest previously-existing mol index if the output directory is already populated
    if len(existing_output_mols) == 0:
        output_mol_index_start_value = 0
    else:
        output_mol_index_start_value = max(
            int(mol.properties['molecule_index'])
            for mol in existing_output_mols) + 1

    for success_mol_index, smiles in enumerate(
            tqdm(smiles_to_success_mol.keys())):
        success_mol = smiles_to_success_mol[smiles]
        unique_mol_index = success_mol_index + output_mol_index_start_value
        success_mol.properties['group_name'] = group_name
        success_mol.properties['molecule_index'] = unique_mol_index
        success_mol.name = f'{group_name}-{unique_mol_index:05d}'
        mol_copy = copy.deepcopy(success_mol)

        # Emit one single-conformer molecule per stored conformer
        for conf_index, conformer in enumerate(success_mol.conformers):
            mol_copy2 = copy.deepcopy(mol_copy)
            mol_copy2.name = f'{mol_copy.name}-{conf_index:02d}'

            # The metadata lists are parallel to the conformer list
            orig_file = success_mol.properties['original_file'][conf_index]
            orig_file_index = success_mol.properties['original_file_index'][
                conf_index]
            orig_name = success_mol.properties['original_name'][conf_index]
            msg = f'Molecule with name {orig_name} from '
            msg += f'file:position {orig_file}:{orig_file_index}'
            msg += ' has passed validation '
            msg += f'and is being renamed to {mol_copy2.name}.'
            logging.debug(msg)

            name_assignments.append(
                (orig_name, orig_file, orig_file_index, mol_copy2.name))
            mol_copy2._conformers = None
            mol_copy2.add_conformer(conformer)
            mol_copy2.properties['conformer_index'] = conf_index
            # Sanitize last remaining metadata
            for key in allowed_properties:
                mol_copy2.properties.pop(key)
            success_mols.append(mol_copy2)
    return success_mols, error_mols, name_assignments
def loadDataset_low(
    datasetName, specification, benchmark_smiles, qca_overlapped_entries
):
    """
    Low level call to load a torsion drive dataset and compute torsion barriers
    for the molecules which do not overlap with a benchmark set.

    Records whose molecule is isomorphic with a benchmark molecule (ignoring
    aromaticity, formal charge, bond order and stereochemistry) are skipped and
    their ids are appended to ``qca_overlapped_entries`` instead.

    Parameters
    ----------
    datasetName : str
        torsion drive dataset name.
    specification : str
        specification in the dataset. Example: "B3LYP-D3", "default", "UFF"
    benchmark_smiles : str
        path to a text file with one benchmark SMILES per line. Blank lines
        are ignored.
    qca_overlapped_entries : str
        path to a text file to which the ids of the overlapping entries are
        appended, one per line.

    Returns
    -------
    tb : list
        the value returned by ``torsion_barrier_for_molecule`` for each COMPLETE
        record which does not overlap with the benchmark set.

    Raises
    ------
    AssertionError
        if ``datasetName`` or ``specification`` is empty.
    """
    # Checked explicitly rather than with ``assert`` statements so that the
    # validation is not dropped when Python runs with ``-O``.
    if not datasetName:
        print("datasetName is empty. Check input list of dataset tuples")
        raise AssertionError("datasetName is empty")
    if not specification:
        print("specification is empty. Check input list of dataset tuples")
        raise AssertionError("specification is empty")

    # initiate qc portal instance
    client = ptl.FractalClient()
    # from the TorsionDriveDataset collection picking up given datasetName
    ds = client.get_collection("TorsionDriveDataset", datasetName)
    ds.status([specification], status="COMPLETE")

    # Serial implementation

    # Benchmark molecules from the lim_mobley_parsely_benchmark
    # https://openforcefield.org/force-fields/force-fields/
    # https://github.com/MobleyLab/benchmarkff/blob/91476147f35579bc52bf984839fd20c72a61d76d/molecules/set_v03_non_redundant/trim3_full_qcarchive.smi

    with open(benchmark_smiles) as f:
        # Drop the trailing newlines and skip blank lines.
        bm_mols = [
            Molecule.from_smiles(line.strip()) for line in f if line.strip()
        ]

    # Compare connectivity only when looking for overlaps.
    isomorphism_options = dict(
        return_atom_map=False,
        aromatic_matching=False,
        formal_charge_matching=False,
        bond_order_matching=False,
        atom_stereochemistry_matching=False,
        bond_stereochemistry_matching=False,
    )

    tb = []
    overlaps = 0
    qca_entries = []
    # Iterate over the rows of the frame; ``ds.df.size`` would count every cell
    # and overrun the row index if the frame had more than one column.
    for i in range(len(ds.df)):
        record = ds.df.iloc[i, 0]
        if record.status != "COMPLETE":
            continue

        entry = ds.get_entry(ds.df.index[i])
        mapped_smiles = entry.attributes[
            "canonical_isomeric_explicit_hydrogen_mapped_smiles"
        ]
        mol1 = Molecule.from_mapped_smiles(mapped_smiles)

        # ``any`` stops at the first benchmark molecule which matches.
        overlaps_benchmark = any(
            Molecule.are_isomorphic(mol1, bm_mol, **isomorphism_options)[0]
            for bm_mol in bm_mols
        )

        if overlaps_benchmark:
            overlaps += 1
            # NOTE(review): the id is always taken from the "default" entry of
            # the object map rather than from ``specification`` - confirm that
            # this is intended.
            qca_entries.append(entry.object_map["default"])
        else:
            tb.append(torsion_barrier_for_molecule(record, mapped_smiles))

    with open(qca_overlapped_entries, "a") as f:
        f.writelines(f"{item}\n" for item in qca_entries)

    print(
        "No. of overlaps with benchmark set, qca entries added to overlaps_qca_ids.txt: ",
        overlaps,
    )
    print(
        "No. of COMPLETE and not overlapping with benchmark in this dataset:",
        len(tb),
        "out of ",
        len(ds.df),
    )
    return tb