def test_normalize_molecule():
    """Check that a charge-separated sulfone is normalized to its neutral form."""
    charge_separated = Molecule.from_smiles("C[S+2]([O-])([O-])C")
    neutral_reference = Molecule.from_smiles("CS(=O)(=O)C")

    # Sanity check: the two representations must differ before normalization,
    # otherwise the test would pass trivially.
    matches_before, _ = Molecule.are_isomorphic(charge_separated, neutral_reference)
    assert not matches_before

    normalized = normalize_molecule(charge_separated)

    matches_after, _ = Molecule.are_isomorphic(normalized, neutral_reference)
    assert matches_after
def test_record_to_molecule(result, query_function, public_client):
    """Check that a queried record yields a molecule with matching geometry.

    The returned molecule must carry exactly one conformer whose coordinates
    agree with the QC molecule stored on the record, and must be isomorphic
    with the molecule described by the CMILES of ``result``. A second query is
    then issued with all HTTP requests mocked out.
    """
    reference_molecule = Molecule.from_mapped_smiles(result.cmiles)

    queried = query_function(public_client.address, [result])
    assert len(queried) == 1

    qc_record, off_molecule = queried[0]
    assert off_molecule.n_conformers == 1

    # The QC molecule lives under a different accessor depending on record type.
    if isinstance(qc_record, ResultRecord):
        reference_qc_molecule = qc_record.get_molecule()
    elif isinstance(qc_record, OptimizationRecord):
        reference_qc_molecule = qc_record.get_final_molecule()
    else:
        raise RuntimeError()

    actual_coordinates = off_molecule.conformers[0].value_in_unit(unit.bohr)
    reference_coordinates = reference_qc_molecule.geometry.reshape(
        (off_molecule.n_atoms, 3)
    )
    assert numpy.allclose(actual_coordinates, reference_coordinates)

    is_same_graph, _ = Molecule.are_isomorphic(off_molecule, reference_molecule)
    assert is_same_graph

    # The request mocker would raise an exception if the client tries to reach
    # out to the server, so this repeat query must not touch the network.
    with requests_mock.Mocker():
        query_function(public_client.address, [result])
def map_indexed_smiles(smiles_a: str, smiles_b: str) -> Dict[int, int]:
    """Creates a map between the indices of atoms in one indexed SMILES pattern and
    the indices of atoms in another indexed SMILES pattern.

    Args:
        smiles_a: The first indexed SMILES pattern.
        smiles_b: The second indexed SMILES pattern.

    Returns:
        A dictionary where each key is the index of an atom in ``smiles_a`` and the
        corresponding value the index of the corresponding atom in ``smiles_b``.

    Raises:
        ValueError: If the two patterns do not describe isomorphic molecules, in
            which case no atom map exists.

    Examples:

        >>> map_indexed_smiles("[Cl:1][H:2]", "[Cl:2][H:1]")
        {0: 1, 1: 0}
    """
    from openff.toolkit.topology import Molecule

    molecule_a: Molecule = Molecule.from_mapped_smiles(smiles_a)
    molecule_b: Molecule = Molecule.from_mapped_smiles(smiles_b)

    are_isomorphic, index_map = Molecule.are_isomorphic(
        molecule_a, molecule_b, return_atom_map=True
    )

    # Without this check a mismatch would leak ``None`` to the caller in place
    # of the promised dictionary.
    if not are_isomorphic:
        raise ValueError(
            f"The SMILES patterns {smiles_a!r} and {smiles_b!r} do not describe "
            f"isomorphic molecules, so no atom index map exists."
        )

    return index_map
def test_attributes_to_openff_molecule():
    """Round trip a molecule to and from its attributes."""
    original: Molecule = Molecule.from_smiles("CC")

    attributes = MoleculeAttributes.from_openff_molecule(molecule=original)
    round_tripped = attributes.to_openff_molecule()

    isomorphic, atom_map = Molecule.are_isomorphic(
        original, round_tripped, return_atom_map=True
    )
    assert isomorphic is True

    # An identity map means the round trip preserved the atom ordering.
    identity_map = {index: index for index in range(original.n_atoms)}
    assert atom_map == identity_map
def test_generate_conformers_ordering():
    """Check that conformer generation keeps the original atom ordering."""
    original_molecule = Molecule.from_smiles("CCCC")

    returned_molecule = chemi._generate_conformers(original_molecule, max_confs=1)
    assert returned_molecule.n_conformers == 1

    are_isomorphic, atom_map = Molecule.are_isomorphic(
        original_molecule, returned_molecule, return_atom_map=True
    )

    # Check the flag explicitly so a graph mismatch is reported as such rather
    # than surfacing as an attribute error on a missing atom map.
    assert are_isomorphic

    # Make sure the atom ordering did not change.
    assert all(i == j for i, j in atom_map.items())
def test_xxx_from_networkx(from_function):
    """Round trip a molecule through a networkx graph and check the atom order."""
    reference = Molecule.from_mapped_smiles("[C:1]([O-:2])(=[O:3])([H:4])")

    graph = openff_molecule_to_networkx(reference)
    rebuilt = from_function(graph)

    # Only results that are OpenFF molecules are compared; any other return
    # type is left unchecked by this test.
    if not isinstance(rebuilt, Molecule):
        return

    are_isomorphic, atom_map = Molecule.are_isomorphic(
        reference, rebuilt, return_atom_map=True
    )

    assert are_isomorphic
    assert atom_map == {index: index for index in range(4)}
def test_cached_query_torsion_drive_results(public_client):
    """Check the torsion drive query result and that it populates the grid cache.

    The grid id cache must start empty, hold 24 entries after the first query,
    and allow a second identical query to run with HTTP requests mocked out.
    """
    assert len(_grid_id_cache) == 0

    result = TorsionDriveResult(
        record_id=ObjectId("36633243"),
        cmiles="[H:6][N:5]([H:7])[C:3](=[O:4])[C:1]#[N:2]",
        inchi_key="",
    )
    reference_molecule = Molecule.from_mapped_smiles(result.cmiles)

    queried = cached_query_torsion_drive_results(public_client.address, [result])
    assert len(queried) == 1

    td_record, off_molecule = queried[0]

    assert off_molecule.n_conformers == 24
    assert "grid_ids" in off_molecule.properties

    # Key the reference geometries by the JSON-encoded grid id so they can be
    # looked up with the ids stored on the molecule.
    reference_by_grid_id = {}
    for grid_id, qc_molecule in td_record.get_final_molecules().items():
        reference_by_grid_id[json.dumps(grid_id)] = qc_molecule

    stored_grid_ids = off_molecule.properties["grid_ids"]
    coordinate_shape = (off_molecule.n_atoms, 3)

    for grid_id, conformer in zip(stored_grid_ids, off_molecule.conformers):
        reference_geometry = reference_by_grid_id[grid_id].geometry
        assert numpy.allclose(
            conformer.value_in_unit(unit.bohr),
            reference_geometry.reshape(coordinate_shape),
        )

    assert len(off_molecule.properties["grid_ids"]) == 24

    is_same_graph, _ = Molecule.are_isomorphic(off_molecule, reference_molecule)
    assert is_same_graph

    assert len(_grid_id_cache) == 24

    # The request mocker would raise an exception if the client tries to reach
    # out to the server.
    with requests_mock.Mocker():
        cached_query_torsion_drive_results(public_client.address, [result])
def _generate_conformers(
    molecule: Molecule, max_confs: int = 800, rms_threshold: float = 1.0
) -> Molecule:
    """Generate conformers for a copy of the supplied molecule.

    The input molecule is deep-copied, so it is left untouched. Conformers are
    generated on a canonically ordered version of the copy and the result is
    remapped back to the atom ordering of the input.

    Parameters
    ----------
    molecule
        Molecule for which to generate conformers.
    max_confs
        Maximum number of conformers to request.
    rms_threshold
        RMS cutoff, in Angstrom, passed to the conformer generator for
        pruning redundant conformers.

    Returns
    -------
        A new molecule, in the input atom order, holding the generated
        conformers. Any ``atom_map`` property on the input is carried over.
    """
    from simtk import unit

    working_copy = copy.deepcopy(molecule)

    # Keep the atom map aside in case a toolkit removes or mangles it.
    stashed_atom_map = working_copy.properties.pop("atom_map", None)

    # Generating on the canonical ordering helps give the same conformers for
    # the same molecule regardless of the atom order it was supplied in.
    canonical = working_copy.canonical_order_atoms()
    canonical.generate_conformers(
        n_conformers=max_confs, rms_cutoff=rms_threshold * unit.angstrom
    )

    # Map canonical indices back onto the caller's atom ordering.
    canonical_to_input = Molecule.are_isomorphic(
        canonical, working_copy, return_atom_map=True
    )[1]
    reordered = canonical.remap(canonical_to_input)

    if stashed_atom_map is not None:
        reordered.properties["atom_map"] = stashed_atom_map

    return reordered
def normalize_molecule(molecule: "Molecule", check_output: bool = True) -> "Molecule":
    """Applies a set of reaction SMARTS in sequence to an input molecule in order to
    attempt to 'normalize' its structure.

    This involves, for example, converting ``-N(=O)=O`` groups to ``-N(=O)[O-]`` and
    ``-[S+2]([O-])([O-])-`` to ``-S(=O)=O-``.

    See ``nagl/data/normalizations.json`` for a full list of transforms.

    Args:
        molecule: The molecule to normalize.
        check_output: Whether to make sure the normalized molecule is isomorphic
            with the input molecule, ignoring aromaticity, bond order, formal charge,
            and stereochemistry.

    Returns:
        The normalized molecule.

    Raises:
        AssertionError: If ``check_output`` is true and the normalized molecule is
            not isomorphic with the input under the relaxed matching above.
    """
    from openff.toolkit.topology import Molecule

    reaction_smarts_path = data.get_file_path("normalizations.json")

    with open(reaction_smarts_path) as file:
        reaction_smarts = [entry["smarts"] for entry in json.load(file)]

    # NOTE: only the RDKit backend is implemented; an OpenEye backend would be
    # selected here once it exists.
    normal_molecule = _rd_normalize_molecule(molecule, reaction_smarts)

    if check_output:
        is_same_graph, _ = Molecule.are_isomorphic(
            molecule,
            normal_molecule,
            aromatic_matching=False,
            formal_charge_matching=False,
            bond_order_matching=False,
            atom_stereochemistry_matching=False,
            bond_stereochemistry_matching=False,
        )

        # Raised explicitly rather than via ``assert`` so that the check still
        # runs when Python is started with ``-O``.
        if not is_same_graph:
            raise AssertionError(
                "normalization changed the molecule - this should not happen"
            )

    return normal_molecule
def test_select_lowest_energy_forms():
    """Check that only the first of two input forms is kept as lowest energy."""
    mapped_smiles = (
        "[N:1]([H:2])([H:3])[C:4](=[O:5])[H:6]",
        "[N+:1]([H:2])([H:3])=[C:4]([O-:5])[H:6]",
    )
    input_molecules = [Molecule.from_mapped_smiles(pattern) for pattern in mapped_smiles]

    # Graphs are keyed by the byte-encoded position of the molecule in the list.
    graphs_by_key = {}
    for position, molecule in enumerate(input_molecules):
        graphs_by_key[str(position).encode()] = openff_molecule_to_networkx(molecule)

    lowest_energy_forms = _select_lowest_energy_forms(graphs_by_key)
    assert len(lowest_energy_forms) == 1

    lowest_energy_form = openff_molecule_from_networkx(lowest_energy_forms[b"0"])

    matches_first_input, _ = Molecule.are_isomorphic(
        lowest_energy_form, input_molecules[0]
    )
    assert matches_first_input

    print(lowest_energy_forms)
def test_extract_fragment(smiles, atoms, bonds, expected, extract_method):
    """Check that extracting a fragment gives the expected molecule.

    The comparison ignores bond stereochemistry. The test is skipped when the
    extraction method reports a missing module.
    """
    parent = Molecule.from_mapped_smiles(smiles)

    # Give every atom a one-based map index equal to its position plus one.
    one_based_map = {}
    for atom_index in range(parent.n_atoms):
        one_based_map[atom_index] = atom_index + 1
    parent.properties["atom_map"] = one_based_map

    try:
        fragment = extract_method(parent, atoms, bonds)
    except ModuleNotFoundError as error:
        # ``pytest.skip`` raises, so execution never continues past this point
        # without a fragment.
        pytest.skip(str(error))

    expected_fragment = Molecule.from_smiles(expected)

    fragments_match, _ = Molecule.are_isomorphic(
        fragment, expected_fragment, bond_stereochemistry_matching=False
    )
    assert fragments_match
def _format_mol_source(mol):
    """Return the ``file:index`` label recording where ``mol`` was loaded from."""
    return f'{mol.properties["original_file"]}:{mol.properties["original_file_index"]}'


def validate_and_assign(loaded_mols,
                        group_name,
                        add,
                        existing_output_mols,
                        name_assignments=None):
    """Validate input molecules, group them by SMILES, and assign output names.

    Each input molecule is written to and re-read from a temporary SDF file to
    catch errors. Molecules whose SMILES is already among the existing outputs
    are rejected. The remaining molecules are grouped by SMILES, with later
    members of a group added as extra conformers unless they duplicate an
    existing conformer. Finally one output molecule is produced per conformer
    and named ``{group_name}-{molecule_index:05d}-{conformer_index:02d}``.

    Logging is configured to write to ``log.txt`` as a side effect.

    Parameters
    ----------
    loaded_mols
        Iterable of molecules to validate. Each must have ``original_file``,
        ``original_file_index`` and ``original_name`` entries in its
        ``properties`` and at least one conformer.
    group_name
        Prefix for the assigned names; also stored in the ``group_name``
        property of each output molecule.
    add
        Not used by this function.
    existing_output_mols
        Molecules already present in the output. Their ``molecule_index``
        properties determine the first index assigned to new molecules.
    name_assignments
        Optional list that is extended in place with one
        ``(original_name, original_file, original_file_index, new_name)``
        tuple per output molecule. A new list is created when ``None``.

    Returns
    -------
    success_mols
        Single-conformer output molecules, one per accepted conformer.
    error_mols
        ``(source_label, molecule, error_or_message)`` tuples for rejected
        inputs.
    name_assignments
        The list described above.
    """
    if name_assignments is None:
        name_assignments = []

    logging.basicConfig(filename='log.txt')

    # Properties that survive validation; everything else is stripped.
    allowed_properties = [
        'original_file', 'original_file_index', 'original_name'
    ]
    rms_cutoff = 0.2  # angstrom

    smiles_to_success_mol = {}
    error_mols = []
    existing_smiles_to_mol = {
        mol.to_smiles(): mol
        for mol in existing_output_mols
    }

    # Handle 3d molecules
    print("Validating input molecules and grouping by connection table")
    logging.info("Validating input molecules and grouping by connection table")
    for mol in tqdm(loaded_mols):
        # Simulate a SDF file roundtrip to check for errors such as undefined
        # stereochemistry
        try:
            with tempfile.NamedTemporaryFile(suffix='.sdf') as of:
                mol.to_file(of.name, file_format='sdf')
                of.seek(0)
                test_loaded_mol = Molecule.from_file(of.name,
                                                     file_format='sdf')
                test_loaded_mol.to_rdkit()
        except Exception as e:
            error_mols.append((_format_mol_source(mol), mol, e))
            continue

        # See whether this graph is already in the existing outputs
        smiles = mol.to_smiles()
        if smiles in existing_smiles_to_mol:
            msg = 'Input molecule graph is already present in output.\n'
            msg += f'{mol.name} from {_format_mol_source(mol)} '
            msg += 'has an equivalent connection table to existing output '
            msg += f'{existing_smiles_to_mol[smiles]}'
            logging.debug(msg)
            error_mols.append((_format_mol_source(mol), mol, msg))
            continue

        # If we've reached here, then the molecule is validated

        # Pop off now-nonessential metadata
        for key in list(mol.properties.keys()):
            if key not in allowed_properties:
                mol.properties.pop(key)
        mol.partial_charges = None

        # If this graph molecule IS already known, add this 3d information as
        # a conformer
        if smiles in smiles_to_success_mol:
            try:
                orig_mol = smiles_to_success_mol[smiles]
                _, atom_map = Molecule.are_isomorphic(
                    mol,
                    orig_mol,
                    return_atom_map=True,
                    formal_charge_matching=False,
                    aromatic_matching=False,
                )
                reordered_mol = mol.remap(atom_map)

                # Make a temporary copy of the parent mol for conformer
                # alignment and deduplication
                temp_mol = copy.deepcopy(orig_mol)
                temp_mol.add_conformer(reordered_mol.conformers[0])
                temp_mol, _rmslist = align_offmol_conformers(temp_mol)

                # Don't trust rmslist above for deduplication -- It doesn't
                # take into account multiple atom mappings
                confs_to_delete = greedy_conf_deduplication(
                    temp_mol, rms_cutoff)
                if len(confs_to_delete) > 0:
                    msg = 'Duplicate molecule conformer input detected.\n'
                    msg += f'{_format_mol_source(mol)} has an RMSD within {rms_cutoff} A '
                    msg += 'to a conformer of the molecule originally loaded from the following file(s) and indices: '
                    msg += f'{orig_mol.properties["original_file"]}:{orig_mol.properties["original_file_index"]}'
                    logging.debug(msg)
                    # NOTE(review): temp_mol is discarded by the ``continue``
                    # below, so this assignment looks redundant -- confirm
                    # before removing.
                    temp_mol._conformers = [temp_mol.conformers[-1]]
                    error_mols.append((_format_mol_source(mol), mol, msg))
                    continue

                # Record where the new conformer came from alongside the
                # metadata of the conformers already stored.
                for key in allowed_properties:
                    temp_mol.properties[key].append(mol.properties[key])
                smiles_to_success_mol[smiles] = temp_mol

            # This is a catch-all for any unexpected processing errors that
            # are encountered above
            except Exception as e:
                error_mols.append((_format_mol_source(mol), mol, e))

        # If this graph molecule ISN'T already known, then add
        # this representation as a new molecule
        else:
            # Change the metadata into lists so that we can record it for
            # each conformer
            for key in allowed_properties:
                mol.properties[key] = [mol.properties[key]]
            smiles_to_success_mol[smiles] = mol

    # Assign names and write out files
    # Preserve a mapping of input filename/mol index to output name
    success_mols = []
    print("Assigning IDs and preparing molecules for output")
    logging.info("Assigning IDs and preparing molecules for output")

    # Continue numbering after the highest previously-existing mol index, or
    # start from zero when there are no existing outputs.
    output_mol_index_start_value = max(
        (int(mol.properties['molecule_index'])
         for mol in existing_output_mols),
        default=-1,
    ) + 1

    for success_mol_index, success_mol in enumerate(
            tqdm(smiles_to_success_mol.values())):
        unique_mol_index = success_mol_index + output_mol_index_start_value
        mol_name = f'{group_name}-{unique_mol_index:05d}'
        success_mol.properties['group_name'] = group_name
        success_mol.properties['molecule_index'] = unique_mol_index
        success_mol.name = mol_name
        mol_copy = copy.deepcopy(success_mol)

        # Write conformers
        for conf_index, conformer in enumerate(success_mol.conformers):
            mol_copy2 = copy.deepcopy(mol_copy)
            mol_copy2.name = f'{mol_copy.name}-{conf_index:02d}'

            # The metadata lists are parallel to the conformer list.
            orig_file = success_mol.properties['original_file'][conf_index]
            orig_file_index = success_mol.properties['original_file_index'][
                conf_index]
            orig_name = success_mol.properties['original_name'][conf_index]

            msg = f'Molecule with name {orig_name} from '
            msg += f'file:position {orig_file}:{orig_file_index}'
            msg += ' has passed validation '
            msg += f'and is being renamed to {mol_copy2.name}.'
            logging.debug(msg)
            name_assignments.append(
                (orig_name, orig_file, orig_file_index, mol_copy2.name))

            # Replace the copied conformers with just this one.
            mol_copy2._conformers = None
            mol_copy2.add_conformer(conformer)
            mol_copy2.properties['conformer_index'] = conf_index

            # Sanitize last remaining metadata
            for key in allowed_properties:
                mol_copy2.properties.pop(key)
            success_mols.append(mol_copy2)

    return success_mols, error_mols, name_assignments
def loadDataset_low(
    datasetName, specification, benchmark_smiles, qca_overlapped_entries
):
    """
    Low level call to load a torsion drive dataset and compute torsion barriers
    for its COMPLETE records that do not overlap with a benchmark set.

    Records whose molecule matches a benchmark molecule (ignoring aromaticity,
    formal charge, bond order and stereochemistry) are skipped, and their ids
    are appended to ``qca_overlapped_entries``.

    Parameters
    ----------
    datasetName : str
        torsion drive dataset name.
    specification : str
        specification in the dataset. Example: "B3LYP-D3", "default", "UFF"
    benchmark_smiles : str
        path to a text file with one benchmark SMILES per line.
    qca_overlapped_entries : str
        path to a text file, opened in append mode, that receives one
        overlapping record id per line.

    Returns
    -------
    tb : list
        one ``torsion_barrier_for_molecule`` result per COMPLETE record that
        does not overlap with the benchmark set.

    Raises
    ------
    AssertionError
        if ``datasetName`` or ``specification`` is empty.
    """
    # Raised explicitly rather than via ``assert`` so that the validation still
    # runs when Python is started with ``-O``.
    if not datasetName:
        print("datasetName is empty. Check input list of dataset tuples")
        raise AssertionError("datasetName is empty")
    if not specification:
        print("specification is empty. Check input list of dataset tuples")
        raise AssertionError("specification is empty")

    # initiate qc portal instance
    client = ptl.FractalClient()
    # from the TorsionDriveDataset collection picking up given datasetName
    ds = client.get_collection("TorsionDriveDataset", datasetName)
    ds.status([specification], status="COMPLETE")

    # Serial implementation

    # Hardcoding benchmark molecules from the lim_mobley_parsely_benchmark
    # https://openforcefield.org/force-fields/force-fields/
    # https://github.com/MobleyLab/benchmarkff/blob/91476147f35579bc52bf984839fd20c72a61d76d/molecules/set_v03_non_redundant/trim3_full_qcarchive.smi
    with open(benchmark_smiles) as f:
        # Drop line terminators and skip blank lines before parsing.
        bm_smiles = [line.strip() for line in f if line.strip()]
    bm_mols = [Molecule.from_smiles(smiles) for smiles in bm_smiles]

    tb = []
    qca_entries = []

    # Iterate over rows: the first column holds the records and the index
    # holds the entry names. Counting with ``df.size`` would overrun the rows
    # whenever the frame has more than one column.
    for smiles, record in zip(ds.df.index, ds.df.iloc[:, 0]):
        if record.status != "COMPLETE":
            continue

        entry = ds.get_entry(smiles)
        mapped_smiles = entry.attributes[
            "canonical_isomeric_explicit_hydrogen_mapped_smiles"
        ]
        mol1 = Molecule.from_mapped_smiles(mapped_smiles)

        overlaps_benchmark = any(
            Molecule.are_isomorphic(
                mol1,
                mol,
                return_atom_map=False,
                aromatic_matching=False,
                formal_charge_matching=False,
                bond_order_matching=False,
                atom_stereochemistry_matching=False,
                bond_stereochemistry_matching=False,
            )[0]
            for mol in bm_mols
        )

        if overlaps_benchmark:
            # NOTE(review): the key is hardcoded to "default" rather than
            # ``specification`` -- confirm this is intended.
            qca_entries.append(entry.object_map["default"])
        else:
            tb.append(torsion_barrier_for_molecule(record, mapped_smiles))

    overlaps = len(qca_entries)

    # overlaps_qca_ids.txt is also a hardcoded file
    with open(qca_overlapped_entries, "a") as f:
        f.writelines(f"{item}\n" for item in qca_entries)

    print(
        "No. of overlaps with benchmark set, qca entries added to overlaps_qca_ids.txt: ",
        overlaps,
    )
    print(
        "No. of COMPLETE and not overlapping with benchmark in this dataset:",
        len(tb),
        "out of ",
        len(ds.df),
    )
    return tb