def test_diff_smiles():
    """Two different SMILES spellings of one molecule give the same identifiers."""
    first_smiles, second_smiles = (
        'C[C@H](c1c(ccc(c1Cl)F)Cl)OC',
        'CO[C@H](C)c1c(Cl)ccc(F)c1Cl',
    )
    compared_keys = (
        'canonical_smiles',
        'canonical_isomeric_smiles',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles',
    )
    # First pass leaves the toolkit at its default, second pass forces rdkit.
    for toolkit_kwargs in ({}, {'toolkit': 'rdkit'}):
        ids_first = cmiles.get_molecule_ids(
            first_smiles, strict=False, **toolkit_kwargs)
        ids_second = cmiles.get_molecule_ids(
            second_smiles, strict=False, **toolkit_kwargs)
        for key in compared_keys:
            assert ids_first[key] == ids_second[key]
def test_oe_cmiles():
    """Regression test: openeye identifiers for a fixed input SMILES."""
    smiles = 'CN(C)C/C=C/C(=O)Nc1cc2c(cc1O[C@@H]3CCOC3)ncnc2Nc4ccc(c(c4)Cl)F'
    expected_output = {
        'canonical_smiles':
        'CN(C)CC=CC(=O)Nc1cc2c(cc1OC3CCOC3)ncnc2Nc4ccc(c(c4)Cl)F',
        'canonical_isomeric_smiles':
        'CN(C)C/C=C/C(=O)Nc1cc2c(cc1O[C@@H]3CCOC3)ncnc2Nc4ccc(c(c4)Cl)F',
        'canonical_isomeric_explicit_hydrogen_smiles':
        '[H]c1c(c(c(c(c1N([H])c2c3c(c(c(c(c3nc(n2)[H])[H])O[C@@]4(C(C(OC4([H])[H])([H])[H])([H])[H])[H])N([H])C(=O)/C(=C(\\[H])/C([H])([H])N(C([H])([H])[H])C([H])([H])[H])/[H])[H])[H])Cl)F)[H]',
        'canonical_explicit_hydrogen_smiles':
        '[H]c1c(c(c(c(c1N([H])c2c3c(c(c(c(c3nc(n2)[H])[H])OC4(C(C(OC4([H])[H])([H])[H])([H])[H])[H])N([H])C(=O)C(=C([H])C([H])([H])N(C([H])([H])[H])C([H])([H])[H])[H])[H])[H])Cl)F)[H]',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles':
        '[H:35][c:1]1[c:2]([c:12]([c:13]([c:5]([c:9]1[N:27]([H:58])[c:14]2[c:7]3[c:3]([c:10]([c:11]([c:4]([c:8]3[n:25][c:6]([n:26]2)[H:40])[H:38])[O:32][C@@:21]4([C:18]([C:19]([O:31][C:20]4([H:47])[H:48])([H:45])[H:46])([H:43])[H:44])[H:49])[N:28]([H:59])[C:17](=[O:30])/[C:15](=[C:16](\\[H:42])/[C:24]([H:56])([H:57])[N:29]([C:22]([H:50])([H:51])[H:52])[C:23]([H:53])([H:54])[H:55])/[H:41])[H:37])[H:39])[Cl:34])[F:33])[H:36]',
        # Recorded for reference only; provenance is not asserted below.
        'provenance':
        'cmiles_0.0.0+1.geb7d850.dirty_openeye_2018.Feb.b6',
    }
    output = cmiles.get_molecule_ids(smiles, toolkit='openeye', strict=False)

    checked_keys = (
        'canonical_smiles',
        'canonical_isomeric_smiles',
        'canonical_isomeric_explicit_hydrogen_smiles',
        'canonical_explicit_hydrogen_smiles',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles',
    )
    for key in checked_keys:
        assert expected_output[key] == output[key]
def test_permute_xyz(toolkit):
    """Permuted geometry must match the input geometry once the atom map is applied."""
    hooh = {
        'symbols': ['H', 'O', 'O', 'H'],
        'geometry': [
            1.84719633, 1.47046223, 0.80987166,
            1.3126021, -0.13023157, -0.0513322,
            -1.31320906, 0.13130216, -0.05020593,
            -1.83756335, -1.48745318, 0.80161212,
        ],
        'name': 'HOOH',
        'connectivity': [[0, 1, 1], [1, 2, 1], [2, 3, 1]],
        'molecular_multiplicity': 1,
    }
    permuted_hooh = cmiles.get_molecule_ids(hooh, toolkit, permute_xyz=True)
    assert hooh['geometry'] != permuted_hooh['geometry']

    mol = cmiles.utils.mol_from_json(hooh, toolkit=toolkit)
    mapped_smiles = permuted_hooh['identifiers'][
        'canonical_isomeric_explicit_hydrogen_mapped_smiles']
    atom_map = cmiles.utils.get_atom_map(mol, mapped_smiles)

    # Both flat coordinate lists are viewed as (n_atoms, 3), sized from the input.
    n_atoms = len(hooh['geometry']) // 3
    json_geom = np.asarray(hooh['geometry']).reshape(n_atoms, 3)
    permuted_geom = np.asarray(permuted_hooh['geometry']).reshape(n_atoms, 3)

    for map_idx in atom_map:
        original_row = json_geom[atom_map[map_idx]]
        # Map indices are 1-based, rows of the permuted geometry are 0-based.
        permuted_row = permuted_geom[map_idx - 1]
        for axis in range(3):
            assert original_row[axis] == pytest.approx(
                permuted_row[axis], 0.0000001)
def frag_to_smile(fragment, isomeric=True):
    """Return a canonical SMILES for an OpenEye fragment, or None on failure.

    Parameters
    ----------
    fragment :
        Molecule accepted by ``oechem.OEMol(...)``; it is copied, not modified.
    isomeric : bool, optional
        If True (default) return the ``canonical_isomeric_smiles`` identifier,
        otherwise the ``canonical_smiles`` identifier.

    Returns
    -------
    str or None
        The requested identifier, or None when a stereoisomer with defined
        stereochemistry could not be generated or the cmiles identifiers
        could not be computed. Failures are reported with ``print``.
    """
    from openeye import oechem

    mol_copy = oechem.OEMol(fragment)
    try:
        explicit_h_smiles = cmiles.utils.mol_to_smiles(mol_copy, mapped=False)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed. The fallback enumerates one stereoisomer.
        non_isomeric = cmiles.utils.mol_to_smiles(mol_copy,
                                                  mapped=False,
                                                  isomeric=False)
        print(f'No stereochemistry defined:{non_isomeric}')
        print(
            '-> Will enumerate a random stereoisomer for substructure search.')
        from fragmenter.states import _enumerate_stereoisomers
        mol_copy = _enumerate_stereoisomers(mol_copy, max_states=1)[0]
        from cmiles._cmiles_oe import has_stereo_defined
        if not has_stereo_defined(mol_copy):
            print(f'{fragment} failed to gen stereo isomer!!')
            return None
        explicit_h_smiles = cmiles.utils.mol_to_smiles(mol_copy, mapped=False)

    try:
        cmiles_identifiers = cmiles.get_molecule_ids(explicit_h_smiles,
                                                     toolkit='openeye')
        # The key lookup stays inside the try so a missing identifier is
        # still reported and turned into None, as before.
        if isomeric:
            return cmiles_identifiers['canonical_isomeric_smiles']
        return cmiles_identifiers['canonical_smiles']
    except Exception:
        print(f'{fragment} failed to get inchi!!')
        return None
def test_rd_cmiles():
    """Regression test: rdkit identifiers for a fixed input SMILES."""
    smiles = 'CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1O[C@H]1CCOC1'
    expected_output = {
        'canonical_smiles':
        'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1',
        'canonical_isomeric_smiles':
        'CN(C)C/C=C/C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1O[C@H]1CCOC1',
        'canonical_isomeric_explicit_hydrogen_smiles':
        '[H]/[C]([C](=[O])[N]([H])[c]1[c]([O][C@]2([H])[C]([H])([H])[O][C]([H])([H])[C]2([H])[H])[c]([H])[c]2[n][c]([H])[n][c]([N]([H])[c]3[c]([H])[c]([H])[c]([F])[c]([Cl])[c]3[H])[c]2[c]1[H])=[C](/[H])[C]([H])([H])[N]([C]([H])([H])[H])[C]([H])([H])[H]',
        'canonical_explicit_hydrogen_smiles':
        '[H][C]([C](=[O])[N]([H])[c]1[c]([O][C]2([H])[C]([H])([H])[O][C]([H])([H])[C]2([H])[H])[c]([H])[c]2[n][c]([H])[n][c]([N]([H])[c]3[c]([H])[c]([H])[c]([F])[c]([Cl])[c]3[H])[c]2[c]1[H])=[C]([H])[C]([H])([H])[N]([C]([H])([H])[H])[C]([H])([H])[H]',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles':
        '[O:1]=[C:8](/[C:9](=[C:10](/[C:30]([N:27]([C:28]([H:45])([H:46])[H:47])[C:29]([H:48])([H:49])[H:50])([H:51])[H:52])[H:36])[H:35])[N:25]([c:18]1[c:17]([O:7][C@:34]2([H:59])[C:32]([H:55])([H:56])[O:6][C:31]([H:53])([H:54])[C:33]2([H:57])[H:58])[c:20]([H:41])[c:23]2[n:5][c:11]([H:37])[n:4][c:21]([N:26]([c:19]3[c:13]([H:39])[c:12]([H:38])[c:15]([F:2])[c:16]([Cl:3])[c:14]3[H:40])[H:44])[c:24]2[c:22]1[H:42])[H:43]',
        # Recorded for reference only; provenance is not asserted below.
        'provenance':
        'cmiles_0.0.0+7.gc71f3a6.dirty_rdkit_2018.03.3',
    }
    output = cmiles.get_molecule_ids(smiles, toolkit='rdkit', strict=False)

    checked_keys = (
        'canonical_smiles',
        'canonical_isomeric_smiles',
        'canonical_isomeric_explicit_hydrogen_smiles',
        'canonical_explicit_hydrogen_smiles',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles',
    )
    for key in checked_keys:
        assert expected_output[key] == output[key]
def test_initial_iso():
    """Enantiomers share a canonical SMILES but differ in isomeric SMILES."""
    input_smiles = ["CC[C@@H](C)N", "CC[C@H](C)N"]
    # First pass leaves the toolkit at its default, second pass forces rdkit.
    for toolkit_kwargs in ({}, {'toolkit': 'rdkit'}):
        ids_first = cmiles.get_molecule_ids(
            input_smiles[0], strict=False, **toolkit_kwargs)
        ids_last = cmiles.get_molecule_ids(
            input_smiles[-1], strict=False, **toolkit_kwargs)
        assert ids_first['canonical_smiles'] == ids_last['canonical_smiles']
        assert (ids_first['canonical_isomeric_smiles'] !=
                ids_last['canonical_isomeric_smiles'])
def test_input_mapped():
    """Atom maps on the input SMILES appear only in the mapped identifier.

    The check is run once with the default toolkit and once with rdkit.
    """
    smiles = '[H:3][C:1]([H:4])([H:5])[C:2]([H:6])([H:7])[H:8]'

    # Default toolkit.
    mol_id = cmiles.get_molecule_ids(smiles)
    mol_1 = cmiles.utils.load_molecule(mol_id['canonical_isomeric_smiles'])
    mol_2 = cmiles.utils.load_molecule(
        mol_id['canonical_isomeric_explicit_hydrogen_mapped_smiles'])
    # Assert truthiness directly instead of comparing with == True/False.
    assert not cmiles.utils.has_atom_map(mol_1)
    assert cmiles.utils.has_atom_map(mol_2)

    # rdkit.
    mol_id = cmiles.get_molecule_ids(smiles, toolkit='rdkit', strict=False)
    mol_1 = cmiles.utils.load_molecule(mol_id['canonical_isomeric_smiles'],
                                       backend='rdkit')
    mol_2 = cmiles.utils.load_molecule(
        mol_id['canonical_isomeric_explicit_hydrogen_mapped_smiles'],
        backend='rdkit')
    assert not cmiles.utils.has_atom_map(mol_1)
    assert cmiles.utils.has_atom_map(mol_2)
def parse_input(input_file, output_json='optimization_inputs.json'):
    """Build optimization inputs from a molecule file and save them as JSON.

    Each molecule read from ``input_file`` is expanded into stereoisomer
    states. States are skipped when cmiles cannot produce identifiers, when
    their canonical SMILES was already processed, or when conformer
    generation raises ``RuntimeError``.

    Parameters
    ----------
    input_file : str
        Path passed to ``fragmenter.chemi.file_to_oemols``.
    output_json : str, optional
        Path of the JSON file to write.

    Returns
    -------
    list of dict
        One entry per kept state with keys ``'initial_molecules'`` (QCSchema
        molecules, one per conformer) and ``'cmiles_identifiers'``.
    """
    # Read input smi file and generate oemols
    oemols = fragmenter.chemi.file_to_oemols(input_file)

    optimization_input = []
    # A set gives O(1) duplicate checks; the collection is never returned.
    processed_canonical_smiles = set()

    # Expand states.
    for mol in oemols:
        # Filter out single atom molecules.
        if mol.GetMaxAtomIdx() == 1:
            continue

        # Expand stereoisomers (tautomer expansion is switched off).
        states = fragmenter.states.enumerate_states(mol,
                                                    stereoisomers=True,
                                                    tautomers=False)
        for s in states:
            # Screen out states having valence that rdkit does not accept.
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # still propagate.
            try:
                cmiles_ids = cmiles.get_molecule_ids(s)
            except Exception:
                continue

            canonical_smiles = cmiles_ids['canonical_smiles']
            if canonical_smiles in processed_canonical_smiles:
                continue
            processed_canonical_smiles.add(canonical_smiles)

            mapped_smiles = cmiles_ids[
                'canonical_isomeric_explicit_hydrogen_mapped_smiles']
            m = cmiles.utils.load_molecule(s)

            # Screen out states Omega fails to process.
            try:
                conformers = fragmenter.chemi.generate_conformers(m)
            except RuntimeError:
                continue

            qcschema_molecules = [
                cmiles.utils.mol_to_map_ordered_qcschema(conf, mapped_smiles)
                for conf in conformers.GetConfs()
            ]
            optimization_input.append({
                'initial_molecules': qcschema_molecules,
                'cmiles_identifiers': cmiles_ids
            })

    # Save curated molecule set into json file.
    with open(output_json, 'w') as f:
        json.dump(optimization_input, f, indent=2, sort_keys=True)

    return optimization_input
def test_atom_map(smiles):
    """Map-ordered geometry must be identical whichever SMILES built the molecule."""
    import cmiles

    mapped_smiles = '[H:5][C:1]([H:6])([C:2]([H:7])([H:8])[O:4][H:10])[O:3][H:9]'
    # Results are unused; these calls only have to succeed.
    mol_id_oe = cmiles.get_molecule_ids(mapped_smiles, toolkit='openeye')
    oemol = utils.load_molecule(mapped_smiles, toolkit='openeye')

    mapped_symbols = ['C', 'C', 'O', 'O', 'H', 'H', 'H', 'H', 'H', 'H']
    mapped_geometry = [
        -1.6887193912042044, 0.8515190939276903, 0.8344587822904272,
        -4.05544806361675, -0.3658269566455062, -0.22848169646448416,
        -1.6111611950422127, 0.4463128276938808, 3.490617694146934,
        -3.97756355964586, -3.0080934853087373, 0.25948499322223956,
        -1.6821252026076652, 2.891135395246369, 0.4936556190978574,
        0.0, 0.0, 0.0,
        -4.180315034973438, -0.09210893239246959, -2.2748227320305525,
        -5.740516456782416, 0.4115539217904015, 0.6823267491485907,
        -0.07872657410528058, 1.2476492272884379, 4.101615944163073,
        -5.514569080545831, -3.7195945404657222, -0.4441653010509862,
    ]

    mol = cmiles.utils.load_molecule(smiles, toolkit='openeye')
    if not utils.has_explicit_hydrogen(mol):
        mol = utils.add_explicit_hydrogen(mol)
    atom_map = utils.get_atom_map(mol, mapped_smiles=mapped_smiles)

    # Reorder the mapped geometry into the molecule's own atom order.
    mapped_coords = np.array(mapped_geometry, dtype=float).reshape(
        len(mapped_geometry) // 3, 3)
    coords = np.zeros(mapped_coords.shape)
    for map_idx in atom_map:
        # Map indices are 1-based; rows of mapped_coords are 0-based.
        coords[atom_map[map_idx]] = mapped_coords[map_idx - 1]

    # Flatten, convert to Angstroms, and set the coordinates on the molecule.
    mol.SetCoords(coords.flatten() * utils.BOHR_2_ANGSTROM)
    mol.SetDimension(3)

    # Get a new atom map and read the geometry back in map order.
    atom_map = utils.get_atom_map(mol, mapped_smiles)
    symbols, geometry = _cmiles_oe.get_map_ordered_geometry(mol, atom_map)

    assert geometry == mapped_geometry
    assert symbols == mapped_symbols
async def root(molecule: QCMolecule = Body(...)):
    """Return the cmiles identifiers of the posted molecule, computed with rdkit."""
    # The molecule model is converted to a plain dict before it is handed to cmiles.
    return cmiles.get_molecule_ids(molecule.dict(), toolkit="rdkit")
def send_qm_job(ptl_mol, nitrogen, nitrogen_i, mol):
    """Build the job specification for a dihedral + valence-angle scan.

    Despite its name, this function does not submit anything itself. It
    registers the keyword set with the module-level ``client`` and returns
    the job dictionary for the caller to submit.

    Parameters
    ----------
    ptl_mol :
        Object exposing ``connectivity``, ``geometry`` and ``symbols``.
    nitrogen :
        Atom whose bonded atoms are listed via ``GetAtoms()``.
    nitrogen_i : int
        Index of ``nitrogen`` in the molecule.
    mol :
        Molecule passed to ``cmiles.utils.mol_to_smiles``.

    Returns
    -------
    tuple or None
        ``(jsonDict, smiles)`` on success. None when the nitrogen has fewer
        than two neighbours or when building the specification fails; the
        reason is printed in both cases.
    """
    # List the neighbours once; the same list serves both scan definitions.
    neighbors = list(nitrogen.GetAtoms())
    indices = [nitrogen_i] + [nbor.GetIdx() for nbor in neighbors]

    print(neighbors)
    for atom in neighbors:
        print(atom.GetAtomicNum())

    # Heaviest neighbours first: the two highest atomic numbers define the
    # valence angle used for the restrained optimization.
    neighbors.sort(key=lambda x: x.GetAtomicNum(), reverse=True)
    print(neighbors)
    for atom in neighbors:
        print(atom.GetIdx())

    # The original swallowed the IndexError here and then failed later with
    # a hidden NameError; make the condition explicit instead.
    if len(neighbors) < 2:
        print("send_qm_job: fewer than two neighbours, cannot define the "
              "valence angle scan")
        return None
    valence_idx = [neighbors[0].GetIdx(), nitrogen_i, neighbors[1].GetIdx()]

    keywords = ptl.models.KeywordSet(
        values={"scf_properties": ["wiberg_lowdin_indices"]})
    try:
        keywords_id = str(client.add_keywords([keywords])[0])
        smiles = cmiles.utils.mol_to_smiles(mol,
                                            mapped=False,
                                            explicit_hydrogen=False)
        mol_id = cmiles.get_molecule_ids(smiles,
                                         toolkit='openeye',
                                         strict=False)
        connectivity = np.array(ptl_mol.connectivity).tolist()
        geometry = np.array([[ptl_mol.geometry]]).ravel().tolist()
        symbols = np.array([[ptl_mol.symbols]]).ravel().tolist()
        jsonDict = {
            "cmiles_ids": mol_id,
            "keywords": {
                "preoptimization": True,
                "scans": [{
                    "type": "dihedral",
                    "indices": list(indices),
                    # -52 .. 52 in steps of 4
                    "steps": list(range(-52, 53, 4)),
                    "step_type": "absolute"
                }, {
                    "type": "angle",
                    "indices": list(valence_idx),
                    # 100 .. 140 in steps of 5
                    "steps": list(range(100, 141, 5)),
                    "step_type": "absolute"
                }]
            },
            "optimization_spec": {
                "program": "geometric",
                "keywords": {
                    "coordsys": "tric",
                }
            },
            "qc_spec": {
                "driver": "gradient",
                "method": "mp2",
                "basis": "def2-SV(P)",
                "keywords": keywords_id,
                "program": "psi4",
            },
            "initial_molecule": {
                "geometry": geometry,
                "symbols": symbols,
                "connectivity": connectivity
            }
        }
        return jsonDict, smiles
    except Exception as err:
        # Still best-effort (returns None), but the failure is now visible
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"send_qm_job failed: {err!r}")
        return None
# Write out SDF file of all conformations ofs = oechem.oemolostream('optimization_inputs.sdf') for mol in oemols: # Filter out single atom molecules if mol.GetMaxAtomIdx() == 1: skipped.append(cmiles.utils.mol_to_smiles(mol, mapped=False)) continue # Expand protonation states and stereoisomers states = fragment.expand_states(mol, stereoisomers=True, protonation=False, tautomers=False) for s in states: # Some states have valences that rdkit does not accept. try: cmiles_ids = cmiles.get_molecule_ids(s) except: cmiles_failures.append(s) continue # Drop duplicates canonical_smiles = cmiles_ids['canonical_smiles'] if canonical_smiles in processed_canonical_smiles: logging.info('Found duplicate canonical SMILES {}'.format(canonical_smiles)) duplicates.append(canonical_smiles) continue else: processed_canonical_smiles.append(canonical_smiles) # Generate molecule using mapped SMILES mapped_smiles = cmiles_ids['canonical_isomeric_explicit_hydrogen_mapped_smiles']
def makeJson(smiles):
    """Expand SMILES into states and write optimization inputs to disk.

    Takes a list of SMILES strings, expands the tautomeric and isomeric
    states of the molecules, generates conformers for each unique state, and
    writes these files in the current directory:

    - ``optimization_inputs.sdf``: all generated conformers
    - ``optimization_inputs.json.gz``: the optimization inputs
    - ``optimization_inputs.smi``, ``duplicates.smi``,
      ``omega_failures.smi``, ``cmiles_failures.smi``,
      ``skipped_ions.smi``: bookkeeping lists written with ``save_smiles``
    - ``finalCounts.txt``: summary counts

    Parameters
    ----------
    smiles : list of str
        SMILES strings to process.

    Returns
    -------
    list of str
        SMILES (via ``oechem.OEMolToSmiles``) of every molecule that was read
        from the input list.
    """
    with tempfile.NamedTemporaryFile('w+', suffix='.smi') as tmp:
        tmp.writelines(line + '\n' for line in smiles)
        # Flush so the reader, which opens the file by name, sees all lines.
        tmp.flush()
        tmp.seek(0)
        temp_name = tmp.name
        print(tmp.name)
        # Must happen while the temporary file still exists.
        oemols = fragmenter.chemi.file_to_oemols(temp_name)

    optimization_input = []
    processed_canonical_smiles = []
    # Mirror of processed_canonical_smiles for O(1) duplicate checks; the
    # list is kept because its order is written to disk.
    seen_canonical_smiles = set()
    skipped = []
    duplicates = []  # duplicate states
    omega_failures = []
    cmiles_failures = []
    optimization_count = 0

    # Write out SDF file of all conformations
    ofs = oechem.oemolostream('optimization_inputs.sdf')
    try:
        for mol in oemols:
            # Filter out single atom molecules
            if mol.GetMaxAtomIdx() == 1:
                skipped.append(cmiles.utils.mol_to_smiles(mol, mapped=False))
                continue

            # Expand tautomers and stereoisomers
            states = fragmenter.states.enumerate_states(mol,
                                                        max_states=5,
                                                        stereoisomers=True,
                                                        tautomers=True)
            for s in states:
                # Some states have valences that rdkit does not accept.
                try:
                    cmiles_ids = cmiles.get_molecule_ids(s)
                except Exception:
                    cmiles_failures.append(s)
                    continue

                # Drop duplicates
                canonical_smiles = cmiles_ids['canonical_smiles']
                if canonical_smiles in seen_canonical_smiles:
                    logging.info('Found duplicate canonical SMILES {}'.format(
                        canonical_smiles))
                    duplicates.append(canonical_smiles)
                    continue
                seen_canonical_smiles.add(canonical_smiles)
                processed_canonical_smiles.append(canonical_smiles)

                # Generate molecule using mapped SMILES
                mapped_smiles = cmiles_ids[
                    'canonical_isomeric_explicit_hydrogen_mapped_smiles']
                m = cmiles.utils.load_molecule(s)

                try:
                    # Omega fails for some molecules.
                    conformers = fragmenter.chemi.generate_conformers(
                        m, max_confs=5)
                except RuntimeError:
                    logging.info(
                        'Omega failed to generate conformers for {}'.format(
                            cmiles_ids['canonical_isomeric_smiles']))
                    # Omega failed
                    omega_failures.append(
                        cmiles_ids['canonical_isomeric_smiles'])
                    continue

                qcschema_molecules = [
                    cmiles.utils.mol_to_map_ordered_qcschema(
                        conf, mapped_smiles)
                    for conf in conformers.GetConfs()
                ]
                optimization_input.append({
                    'initial_molecules': qcschema_molecules,
                    'cmiles_identifiers': cmiles_ids
                })
                optimization_count += len(qcschema_molecules)

                # Write to SDF
                oechem.OEWriteMolecule(ofs, conformers)
    finally:
        # Close the SDF stream even if processing fails part-way.
        ofs.close()

    with gzip.open('optimization_inputs.json.gz', 'w') as f:
        f.write(
            json.dumps(optimization_input, indent=2,
                       sort_keys=True).encode('utf-8'))

    save_smiles(processed_canonical_smiles, 'optimization_inputs.smi')
    save_smiles(duplicates, 'duplicates.smi')
    save_smiles(omega_failures, 'omega_failures.smi')
    save_smiles(cmiles_failures, 'cmiles_failures.smi')
    save_smiles(skipped, 'skipped_ions.smi')

    print("Number of unique molecules optimized:" + str(len(oemols)))
    print("Final optimization count is:" + str(optimization_count))

    with open("finalCounts.txt", "w") as counts_file:
        counts_file.write("Number of molecules optimized:" +
                          str(len(oemols)) + '\n')
        counts_file.write(
            "Final optimization count with expanded states is:" +
            str(optimization_count) + '\n')

    # Qualified through ``oechem``, which this function already relies on.
    return [oechem.OEMolToSmiles(mol) for mol in oemols]
import cmiles import fragmenter import json mol_id = cmiles.get_molecule_ids('OCCO', strict=False) mapped_smiles = (mol_id['canonical_isomeric_explicit_hydrogen_mapped_smiles']) mol = cmiles.utils.load_molecule(mapped_smiles) torsions = fragmenter.torsions.find_torsions(mol) dihedrals_list = [torsions['internal']['torsion_0'], torsions['terminal']['torsion_0']] single_conformer = fragmenter.chemi.generate_conformers(mol, max_confs=1) mult_conformers_grid = fragmenter.chemi.generate_grid_conformers(mol, dihedrals=dihedrals_list, intervals=[90, 120]) qm_mol_single_conf = cmiles.utils.mol_to_map_ordered_qcschema(single_conformer, mapped_smiles) qm_mol_mult_conf = [cmiles.utils.mol_to_map_ordered_qcschema(conf, mapped_smiles) for conf in mult_conformers_grid.GetConfs()] job_index_1d = cmiles.utils.to_canonical_label(mapped_smiles, dihedrals_list[0]) job_index_2d = job_index_1d + ',' + cmiles.utils.to_canonical_label(mapped_smiles , dihedrals_list[1]) job_index_1d_mult = job_index_1d + '_' + str(len(qm_mol_mult_conf)) job_index_2d_mult = job_index_2d + '_' + str(len(qm_mol_mult_conf)) job_index_1d = job_index_1d + '_1' job_index_2d = job_index_2d + '_1' torsion_drive_inputs = { job_index_1d: { 'dihedral': [dihedrals_list[0]], 'grid': [15], 'input_molecules': qm_mol_single_conf, 'cmiles_identifiers': mol_id
def test_bond_stereo():
    """Bond stereochemistry is recovered from JSON molecules in either atom order.

    Both stereoisomers are checked, each given once in the atom order of the
    openeye map and once in the atom order of the rdkit map, and each
    processed with both toolkits.
    """
    shared_keys = (
        'canonical_smiles',
        'canonical_isomeric_smiles',
        'canonical_explicit_hydrogen_smiles',
        'canonical_isomeric_explicit_hydrogen_smiles',
    )
    mapped_key = 'canonical_isomeric_explicit_hydrogen_mapped_smiles'

    def _check(oe_ordered_mol, rd_ordered_mol, oe_expected, rd_expected):
        # Same call order as a hand-written version: both toolkits for the
        # oe-ordered molecule first, then both for the rd-ordered one.
        id_oe_to_oe = cmiles.get_molecule_ids(oe_ordered_mol,
                                              toolkit='openeye')
        id_oe_to_rd = cmiles.get_molecule_ids(oe_ordered_mol, toolkit='rdkit')
        id_rd_to_oe = cmiles.get_molecule_ids(rd_ordered_mol,
                                              toolkit='openeye')
        id_rd_to_rd = cmiles.get_molecule_ids(rd_ordered_mol, toolkit='rdkit')

        per_toolkit = (
            (id_oe_to_oe, id_rd_to_oe, oe_expected),
            (id_oe_to_rd, id_rd_to_rd, rd_expected),
        )
        for from_oe_order, from_rd_order, expected in per_toolkit:
            # Unmapped identifiers must not depend on the input atom order.
            for key in shared_keys:
                assert from_oe_order[key] == from_rd_order[key] == expected[key]
            # Mapped identifiers keep the input atom order, so they differ.
            assert from_oe_order[mapped_key] == expected['mapped_from_oe_order']
            assert from_rd_order[mapped_key] == expected['mapped_from_rd_order']

    # First stereoisomer
    json_mol_from_oe_map = {
        'symbols': ['C', 'C', 'F', 'Cl', 'H', 'H'],
        'geometry': [
            0.7558174176630313, -0.9436196701031863, -0.5135812331847833,
            -0.7123369866046005, 0.7689600644555532, 0.5875385545305212,
            1.2485802802219408, -3.180729126504143, 0.5903747404566769,
            -1.3805989906253051, 3.6349234648338813, -0.7522673418877901,
            1.6921967038297914, -0.786834118158881, -2.319716469002742,
            -1.6036583681666305, 0.5072991602038667, 2.4076517490881173,
        ],
        'molecular_charge': 0,
        'molecular_multiplicity': 1,
        'connectivity': [[0, 1, 2], [0, 2, 1], [1, 3, 1], [0, 4, 1],
                         [1, 5, 1]],
    }
    json_mol_from_rd_map = {
        'symbols': ['H', 'H', 'F', 'Cl', 'C', 'C'],
        'geometry': [
            1.6921967038297914, -0.786834118158881, -2.319716469002742,
            -1.6036583681666305, 0.5072991602038667, 2.4076517490881173,
            1.2485802802219408, -3.180729126504143, 0.5903747404566769,
            -1.3805989906253051, 3.6349234648338813, -0.7522673418877901,
            0.7558174176630313, -0.9436196701031863, -0.5135812331847833,
            -0.7123369866046005, 0.7689600644555532, 0.5875385545305212,
        ],
        'molecular_charge': 0,
        'molecular_multiplicity': 1,
        'connectivity': [[4, 5, 2], [4, 2, 1], [5, 3, 1], [4, 0, 1],
                         [5, 1, 1]],
    }
    _check(
        json_mol_from_oe_map,
        json_mol_from_rd_map,
        oe_expected={
            'canonical_smiles': 'C(=CCl)F',
            'canonical_isomeric_smiles': 'C(=C/Cl)\\F',
            'canonical_explicit_hydrogen_smiles': '[H]C(=C([H])Cl)F',
            'canonical_isomeric_explicit_hydrogen_smiles':
            '[H]/C(=C(/[H])\\Cl)/F',
            'mapped_from_oe_order':
            '[H:5]/[C:1](=[C:2](/[H:6])\\[Cl:4])/[F:3]',
            'mapped_from_rd_order':
            '[H:1]/[C:5](=[C:6](/[H:2])\\[Cl:4])/[F:3]',
        },
        rd_expected={
            'canonical_smiles': 'FC=CCl',
            'canonical_isomeric_smiles': 'F/C=C/Cl',
            'canonical_explicit_hydrogen_smiles': '[H][C]([F])=[C]([H])[Cl]',
            'canonical_isomeric_explicit_hydrogen_smiles':
            '[H]/[C]([F])=[C](/[H])[Cl]',
            'mapped_from_oe_order':
            '[C:1](=[C:2](/[Cl:4])[H:6])(\\[F:3])[H:5]',
            'mapped_from_rd_order':
            '[H:1]/[C:5]([F:3])=[C:6](/[H:2])[Cl:4]',
        },
    )

    # Now the other stereoisomer
    json_mol_from_oe_map = {
        'symbols': ['C', 'C', 'F', 'Cl', 'H', 'H'],
        'geometry': [
            0.7558174176630313, -0.9436196701031863, -0.5135812331847833,
            -0.7123369866046005, 0.7689600644555532, 0.5875385545305212,
            1.2485802802219408, -3.180729126504143, 0.5903747404566769,
            -2.098028327659392, 0.3039736895037833, 3.4718143896933764,
            1.6921967038297914, -0.786834118158881, -2.319716469002742,
            -1.1578923904597032, 2.5868975732566115, -0.2324103574431031,
        ],
        'molecular_charge': 0,
        'molecular_multiplicity': 1,
        'connectivity': [[0, 1, 2], [0, 2, 1], [1, 3, 1], [0, 4, 1],
                         [1, 5, 1]],
    }
    json_mol_from_rd_map = {
        'symbols': ['H', 'H', 'F', 'Cl', 'C', 'C'],
        'geometry': [
            1.6921967038297914, -0.786834118158881, -2.319716469002742,
            -1.1578923904597032, 2.5868975732566115, -0.2324103574431031,
            1.2485802802219408, -3.180729126504143, 0.5903747404566769,
            -2.098028327659392, 0.3039736895037833, 3.4718143896933764,
            0.7558174176630313, -0.9436196701031863, -0.5135812331847833,
            -0.7123369866046005, 0.7689600644555532, 0.5875385545305212,
        ],
        'molecular_charge': 0,
        'molecular_multiplicity': 1,
        'connectivity': [[4, 5, 2], [4, 2, 1], [5, 3, 1], [4, 0, 1],
                         [5, 1, 1]],
    }
    _check(
        json_mol_from_oe_map,
        json_mol_from_rd_map,
        oe_expected={
            'canonical_smiles': 'C(=CCl)F',
            'canonical_isomeric_smiles': 'C(=C\\Cl)\\F',
            'canonical_explicit_hydrogen_smiles': '[H]C(=C([H])Cl)F',
            'canonical_isomeric_explicit_hydrogen_smiles':
            '[H]/C(=C(\\[H])/Cl)/F',
            'mapped_from_oe_order':
            '[H:5]/[C:1](=[C:2](\\[H:6])/[Cl:4])/[F:3]',
            'mapped_from_rd_order':
            '[H:1]/[C:5](=[C:6](\\[H:2])/[Cl:4])/[F:3]',
        },
        rd_expected={
            'canonical_smiles': 'FC=CCl',
            'canonical_isomeric_smiles': 'F/C=C\\Cl',
            'canonical_explicit_hydrogen_smiles': '[H][C]([F])=[C]([H])[Cl]',
            'canonical_isomeric_explicit_hydrogen_smiles':
            '[H]/[C]([F])=[C](\\[H])[Cl]',
            'mapped_from_oe_order':
            '[C:1](=[C:2](\\[Cl:4])[H:6])(\\[F:3])[H:5]',
            'mapped_from_rd_order':
            '[H:1]/[C:5]([F:3])=[C:6](\\[H:2])[Cl:4]',
        },
    )
def send_qm_job(ptl_mol, nitrogen, nitrogen_i, mol):
    """Build the job specification for a dihedral scan around a nitrogen.

    Despite its name, this function does not submit anything itself. It
    registers the keyword set with the module-level ``client`` and returns
    the job dictionary for the caller to submit.

    Parameters
    ----------
    ptl_mol :
        Object exposing ``connectivity``, ``geometry`` and ``symbols``.
    nitrogen :
        Atom whose bonded atoms are listed via ``GetAtoms()``.
    nitrogen_i : int
        Index of ``nitrogen`` in the molecule.
    mol :
        Molecule passed to ``cmiles.utils.mol_to_smiles``.

    Returns
    -------
    tuple or None
        ``(jsonDict, smiles)`` on success, None when building the
        specification fails (the error is printed).
    """
    # Scan indices: the nitrogen followed by each of its neighbours.
    indices = [nitrogen_i] + [nbor.GetIdx() for nbor in nitrogen.GetAtoms()]
    print(f"indices: {indices}")

    keywords = ptl.models.KeywordSet(
        values={"scf_properties": ["wiberg_lowdin_indices"]})
    try:
        keywords_id = str(client.add_keywords([keywords])[0])
        smiles = cmiles.utils.mol_to_smiles(mol,
                                            mapped=False,
                                            explicit_hydrogen=False)
        mol_id = cmiles.get_molecule_ids(smiles,
                                         toolkit='openeye',
                                         strict=False)
        connectivity = np.array(ptl_mol.connectivity).tolist()
        geometry = np.array([[ptl_mol.geometry]]).ravel().tolist()
        symbols = np.array([[ptl_mol.symbols]]).ravel().tolist()
        jsonDict = {
            "cmiles_ids": mol_id,
            "keywords": {
                "preoptimization": True,
                "scans": [{
                    "type": "dihedral",
                    "indices": list(indices),
                    # -52 .. 52 in steps of 4
                    "steps": list(range(-52, 53, 4)),
                    "step_type": "absolute"
                }]
            },
            "optimization_spec": {
                "program": "geometric",
                "keywords": {
                    "coordsys": "tric",
                }
            },
            "qc_spec": {
                "driver": "gradient",
                "method": "mp2",
                "basis": "def2-SV(P)",
                "keywords": keywords_id,
                "program": "psi4",
            },
            "initial_molecule": {
                "geometry": geometry,
                "symbols": symbols,
                "connectivity": connectivity
            }
        }
        return jsonDict, smiles
    except Exception as err:
        # Still best-effort (returns None), but the failure is now visible
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"send_qm_job failed: {err!r}")
        return None
def make_json(smiles):
    """Expand SMILES into states and write optimization inputs to disk.

    Takes a list of SMILES strings, enumerates states for each molecule,
    generates conformers for each unique state, and writes these files in
    the current directory:

    - ``optimization_inputs.sdf``: all generated conformers
    - ``optimization_inputs.json.gz``: the optimization inputs
    - ``optimization_inputs.smi``, ``duplicates.smi``,
      ``omega_failures.smi``, ``cmiles_failures.smi``,
      ``skipped_ions.smi``: bookkeeping lists written with ``save_smiles``
    - ``finalCounts.txt``: summary counts

    Torsion indices for parameter ``t142`` are also computed and written to
    the ``atom_indices`` field of each entry.

    Copied from Jessica Maat's script:
    https://github.com/openforcefield/qca-dataset-submission/blob/master/2020-03-20-OpenFF-Gen-2-Optimization-Set-3-Pfizer-Discrepancy/01_generateOptDS.py#L350

    Parameters
    ----------
    smiles : list of str
        SMILES strings to process.

    Returns
    -------
    list of str
        SMILES (via ``oechem.OEMolToSmiles``) of every molecule that was read
        from the input list.
    """
    with tempfile.NamedTemporaryFile('w+', suffix='.smi') as tmp:
        tmp.writelines(line + '\n' for line in smiles)
        # Flush so the reader, which opens the file by name, sees all lines.
        tmp.flush()
        tmp.seek(0)
        temp_name = tmp.name
        print(tmp.name)
        # Must happen while the temporary file still exists.
        oemols = fragmenter.chemi.file_to_oemols(temp_name)

    optimization_input = []
    processed_canonical_smiles = []
    # Mirror of processed_canonical_smiles for O(1) duplicate checks; the
    # list is kept because its order is written to disk.
    seen_canonical_smiles = set()
    skipped = []
    duplicates = []  # duplicate states
    omega_failures = []
    cmiles_failures = []
    optimization_count = 0

    # SDF file for writing all conformations.
    ofs = oechem.oemolostream('optimization_inputs.sdf')
    try:
        for mol in oemols:
            # Filter out single atom molecules
            if mol.GetMaxAtomIdx() == 1:
                skipped.append(cmiles.utils.mol_to_smiles(mol, mapped=False))
                continue

            # Enumerate states (stereoisomer and tautomer expansion are off).
            states = fragmenter.states.enumerate_states(mol,
                                                        stereoisomers=False,
                                                        tautomers=False)
            for s in states:
                # Some states have valences that rdkit does not accept.
                try:
                    cmiles_ids = cmiles.get_molecule_ids(s)
                except Exception:
                    cmiles_failures.append(s)
                    continue

                # Drop duplicates
                canonical_smiles = cmiles_ids['canonical_smiles']
                if canonical_smiles in seen_canonical_smiles:
                    logging.info('Found duplicate canonical SMILES {}'.format(
                        canonical_smiles))
                    duplicates.append(canonical_smiles)
                    continue
                seen_canonical_smiles.add(canonical_smiles)
                processed_canonical_smiles.append(canonical_smiles)

                mapped_smiles = cmiles_ids[
                    'canonical_isomeric_explicit_hydrogen_mapped_smiles']

                # Calculate indices of the parameter. We have to recalculate
                # because indices change when we use different SMILES.
                mol_from_cmiles = oechem.OEMol()
                oechem.OESmilesToMol(mol_from_cmiles, mapped_smiles)
                torsion_indices = calculate_mol_params(
                    mol_from_cmiles)['t142'][0]

                # Generate molecule using mapped SMILES
                m = cmiles.utils.load_molecule(s)

                try:
                    # Omega fails for some molecules.
                    conformers = fragmenter.chemi.generate_conformers(m)
                except RuntimeError:
                    logging.info(
                        'Omega failed to generate conformers for {}'.format(
                            cmiles_ids['canonical_isomeric_smiles']))
                    # Omega failed
                    omega_failures.append(
                        cmiles_ids['canonical_isomeric_smiles'])
                    continue

                qcschema_molecules = [
                    cmiles.utils.mol_to_map_ordered_qcschema(
                        conf, mapped_smiles)
                    for conf in conformers.GetConfs()
                ]
                optimization_input.append({
                    'cmiles_identifiers': cmiles_ids,
                    'atom_indices': [torsion_indices],
                    'initial_molecules': qcschema_molecules,
                })
                optimization_count += len(qcschema_molecules)

                # Write to SDF
                oechem.OEWriteMolecule(ofs, conformers)
    finally:
        # Close the SDF stream even if processing fails part-way.
        ofs.close()

    with gzip.open('optimization_inputs.json.gz', 'w') as f:
        f.write(json.dumps(optimization_input, indent=2).encode('utf-8'))

    save_smiles(processed_canonical_smiles, 'optimization_inputs.smi')
    save_smiles(duplicates, 'duplicates.smi')
    save_smiles(omega_failures, 'omega_failures.smi')
    save_smiles(cmiles_failures, 'cmiles_failures.smi')
    save_smiles(skipped, 'skipped_ions.smi')

    print("Number of unique molecules optimized:" + str(len(oemols)))
    print("Final optimization count is:" + str(optimization_count))

    with open("finalCounts.txt", "w") as counts_file:
        counts_file.write("Number of molecules optimized:" +
                          str(len(oemols)) + '\n')
        counts_file.write(
            "Final optimization count with expanded states is:" +
            str(optimization_count) + '\n')

    return [oechem.OEMolToSmiles(mol) for mol in oemols]
import fragmenter
import cmiles
from openeye import oechem
import json

# Compute cmiles identifiers for every kinase inhibitor, keyed by the
# molecule title (a repeated title keeps the identifiers of its last molecule).
mols = fragmenter.chemi.file_to_oemols('kinase_inhibitors.smi')
cmiles_identifiers = {
    mol.GetTitle(): cmiles.get_molecule_ids(oechem.OEMolToSmiles(mol),
                                            strict=False)
    for mol in mols
}

# Persist the identifiers as pretty-printed JSON with sorted keys.
with open('data/kinase_inhibitors_cmiles_ids.json', 'w') as f:
    json.dump(cmiles_identifiers, f, indent=2, sort_keys=True)
skipped = [] omega_failures = [] cmiles_failures = [] for mol in oemols: # Filter out single atom molecules if mol.GetMaxAtomIdx() == 1: skipped.append(cmiles.utils.mol_to_smiles(mol, mapped=False)) continue # Expand protonation states and stereoisomers states = fragment.expand_states(mol, stereoisomers=True, protonation=True, tautomers=False) for s in states: # Some states have valences that rdkit does not accept. try: cmiles_ids = cmiles.get_molecule_ids(s) except: cmiles_failures.append(s) continue mapped_smiles = cmiles_ids['canonical_isomeric_explicit_hydrogen_mapped_smiles'] m = cmiles.utils.load_molecule(s) try: # Omega fails for some molecules. conformers = chemi.generate_conformers(m) except RuntimeError: logging.info('Omega failed to generate conformers for {}'.format(cmiles_ids['canonical_isomeric_smiles'])) # Omega failed omega_failures.append(cmiles_ids['canonical_isomeric_smiles']) continue qcschema_molecules = [cmiles.utils.mol_to_map_ordered_qcschema(conf, mapped_smiles) for conf in conformers.GetConfs()] optimization_input.append({'initial_molecules': qcschema_molecules,
def test_keep_chiral_stereo():
    """Check that identifiers built from a JSON molecule keep its atom order and chirality.

    Two JSON molecules that share symbols and connectivity but differ in
    geometry are run through ``cmiles.get_molecule_ids`` with the default
    toolkit and again with ``toolkit='rdkit'``. Each must give the expected
    ``@@`` / ``@`` SMILES, and the mapped SMILES must number the atoms in the
    order they appear in the JSON molecule.
    """
    # Identifier keys, in the order they are asserted for every call.
    smiles_keys = (
        'canonical_smiles',
        'canonical_isomeric_smiles',
        'canonical_isomeric_explicit_hydrogen_smiles',
        'canonical_isomeric_explicit_hydrogen_mapped_smiles',
    )

    def json_molecule(geometry):
        # Fresh dict (and fresh lists) per stereoisomer; only the geometry differs.
        return {
            'symbols': ['C', 'C', 'N', 'O', 'F', 'H', 'H', 'H', 'H', 'H', 'H'],
            'geometry': geometry,
            'molecular_charge': 0,
            'molecular_multiplicity': 1,
            'connectivity': [[0, 1, 1], [1, 2, 1], [1, 3, 1], [1, 4, 1],
                             [0, 5, 1], [0, 6, 1], [0, 7, 1], [2, 8, 1],
                             [2, 9, 1], [3, 10, 1]],
        }

    first_geometry = [
        1.490934395068127, -0.022852472359013117, -1.935709059338355,
        -0.07992863034848685, -0.42027454585371643, 0.4300901370510521,
        -1.6008431326210255, 1.7962788702240675, 0.9893952378782299,
        -1.578310435156546, -2.623152319435938, 0.12587101271275358,
        1.5081897367264838, -0.8595839767115931, 2.4023274238804375,
        2.643487029125874, -1.686714912858618, -2.3700985298604698,
        0.29985967115960716, 0.42241227312506313, -3.568237727722486,
        2.7917672897488948, 1.5663042901906687, -1.6694857028577224,
        -0.4416762043595982, 3.317083889862761, 1.2129328698056736,
        -2.732926456425621, 2.1997415241410825, -0.5153340816908529,
        -2.648885919666481, -2.3294408246718734, -1.337378806095166
    ]
    # Now the other stereoisomer
    second_geometry = [
        1.490934395068127, -0.022852472359013117, -1.935709059338355,
        -0.07992863034848685, -0.42027454585371643, 0.4300901370510521,
        -1.6008431326210255, 1.7962788702240675, 0.9893952378782299,
        1.544484919393002, -1.0715460728389934, 2.461713642916755,
        -1.6405346423022924, -2.4261921600567007, 0.04846706513552157,
        2.643487029125874, -1.686714912858618, -2.3700985298604698,
        0.29985967115960716, 0.42241227312506313, -3.568237727722486,
        2.7917672897488948, 1.5663042901906687, -1.6694857028577224,
        -0.4416762043595982, 3.317083889862761, 1.2129328698056736,
        -2.732926456425621, 2.1997415241410825, -0.5153340816908529,
        2.4021616230193055, -2.619467530461027, 1.9699541458951846
    ]

    # Each case: (geometry, expected with default toolkit, expected with rdkit).
    # Expected values are listed in the same order as ``smiles_keys``.
    cases = (
        (
            first_geometry,
            (
                'CC(N)(O)F',
                'C[C@@](N)(O)F',
                '[H]C([H])([H])[C@@](N([H])[H])(O[H])F',
                '[H:6][C:1]([H:7])([H:8])[C@@:2]([N:3]([H:9])[H:10])([O:4][H:11])[F:5]',
            ),
            (
                'CC(N)(O)F',
                'C[C@@](N)(O)F',
                '[H][O][C@]([F])([N]([H])[H])[C]([H])([H])[H]',
                '[C:1]([C@@:2]([N:3]([H:9])[H:10])([O:4][H:11])[F:5])([H:6])([H:7])[H:8]',
            ),
        ),
        (
            second_geometry,
            (
                'CC(N)(O)F',
                'C[C@](N)(O)F',
                '[H]C([H])([H])[C@](N([H])[H])(O[H])F',
                '[H:6][C:1]([H:7])([H:8])[C@:2]([N:3]([H:9])[H:10])([O:4][H:11])[F:5]',
            ),
            (
                'CC(N)(O)F',
                'C[C@](N)(O)F',
                '[H][O][C@@]([F])([N]([H])[H])[C]([H])([H])[H]',
                '[C:1]([C@:2]([N:3]([H:9])[H:10])([O:4][H:11])[F:5])([H:6])([H:7])[H:8]',
            ),
        ),
    )

    for geometry, expected_default, expected_rdkit in cases:
        json_mol = json_molecule(geometry)

        mol_id = cmiles.get_molecule_ids(json_mol)
        for key, expected in zip(smiles_keys, expected_default):
            assert mol_id[key] == expected

        # generate rd canonicalized smiles - the map order should still follow
        # the json molecule even though that is not the rdkit canonical order.
        # We want to retain the order for json molecules to their geometry.
        mol_id = cmiles.get_molecule_ids(json_mol, toolkit='rdkit')
        for key, expected in zip(smiles_keys, expected_rdkit):
            assert mol_id[key] == expected
cmiles.utils.add_atom_map(mol) charged_mol = fragmenter.chemi.get_charges(mol, keep_confs=-1) # Find rotatable bond for bond in charged_mol.GetBonds(): if bond.IsRotor(): map_idx = (bond.GetBgn().GetMapIdx(), bond.GetEnd().GetMapIdx()) bond_maps.append(map_idx) wbos.append(bond.GetData('WibergBondOrder')) # Find torsion around this bond mapped_smiles = openeye.oechem.OEMolToSmiles(mol) torsion = fragmenter.torsions.find_torsion_around_bond( charged_mol, map_idx) qcarchive_mols = cmiles.utils.mol_to_map_ordered_qcschema( charged_mol, mapped_smiles) job_idx = cmiles.utils.to_canonical_label(mapped_smiles, torsion) cmiles_identifiers = cmiles.get_molecule_ids(mapped_smiles) # Sanity check that the mapped SMILES are the same if not mapped_smiles == cmiles_identifiers[ 'canonical_isomeric_explicit_hydrogen_mapped_smiles']: print('mapped SMILES do not match. {}, {}'.format( mapped_smiles, cmiles_identifiers['canonical_isomeric_explicit_hydrogen_smiles'])) td_json[job_idx] = { 'dihedral': [torsion], 'grid': [15], 'cmiles_identifiers': cmiles_identifiers, 'input_molecules': qcarchive_mols, 'provenance': { 'fragmenter_version': fragmenter.__version__, 'openeye_version': openeye.__version__, 'username': getpass.getuser(),
def test_provenance(toolkit):
    """Check that the provenance identifier names the toolkit that was requested."""
    butane_smiles = '[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]'
    identifiers = cmiles.get_molecule_ids(butane_smiles, toolkit=toolkit)
    # The provenance string is split on underscores; the third field is
    # compared against the toolkit name.
    provenance_fields = identifiers['provenance'].split('_')
    assert provenance_fields[2] == toolkit
# Connect a client to the local server and set up the torsiondrive-input workflow.
client = ptl.FractalClient('https://localhost:7777/', verify=False)
workflow_options = dict(
    workflow_id='torsiondrive_input',
    workflow_json='example_workflow.json',
)
fragmenter_wf = workflow_api.WorkFlow(client=client, **workflow_options)

# Load Roche molecules and write each as unmapped SMILES without explicit hydrogens
roche_mols = chemi.file_to_oemols('OpenFF_references.sdf')
smiles = [
    cmiles.utils.mol_to_smiles(mol, mapped=False, explicit_hydrogen=False)
    for mol in roche_mols
]

# Put smiles in format for generating torsion input files:
# one entry per SMILES holding its identifiers and an empty provenance routine.
frags = {}
for sm in smiles:
    identifiers = cmiles.get_molecule_ids(sm, toolkit='openeye', strict=False)
    frags[sm] = dict(identifiers=identifiers, provenance=dict(routine={}))

# Generate torsiondrive input and merge the per-fragment results into one dict
torsiondrive_inputs = {}
for frag in frags:
    td_input = fragmenter_wf.generate_torsiondrive_input(frags[frag])
    torsiondrive_inputs.update(td_input)

# Save
with open('torsiondrive_input.json', 'w') as f:
    json.dump(torsiondrive_inputs, f, indent=2, sort_keys=True)

# Added comment 2019-09-04
# Since this script was written using an older version of fragmenter before there was a function to generate optimization
# inputs, some manipulation of the data was needed. In addition, molecules that do not have torsions needed conformers