def filter_pubchem(ms):
    """Return the molecular formulas of the molecules in ``ms`` that pass the filters.

    A molecule is kept only if all of the following hold:

    * its exact molecular weight is not below 100 and not above 1500;
    * its formal charge is zero;
    * both 'C' and 'H' occur among the symbols reported by ``GetAtoms()``;
    * it has at most five atoms each of S, Cl, Br and B;
    * every element symbol it contains is in the allowed element set.

    Args:
        ms: iterable of molecule objects that expose ``GetAtoms()`` and are
            accepted by ``CalcExactMolWt``, ``GetFormalCharge`` and
            ``CalcMolFormula``.

    Returns:
        list: ``CalcMolFormula(m)`` for every kept molecule, in input order.
    """
    allowed_elements = {'C', 'H', 'O', 'N', 'S', 'P', 'Cl', 'B', 'Br', 'Se'}
    # Elements whose atom count per molecule is capped.
    capped_elements = ('S', 'Cl', 'Br', 'B')
    max_capped_count = 5

    ms_filtered = []
    for m in ms:
        mw = CalcExactMolWt(m)
        if mw < 100 or mw > 1500:
            continue
        if GetFormalCharge(m) != 0:
            continue

        counts = Counter(a.GetSymbol() for a in m.GetAtoms())
        # NOTE(review): 'H' is only seen here if hydrogens are returned by
        # GetAtoms() -- presumably the molecules carry explicit Hs; confirm.
        if 'C' not in counts or 'H' not in counts:
            continue
        # A Counter reports 0 for absent keys (without inserting them), so no
        # separate membership test is needed before comparing the count.
        if any(counts[el] > max_capped_count for el in capped_elements):
            continue
        if allowed_elements.issuperset(counts):
            ms_filtered.append(CalcMolFormula(m))
    return ms_filtered
def extract_smiles(xyz_file, charge, allow_charge=True, check_ac=False):
    """
    uses xyz2mol to extract smiles with as much 3d structural information as
    possible

    xyz2mol is first called with ``use_huckel=True``; if that call raises, it
    is repeated with ``use_huckel=False``.

    Args:
        xyz_file: file passed to ``xyz2mol_local.read_xyz_file``.
        charge: charge forwarded to ``xyz2mol_local.xyz2mol``.
        allow_charge: forwarded as ``allow_charged_fragments``.
        check_ac: when true, the adjacency matrix of the reordered molecule is
            compared with the module-level ``AC``; on any difference the
            module-level flag ``AC_SAME`` is set to ``False`` and a message is
            printed.

    Returns:
        tuple: ``(smiles, formal_charge, res_status)`` where ``formal_charge``
        is taken from the chosen structure and ``res_status`` is the second
        value returned by ``choose_resonance_structure``.
    """
    global AC_SAME

    atoms, _, xyz_coordinates = xyz2mol_local.read_xyz_file(xyz_file)

    # Options shared by the Hueckel attempt and its fallback.
    xyz2mol_options = dict(
        charge=charge,
        use_graph=True,
        allow_charged_fragments=allow_charge,
        use_atom_maps=True,
        embed_chiral=True,
    )
    try:
        input_mol = xyz2mol_local.xyz2mol(
            atoms, xyz_coordinates, use_huckel=True, **xyz2mol_options
        )
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (not a bare except)
        # so KeyboardInterrupt / SystemExit still propagate.
        input_mol = xyz2mol_local.xyz2mol(
            atoms, xyz_coordinates, use_huckel=False, **xyz2mol_options
        )

    input_mol = reorder_atoms_to_map(input_mol)
    structure_mol, res_status = choose_resonance_structure(input_mol)
    structure_mol = chiral_tags(structure_mol)
    rdmolops.AssignStereochemistry(structure_mol)
    structure_smiles = Chem.MolToSmiles(structure_mol)

    if check_ac:
        ac = Chem.GetAdjacencyMatrix(input_mol)
        # ``AC`` is read from module scope; it is not defined in this function.
        if not np.all(AC == ac):
            AC_SAME = False
            print("change in AC: stopping")

    return structure_smiles, GetFormalCharge(structure_mol), res_status
def calculate_md_relaxed_structure(smiles, scale_factor, ridx,
                                   xtb_path="/groups/kemi/koerstz/opt/xtb/6.1/bin/xtb"):
    """
    This function submits an md with a box with size scaled by scale_factor
    and extracts last structure of the trajectory file

    The work is done inside a freshly created ``md`` sub-directory of the
    current working directory. The working directory is restored on exit,
    also when one of the steps raises.

    Args:
        smiles: SMILES string, parsed with ``sanitize=False``.
        scale_factor: passed to ``write_md_input`` and used as prefix of the
            output file name.
        ridx: index used to name the xyz input file (``<ridx>.xyz``).
        xtb_path: xtb executable to run; defaults to the previously
            hard-coded location.

    Returns:
        tuple: ``(out_file, charge, n_atoms)`` -- the name of the extracted
        structure file (a copy is placed in the starting directory), the
        formal charge of the molecule and its atom count.

    Raises:
        FileExistsError: if an ``md`` directory already exists (from
            ``os.mkdir``).
    """
    start_dir = os.getcwd()
    os.mkdir('md')
    os.chdir('md')
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        mol = reorder_atoms_to_map(mol)
        n_atoms = mol.GetNumAtoms()
        charge = GetFormalCharge(mol)

        xyz_name = str(ridx) + '.xyz'
        write_xyz_file(mol, xyz_name)
        write_md_input(scale_factor)

        output = run_cmd(
            "{0} {1} --omd --input md.inp --gfn2 --chrg {2}".format(
                xtb_path, xyz_name, charge
            )
        )
        with open('md_out.log', 'w') as _file:
            _file.write(output)

        out_file = str(scale_factor) + '_md.xyz'
        extract_last_structure('xtb.trj', out_file)
        check_md_reaction(out_file, charge, smiles, xyz_name)
        shutil.copy(out_file, start_dir)
    finally:
        # Always leave the md directory, otherwise a failure would strand the
        # whole process in the wrong working directory.
        os.chdir(start_dir)

    return out_file, charge, n_atoms
def extract_smiles(xyz_file, charge, allow_charge=True):
    """
    uses xyz2mol to extract smiles with as much 3d structural information as
    possible

    xyz2mol is first called with ``use_huckel=True``; if that call raises, it
    is repeated with ``use_huckel=False``.

    Args:
        xyz_file: file passed to ``xyz2mol_local.read_xyz_file``.
        charge: charge forwarded to ``xyz2mol_local.xyz2mol``.
        allow_charge: forwarded as ``allow_charged_fragments``.

    Returns:
        tuple: ``(smiles, formal_charge, res_status)`` where ``formal_charge``
        is taken from the chosen structure and ``res_status`` is the second
        value returned by ``choose_resonance_structure``.
    """
    atoms, _, xyz_coordinates = xyz2mol_local.read_xyz_file(xyz_file)

    def _perceive(use_huckel):
        # Single place for the xyz2mol options; only use_huckel varies.
        return xyz2mol_local.xyz2mol(
            atoms,
            xyz_coordinates,
            charge=charge,
            use_graph=True,
            allow_charged_fragments=allow_charge,
            use_huckel=use_huckel,
            use_atom_maps=True,
            embed_chiral=True,
        )

    try:
        input_mol = _perceive(True)
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (not a bare except)
        # so KeyboardInterrupt / SystemExit still propagate.
        input_mol = _perceive(False)

    input_mol = reorder_atoms_to_map(input_mol)
    structure_mol, res_status = choose_resonance_structure(input_mol)
    structure_mol = chiral_tags(structure_mol)
    rdmolops.AssignStereochemistry(structure_mol)
    structure_smiles = Chem.MolToSmiles(structure_mol)

    return structure_smiles, GetFormalCharge(structure_mol), res_status
def syncProperties(smiles):
    """Compute the molecular formula and formal charge for a SMILES string.

    The formula is returned without its charge annotation; the charge is
    returned separately.

    The previous implementation removed the annotation with
    ``formula.replace(str(charge), '')``. That deletes every occurrence of the
    charge's digits anywhere in the formula: a neutral molecule lost all its
    '0' characters and a +1 ion all its '1' characters. Now only a trailing
    sign (optionally followed by digits) is stripped.

    Args:
        smiles: SMILES string to parse.

    Returns:
        tuple: ``(formula, charge, 'calculated properties from structure')``
        on success, or ``(False, False, 'property calculation error')`` if any
        step raises (for example when the SMILES cannot be parsed).
    """
    import re  # local import so the block does not depend on file-level imports

    try:
        mol = Chem.MolFromSmiles(smiles)
        formula = CalcMolFormula(mol)
        charge = GetFormalCharge(mol)
        # NOTE(review): assumes CalcMolFormula appends the charge as a trailing
        # '+' / '-' optionally followed by its magnitude (e.g. 'C2H3O2-',
        # 'Ca+2') -- confirm against the RDKit version in use.
        formula = re.sub(r'[+-]\d*$', '', formula)
        return formula, charge, 'calculated properties from structure'
    except Exception:
        # Callers rely on the (False, False, message) sentinel, so errors are
        # reported through the return value rather than raised.
        return False, False, 'property calculation error'
def preprocess(dataset, dir_input):
    """Featurise a CCS dataset and write the resulting arrays to ``dir_input``.

    For every row whose SMILES does not contain '.', the SMILES is
    canonicalised, hydrogens are added, and fingerprints, the adjacency
    matrix and a descriptor vector are computed. The CCS values are
    standardised with their overall mean and standard deviation.

    Reads the module-level names ``radius`` and ``fingerprint_dict``.

    Args:
        dataset: mapping-like object with 'SMILES', 'Adducts' and 'CCS'
            columns (presumably a pandas DataFrame -- confirm with caller).
        dir_input: output location. File names are appended by plain string
            concatenation, so it is expected to end with a path separator.

    Writes (all under ``dir_input``):
        Smiles.txt, molecules, adducts, adjacencies, properties, descriptors,
        mean, std (via ``np.save``) and fingerprint_dict.pickle.
    """
    train_smiles = list(dataset['SMILES'])
    train_adducts = dataset['Adducts']
    train_ccs = list(dataset['CCS'])

    adducts_encoder = AdductToOneHotEncoder()
    adducts_encoder.fit(train_adducts)
    adducts = adducts_encoder.transform(train_adducts)

    kept_rows = []        # indices of the input rows that were featurised
    canonical_smiles = []
    molecules, adjacencies, properties, descriptors = [], [], [], []

    for i, smi in enumerate(train_smiles):
        # '.' in the SMILES means a multi-fragment entry; skip it.
        if '.' in smi:
            continue
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
        # Re-parse the canonical SMILES before adding hydrogens.
        mol = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(mol)

        atoms = create_atoms(mol)
        i_jbond_dict = create_ijbonddict(mol)
        fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
        adjacency = create_adjacency(mol)

        kept_rows.append(i)
        canonical_smiles.append(smi)
        molecules.append(fingerprints)
        adjacencies.append(adjacency)
        properties.append([[train_ccs[i]]])
        descriptors.append([
            ExactMolWt(mol),
            MolLogP(mol),
            GetFormalCharge(mol),
            CalcNumRings(mol),
            CalcNumRotatableBonds(mol),
            CalcLogS(mol),
            AcidCount(mol),
            BaseCount(mol),
            APolar(mol),
            BPolar(mol),
        ])

    # The adducts were encoded for every input row. If rows were skipped
    # above, drop the same rows here so the saved adducts stay aligned with
    # molecules / adjacencies / properties / descriptors.
    # NOTE(review): assumes the encoder output converts to an array with one
    # row per input row -- confirm against AdductToOneHotEncoder.transform.
    if len(kept_rows) != len(train_smiles):
        adducts = np.asarray(adducts)[kept_rows]

    properties = np.array(properties)
    mean, std = np.mean(properties), np.std(properties)
    properties = np.array((properties - mean) / std)

    os.makedirs(dir_input, exist_ok=True)

    with open(dir_input + 'Smiles.txt', 'w') as f:
        # One canonical SMILES per line, each terminated by a newline.
        f.write(''.join(s + '\n' for s in canonical_smiles))

    np.save(dir_input + 'molecules', molecules)
    np.save(dir_input + 'adducts', adducts)
    np.save(dir_input + 'adjacencies', adjacencies)
    np.save(dir_input + 'properties', properties)
    np.save(dir_input + 'descriptors', descriptors)
    np.save(dir_input + 'mean', mean)
    np.save(dir_input + 'std', std)

    dump_dictionary(fingerprint_dict, dir_input + 'fingerprint_dict.pickle')
def is_ts_correct(rsmi, psmi, irc_start_xyz, irc_end_xyz):
    """
    Compare the reactant/product SMILES with the SMILES obtained from the two
    IRC end-point structures.

    The return value depends on the SMILES comparison only: it is True when
    the IRC end points reproduce (reactant, product) in either order. The
    adjacency-matrix comparisons that follow are printed for information and
    do not influence the result.
    """
    print(rsmi, psmi)

    rmol = smiles_to_mol(rsmi)
    pmol = smiles_to_mol(psmi)
    charge = GetFormalCharge(rmol)

    # --- SMILES check -----------------------------------------------------
    irc_start_smi, _, _ = get_smiles(irc_start_xyz, charge)
    print("reverse SMILES: ", irc_start_smi)
    irc_end_smi, _, _ = get_smiles(irc_end_xyz, charge)
    print("forward smiles: ", irc_end_smi)

    smiles_orientations = (
        (rsmi, psmi, "SMILES MATCH: TS FOUND: reactant = reverse"),
        (psmi, rsmi, "SMILES MATCH: TS FOUND: reactant = forward"),
    )
    ts_found = False
    for expected_start, expected_end, message in smiles_orientations:
        if irc_start_smi == expected_start and irc_end_smi == expected_end:
            ts_found = True
            print(message)

    # --- adjacency-matrix check (informational only) ------------------------
    r_ac = rdmolops.GetAdjacencyMatrix(rmol)
    p_ac = rdmolops.GetAdjacencyMatrix(pmol)

    irc_start_mol = smiles_to_mol(irc_start_smi)
    irc_end_mol = smiles_to_mol(irc_end_smi)
    irc_start_ac = rdmolops.GetAdjacencyMatrix(irc_start_mol)
    irc_end_ac = rdmolops.GetAdjacencyMatrix(irc_end_mol)

    same_connectivity = np.all(irc_start_ac == irc_end_ac)
    print(
        "found TS for conformational change"
        if same_connectivity
        else "found non-coonformational change"
    )

    ac_orientations = (
        (r_ac, p_ac, "AC MATCH: reactant = reverse"),
        (p_ac, r_ac, "AC MATCH: reactant = forward"),
    )
    for expected_start_ac, expected_end_ac, message in ac_orientations:
        if np.all(expected_start_ac == irc_start_ac) and np.all(expected_end_ac == irc_end_ac):
            print(message)

    return ts_found
def get_charge(smiles):
    """Return ``GetFormalCharge`` of the molecule that ``get_mol`` builds from ``smiles``."""
    return GetFormalCharge(get_mol(smiles))