def molobjfilter(molobj):
    """Tell whether every atom of *molobj* is in the small allowed set.

    The atom list comes from ``cheminfo.molobj_to_atoms`` and is checked with
    ``is_allowed_atoms`` against ``[1, 6]`` (presumably atomic numbers, i.e.
    hydrogen and carbon -- confirm against ``is_allowed_atoms``).

    Returns ``True`` when the molecule passes, ``False`` otherwise.
    """
    # The ring count is queried but no active rule uses it; the calls are
    # kept so the molecule is touched exactly as before.
    ring_info = molobj.GetRingInfo()
    ring_info.NumRings()

    # Alternative wider space that was tried before: [1, 5, 6, 7, 8]
    permitted = [1, 6]

    atom_numbers = cheminfo.molobj_to_atoms(molobj)

    return bool(is_allowed_atoms(atom_numbers, allowed_atoms=permitted))
def prepare_sdf_and_csv(smi, values, debug=True, **kwargs):
    """Build a conformer-searched molecule for *smi* and pair it with *values*.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.
    values : sequence of float
        Measured values for the molecule (the local name ``kelvin`` suggests
        temperatures in Kelvin -- confirm with the caller).
    debug : bool
        When true, print the mean and standard deviation of *values*.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    tuple or None
        ``(molobj, values)`` on success. ``None`` when the SMILES cannot be
        parsed, the molecule has fewer than 4 or more than 50 atoms, it
        contains atoms rejected by ``is_allowed_atoms``, or the conformational
        search fails.
    """
    kelvin = np.array(values)

    # Both statistics are needed by the debug print below; computing the
    # standard deviation unconditionally guarantees the name is bound.
    mean = np.mean(kelvin)
    standard_deviation = np.std(kelvin)

    # Load molecule information. RDKit returns None for unparsable SMILES,
    # so bail out before handing it to the atom helper.
    molobj = Chem.MolFromSmiles(smi)
    if molobj is None:
        return None

    atoms = cheminfo.molobj_to_atoms(molobj)
    n_atoms = len(atoms)

    # NOTE This is a choice
    # NOTE Filter organic chemistry
    if n_atoms > 50:
        return None

    if n_atoms < 4:
        return None

    if not is_allowed_atoms(atoms):
        return None

    molobj = Chem.AddHs(molobj)
    if molobj is None:
        return None

    # The conformational search starts again from the SMILES string and
    # replaces the molecule built above.
    molobj = cheminfo.conformationalsearch(smi)
    if molobj is None:
        return None

    if debug:
        print("{:4.1f}".format(mean), "{:1.2f}".format(standard_deviation))

    return molobj, values
def main():
    """Command-line entry point: re-key a JSON molecule table by SMILES.

    Loads the ``--json`` file, converts every key through
    ``cheminfo.smiles_to_molobj`` / ``cheminfo.molobj_to_smiles``, skips
    entries that fail to parse, contain a ``"."`` in the regenerated SMILES
    or are rejected by ``is_mol_allowed``, and stores the remaining entries
    under ``<scratch>/molecule_data`` via ``misc.save_json`` and
    ``misc.save_obj``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    cli.add_argument('--json', action='store', help='', metavar="FILE")
    cli.add_argument(
        '-j', '--procs', action='store', help='pararallize', metavar="int",
        default=0, type=int)
    args = cli.parse_args()

    # Make sure the scratch directory ends with a path separator
    if args.scratch[-1] != "/":
        args.scratch += "/"
    output_stem = args.scratch + "molecule_data"

    data = misc.load_json(args.json)
    canonical_data = {}

    for key in list(data.keys()):

        molobj, status = cheminfo.smiles_to_molobj(key)

        if molobj is None:
            print("error none mol:", key)
            continue

        smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # A dot in the SMILES marks more than one fragment
        if "." in smiles:
            print("error multi mol:", smiles)
            continue

        atoms = cheminfo.molobj_to_atoms(molobj)

        if not is_mol_allowed(atoms):
            print("error heavy mol:", smiles)
            continue

        canonical_data[smiles] = data[key]

    misc.save_json(output_stem, canonical_data)
    misc.save_obj(output_stem, canonical_data)

    return
def merge_individual_mp(molobjs, filenames, procs=1, debug=True):
    """Run ``merge_results_filename`` over *filenames* via ``easyusage.parallel``.

    The atom list of every molecule in *molobjs* is computed once and handed
    to each job as the single extra positional argument. *procs* is forwarded
    to ``easyusage.parallel``; *debug* is accepted but not used here.
    """
    print("starting {:} procs".format(procs))

    atoms_list = []
    for molobj in molobjs:
        atoms_list.append(cheminfo.molobj_to_atoms(molobj))

    extra_args = [atoms_list]
    extra_kwargs = {}
    easyusage.parallel(
        filenames, merge_results_filename, extra_args, extra_kwargs,
        procs=procs)

    return
def merge_results_cumulative(sdffile, filenames, debug=True, molid=0):
    """Merge conformer results from *filenames* one file at a time.

    Molecules are read from ``sdffile[0]``. For every result file the
    energies and coordinates are read with ``read_resulttxt``, turned into
    representations with ``sim.get_representation`` and compared against the
    conformers collected so far with ``merge_asymmetric``; only conformers
    without any match are kept.

    The merged lists are not returned; the function only reports counts when
    *debug* is true. *molid* is accepted but not used.
    """
    # Accumulated unique conformers
    kept_energies = []
    kept_coordinates = []
    kept_representations = []
    atoms = []
    n_total = 0

    molobjs = list(cheminfo.read_sdffile(sdffile[0]))
    atoms_list = [cheminfo.molobj_to_atoms(molobj) for molobj in molobjs]

    for filename in filenames:

        energies_next, coordinates_next, atoms = read_resulttxt(
            atoms_list, filename)

        representations_next = []
        for coord in coordinates_next:
            representations_next.append(sim.get_representation(atoms, coord))

        # Nothing collected yet: take the whole file as the starting set
        if not kept_energies:
            kept_energies.extend(energies_next)
            kept_coordinates.extend(coordinates_next)
            kept_representations.extend(representations_next)
            n_total += len(energies_next)
            continue

        idxs = merge_asymmetric(
            atoms, energies_next, kept_energies,
            representations_next, kept_representations)

        n_new = 0
        for i, matches in enumerate(idxs):

            # A non-empty match list means this conformer is already known
            if len(matches) > 0:
                continue

            kept_energies.append(energies_next[i])
            kept_coordinates.append(coordinates_next[i])
            kept_representations.append(representations_next[i])
            n_new += 1

        # NOTE(review): the file's indentation was lost; both prints are
        # assumed to belong to this per-file debug block -- confirm.
        if debug:
            n_total += n_new
            print(" - new", n_new)
            print("total", n_total)

    return
def merge_results_filenames(molobjs, filenames):
    """Call ``merge_results_filename`` for every entry of *filenames*.

    The atom list of each molecule in *molobjs* is computed once up front and
    passed along with every filename. Nothing is returned.
    """
    print("filenames")

    atoms_list = list(map(cheminfo.molobj_to_atoms, molobjs))

    for result_file in filenames:
        merge_results_filename(result_file, atoms_list)

    return
def clean_data(listdata):
    """Group the values of *listdata* by regenerated SMILES string.

    Each row is indexed positionally: ``row[1]`` is the SMILES and ``row[3]``
    the value, which is converted with ``float``. Rows whose SMILES cannot be
    parsed are reported and skipped; rows rejected by ``is_mol_allowed`` are
    skipped silently.

    A count per atom type and the number of molecules are printed.

    Returns a dict mapping SMILES (as produced by
    ``cheminfo.molobj_to_smiles(..., remove_hs=True)``) to a list of floats.
    """
    grouped = {}
    seen_atoms = []

    for row in listdata:

        raw_smiles = row[1]
        value = float(row[3])

        molobj, status = cheminfo.smiles_to_molobj(raw_smiles)

        if molobj is None:
            print("error:", raw_smiles)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
        atoms = cheminfo.molobj_to_atoms(molobj)

        # filter for organic chemistry
        if not is_mol_allowed(atoms):
            continue

        seen_atoms.extend(list(atoms))
        grouped.setdefault(smi, []).append(value)

    # Report how often each atom type occurs over all kept rows
    unique_atoms, counts = np.unique(seen_atoms, return_counts=True)
    for atom, count in zip(unique_atoms, counts):
        print(atom, count)

    print("Total molecules", len(grouped.keys()))

    return grouped
def filter_molobj(molobj):
    """Tell whether *molobj* passes the atom-type and size filters.

    A molecule passes when ``is_allowed_atoms`` accepts its atom list, it has
    between 10 and 20 atoms with a value above 1 (the original code calls
    these heavy atoms), and at most 40 atoms in total.

    Returns ``True`` when it passes, ``False`` otherwise.
    """
    # The ring count is queried but no active rule uses it; the calls are
    # kept so the molecule is touched exactly as before.
    ring_info = molobj.GetRingInfo()
    ring_info.NumRings()

    atoms = cheminfo.molobj_to_atoms(molobj)

    if not is_allowed_atoms(atoms):
        return False

    total_count = len(atoms)

    # Entries above 1 are counted as heavy atoms
    (heavy_idx,) = np.where(atoms > 1)
    heavy_count = len(heavy_idx)

    too_few_heavy = heavy_count < 10
    too_many_heavy = heavy_count > 20
    too_large = total_count > 40

    if too_few_heavy or too_many_heavy or too_large:
        return False

    return True
def filter_dict(molecules):
    """Drop entries of *molecules* that fail the molecule or value filter.

    *molecules* maps SMILES strings to values and is modified in place. Keys
    that cannot be parsed by ``cheminfo.smiles_to_molobj`` are left
    untouched. Keys rejected by ``filter_molobj`` or whose value is rejected
    by ``filter_value`` are deleted and reported. The largest atom count
    among the kept, parsable molecules is printed at the end.

    Returns the same (mutated) dict.
    """
    largest = 0

    # Iterate over a snapshot because entries are deleted along the way
    for smiles in list(molecules.keys()):

        molobj, status = cheminfo.smiles_to_molobj(smiles)
        if molobj is None:
            continue

        mol_ok = filter_molobj(molobj)
        if not mol_ok:
            del molecules[smiles]
            print(smiles, mol_ok)
            continue

        value_ok = filter_value(molecules[smiles])
        if not value_ok:
            print(value_ok, smiles, molecules[smiles])
            del molecules[smiles]
            continue

        # Report
        n_atoms = len(cheminfo.molobj_to_atoms(molobj))
        largest = max(largest, n_atoms)

    print("max atoms: ", largest)

    return molecules
def clean_data(df, scratch):
    """Group the values of *df* by regenerated SMILES and store them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``smiles`` and ``mpC``. ``273.15`` is added
        to every ``mpC`` value (presumably Celsius to Kelvin -- confirm).
    scratch : str
        Prefix for the output; ``"molecule_data"`` is appended directly, so
        it should end with a path separator.

    Rows whose SMILES cannot be parsed are reported and skipped. A count per
    atom type is printed. The resulting dict (SMILES -> list of values) is
    written with ``misc.save_obj`` and ``misc.save_json``; nothing is
    returned.
    """
    # The former unused ``df.iloc[1]`` lookup was removed: it raised
    # IndexError for frames with fewer than two rows.
    data = {}
    atom_types = []

    for index, row in df.iterrows():

        smi = row.smiles
        value = row.mpC + 273.15

        molobj, status = cheminfo.smiles_to_molobj(smi)

        if molobj is None:
            print("error:", smi)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # Atoms
        atoms = cheminfo.molobj_to_atoms(molobj)
        atom_types.extend(atoms)

        data.setdefault(smi, []).append(value)

    atom_types, counts = np.unique(atom_types, return_counts=True)

    for atom, count in zip(atom_types, counts):
        print(atom, count)

    misc.save_obj(scratch + "molecule_data", data)
    misc.save_json(scratch + "molecule_data", data)

    return
def ajax_submitquantum(request):
    """
    Setup quantum calculation

    Reads an SDF string from ``request.POST["sdf"]``, converts it to a
    molecule with ``cheminfo.sdfstr_to_molobj``, adds hydrogens and optimizes
    when the atom list contains no ``1`` entry, and hashes the SDF text with
    its first three lines blanked out. If a ``models.GamessCalculation`` with
    that hash already exists, its ``created`` timestamp is refreshed and only
    the hash is returned; otherwise the molecule is handed to
    ``pipelines.gamess_quantum_pipeline``.

    Returns a dict: an ``{'error': ..., 'message': ...}`` payload on invalid
    input, ``{'hashkey': ...}`` for a known conformer, or whatever the
    pipeline returns for a new one.
    """

    if not request.POST:
        return {
            'error': 'Error 128 - empty post',
            'message': "Error. Empty post."
        }

    # ``.get`` so that a missing key yields the error payload instead of
    # raising a KeyError
    posted_sdf = request.POST.get("sdf")

    if not posted_sdf:
        return {
            'error': 'Error 132 - sdf key error',
            'message': "Error. Missing information."
        }

    # Get coordinates from request
    sdfstr = posted_sdf.encode('utf-8')

    # Get rdkit
    molobj, status = cheminfo.sdfstr_to_molobj(sdfstr)

    if molobj is None:
        # Keep only the text after the last "]" of the status message
        status = status.split("]")
        status = status[-1]
        return {'error': 'Error 141 - rdkit error', 'message': status}

    # Only checks that a conformer exists; the conformer itself is not used
    try:
        molobj.GetConformer()
    except ValueError:
        # Error
        return {
            'error': 'Error 141 - rdkit error',
            'message': "Error. Server was unable to generate conformations for this molecule"
        }

    # If hydrogens not added, assume graph and optimize with forcefield
    atoms = cheminfo.molobj_to_atoms(molobj)
    if 1 not in atoms:
        molobj = cheminfo.molobj_add_hydrogens(molobj)
        cheminfo.molobj_optimize(molobj)

    # TODO Check lengths of atoms
    # TODO Define max in settings

    # Fix sdfstr: drop the first three lines and put three empty lines back,
    # so the hash below does not depend on their content
    sdfstr = sdfstr.decode('utf8')
    for _ in range(3):
        i = sdfstr.index('\n')
        sdfstr = sdfstr[i + 1:]
    sdfstr = "\n" * 3 + sdfstr

    # hash on sdf (conformer); md5 is used as a lookup key only
    hshobj = hashlib.md5(sdfstr.encode())
    hashkey = hshobj.hexdigest()

    calculation = request.dbsession.query(models.GamessCalculation) \
        .filter_by(hashkey=hashkey).first()

    if calculation is not None:
        # Known conformer: refresh the timestamp and report the hash
        msg = {'hashkey': hashkey}
        calculation.created = datetime.datetime.now()
        return msg

    print("new:", hashkey)

    molecule_info = {"sdfstr": sdfstr, "molobj": molobj, "hashkey": hashkey}

    msg = pipelines.gamess_quantum_pipeline(request, molecule_info)

    # NOTE: the legacy inline setup that used to follow this return (folder
    # creation, SVG rendering, database and counter bookkeeping) could never
    # run and referenced an undefined ``smiles``; it has been removed.
    return msg
def get_conformations(molobj, torsions, resolutions, method="sqm", debug=False):
    """Evaluate a copy of *molobj* at every angle set of a torsion scan.

    The angle sets come from ``clockwork.generate_angles(resolutions,
    len(torsions))``. For each one the conformer is reset to its starting
    coordinates and handed to ``calculate_forcefield`` (``method == "ff"``)
    or ``calculate_mopac`` (any other *method*).

    Returns three numpy arrays ``(energies, coordinates, states)``, or three
    empty lists when ``get_forcefield`` yields no force field. With *debug*
    the wall time, energy and status of each evaluation are printed.
    """
    # Work on a private copy so the caller's molecule is left alone
    molobj = copy.deepcopy(molobj)

    n_torsions = len(torsions)

    # no constraints
    ffprop, forcefield = get_forcefield(molobj)

    # Forcefield generation failed
    if forcefield is None:
        return [], [], []

    # Get conformer and origin
    conformer = molobj.GetConformer()
    origin = conformer.GetPositions()

    # HACK rdkit requires int type for index
    torsions = [[int(idx) for idx in torsion] for torsion in torsions]

    # Dihedral angles of the starting geometry
    origin_angles = [
        Chem.rdMolTransforms.GetDihedralDeg(conformer, *idxs)
        for idxs in torsions
    ]

    # Get resolution angles
    angle_iterator = clockwork.generate_angles(resolutions, n_torsions)

    # set calculate func
    if method == "ff":
        # rdkit mmff
        calculate_method = calculate_forcefield
        cal_kwargs = {"ffprop": ffprop, "ff": forcefield}

    else:
        atoms = cheminfo.molobj_to_atoms(molobj)
        # Converted but not used further below
        atom_labels = [cheminfo.convert_atom(atom) for atom in atoms]
        smiles = quantum.get_smiles(atoms, origin)
        calculate_method = calculate_mopac
        cal_kwargs = {
            "ffprop": ffprop,
            "atoms": atoms,
            "reference_smiles": smiles
        }

    collected_energies = []
    collected_coordinates = []
    collected_states = []

    for angle in angle_iterator:

        # reset coordinates
        set_coordinates(conformer, origin)

        if debug:
            start = time.time()

        # Minimize with torsion angle constraint
        energy, pos, status = calculate_method(
            molobj, conformer, torsions, origin_angles, angle, **cal_kwargs)

        if debug:
            end = time.time()
            print("{:6.5f}s".format(end-start), "{:6.2f}".format(energy), status)

        # collect
        collected_energies.append(energy)
        collected_coordinates.append(pos)
        collected_states.append(status)

    return (
        np.asarray(collected_energies),
        np.asarray(collected_coordinates),
        np.asarray(collected_states),
    )
def parse_results(molidx, readtemplate, molobjs, dump_results=None, debug=True, **kwargs):
    """Re-optimize the stored conformers of one molecule and merge them.

    Parameters
    ----------
    molidx : int
        Index into *molobjs*; also formatted into the file name templates.
    readtemplate : str
        ``str.format`` template of the result file to read.
    molobjs : sequence
        Molecule objects.
    dump_results : str or None
        ``str.format`` template of the output file. When given, the merged
        result is written there.
    debug : bool
        When true, progress is printed and, if *dump_results* is given, the
        molecule is skipped when its output file already exists.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    tuple or None
        ``(energies, coordinates, costs)`` of the merged conformers, or
        ``None`` when the molecule was skipped because its output exists.
    """
    if debug:
        # Only look for existing output when an output template was given;
        # formatting ``None`` used to raise AttributeError here.
        if dump_results is not None:
            filename = dump_results.format(molidx)
            if os.path.exists(filename):
                print("exists", molidx)
                return

        print("parsing", molidx)

    filename = readtemplate.format(molidx)

    molobj = molobjs[molidx]
    reference_smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
    atoms = cheminfo.molobj_to_atoms(molobj)
    n_atoms = len(atoms)

    energies, coordinates, costs = merge.read_txt(filename, n_atoms)

    oenergies = []
    ocoordinates = []
    ocosts = []

    # ``energies`` takes part in the zip so iteration stops at the shortest
    # of the three lists, as before.
    for i, (_, coord, cost) in enumerate(zip(energies, coordinates, costs)):

        filename = "_tmp_mopac_/_" + str(molidx) + "-" + str(i) + "_"

        # Best effort: a failed optimization only skips this conformer.
        # ``Exception`` (not a bare except) lets Ctrl-C and SystemExit through.
        try:
            oenergy, ocoord = optmize_conformation(atoms, coord, filename=filename)
        except Exception:
            print("unconverged", filename)
            continue

        # Keep the conformer only if the optimized geometry still gives the
        # same SMILES as the reference molecule
        m = get_molobj(atoms, ocoord)
        smiles = cheminfo.molobj_to_smiles(m)

        if smiles == reference_smiles:
            oenergies.append(oenergy)
            ocoordinates.append(ocoord)
            ocosts.append(cost)

    idxs = merge.merge_cost(atoms, oenergies, ocoordinates, ocosts)

    renergies = [oenergies[idx] for idx in idxs]
    rcoords = [ocoordinates[idx] for idx in idxs]
    rcosts = [ocosts[idx] for idx in idxs]

    if dump_results is not None:

        out = merge.dump_txt(renergies, rcoords, rcosts)

        filename = dump_results.format(molidx)
        with open(filename, 'w') as f:
            f.write(out)

    return renergies, rcoords, rcosts
def merge_results_cumulative_prime(molid, molobj, filenametemplate, debug=True, dump_results=None, iolock=None):
    """Merge the result files of one molecule over all cost combinations.

    Parameters
    ----------
    molid : int
        Molecule identifier, formatted into the file name templates.
    molobj
        Molecule object; only its atom list is used.
    filenametemplate : str
        ``str.format`` template taking *molid* followed by the entries of a
        cost combination from ``clockwork.generate_linear_costlist``.
    debug : bool
        When true, per-file and running totals are printed to stderr.
    dump_results : str or None
        ``str.format`` template of the output file; when truthy the merged
        result is written there.
    iolock
        Forwarded to ``stdprint``.

    Returns
    -------
    tuple
        ``(energies, coordinates, costs)`` of the unique conformers, where
        each cost is the combination of the file the conformer came from.
    """
    # the G list
    combos = clockwork.generate_linear_costlist()

    # init
    energies = []
    coordinates = []
    representations = []
    costs = []
    n_total = 0

    atoms = cheminfo.molobj_to_atoms(molobj)
    n_atoms = len(atoms)

    for combo in combos:

        filename = filenametemplate.format(molid, *combo)
        stdprint(filename, std="err", iolock=iolock)

        energies_next, coordinates_next = read_txt(filename, n_atoms)

        # file did not exists
        if energies_next is None:
            continue

        representations_next = [
            sim.get_representation(atoms, coord) for coord in coordinates_next
        ]

        # Nothing collected yet: take the whole file as the starting set
        if len(energies) == 0:
            n_new = len(energies_next)
            energies += energies_next
            coordinates += coordinates_next
            representations += representations_next
            costs += [combo] * n_new
            n_total += n_new
            continue

        idxs = merge_asymmetric(
            atoms, energies_next, energies,
            representations_next, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):

            # A non-empty match list means this conformer is already known
            if len(idxl) > 0:
                continue

            energies.append(energies_next[i])
            coordinates.append(coordinates_next[i])
            representations.append(representations_next[i])
            costs.append(combo)
            n_new += 1

        # Keep the running total correct regardless of the debug flag
        n_total += n_new

        # NOTE(review): the file's indentation was lost; both prints are
        # assumed to belong to this per-file debug block -- confirm.
        if debug:
            print(" - new", n_new, file=sys.stderr)
            print("total", n_total, file=sys.stderr)

    if dump_results:

        out = dump_txt(energies, coordinates, costs)

        filename = dump_results.format(molid)
        # Context manager so the handle is closed even if the write fails
        with open(filename, 'w') as f:
            f.write(out)

    return energies, coordinates, costs
def parse_molandprop(*args, debug=False, **kwargs):
    """Turn a (molecule, properties) pair into (molecule, value).

    The pair may be passed as two positional arguments or as one two-item
    sequence. The molecule gets hydrogens added and is optimized; when the
    optimization reports status ``5`` (treated as unconverged) it is rebuilt
    once from ``props["SMILES"]``.

    The value is read from the property whose key contains
    ``"measured, converted"``; ``273.15`` is added when the matching unit
    property equals ``"Celsius"``, and it is left alone for ``"K"``.

    Returns ``(molobj, value)``, or ``(None, None)`` when the molecule is
    missing, has no ``"SMILES"`` property, its SMILES contains ``"."``, it
    cannot be optimized, the ``"{measured}"`` property contains ``<`` or
    ``>``, or the unit is unknown.
    """
    rejected = (None, None)

    if len(args) > 1:
        molobj, props = args[0], args[1]
    else:
        molobj, props = args[0]

    if molobj is None:
        return rejected

    keys = props.keys()

    if "SMILES" not in keys:
        return rejected

    prop_smiles = props["SMILES"]

    # Ignore multi molecules
    if "." in prop_smiles:
        if debug:
            print(f"ignore: {prop_smiles}")
        return rejected

    # Count (result currently unused; the size filters are switched off)
    atoms = cheminfo.molobj_to_atoms(molobj)

    # Add hydrogens and optimize structure
    molobj = cheminfo.molobj_add_hydrogens(molobj)
    status = cheminfo.molobj_optimize(molobj)

    # if unconverged
    if status == 5:

        # try the smiles
        molobj, status = cheminfo.smiles_to_molobj(prop_smiles)

        if molobj is None:
            print("error", props)
            return rejected

        molobj = cheminfo.molobj_add_hydrogens(molobj)
        status = cheminfo.molobj_optimize(molobj)

        if status == 5:
            print("error", props)
            return rejected

    # Values given only as a bound ("<" / ">") are rejected
    reference_key = [key for key in keys if "{measured}" in key][0]
    reference_text = str(props[reference_key])

    if "<" in reference_text or ">" in reference_text:
        return rejected

    value_key = [key for key in keys if "measured, converted" in key][0]

    unit_key = [key for key in keys if "UNIT" in key and "Point" in key][0]

    prop_unit = props[unit_key]
    prop_value = props[value_key]

    if prop_unit == "K":
        return molobj, prop_value

    if prop_unit == "Celsius":
        prop_value += 273.15
        return molobj, prop_value

    print("error unknown unit", prop_unit, props)
    return rejected