def main():
    """Re-key a JSON molecule dictionary by canonical SMILES.

    Reads the dictionary given by ``--json``, converts every key to a
    molecule and back to a hydrogen-free SMILES, and drops entries that
    cannot be parsed, that contain a "." in the resulting SMILES, or that
    are rejected by ``is_mol_allowed``. The surviving entries are written
    to ``<scratch>/molecule_data`` with both ``misc.save_json`` and
    ``misc.save_obj``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    cli.add_argument('--json', action='store', help='', metavar="FILE")
    cli.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = cli.parse_args()

    # Make sure the scratch prefix can be concatenated with a file name.
    if args.scratch[-1] != "/":
        args.scratch += "/"

    data = misc.load_json(args.json)

    canonical_data = {}

    # Iterate over a snapshot of the keys, as the original did.
    for raw_smiles in list(data.keys()):

        molobj, status = cheminfo.smiles_to_molobj(raw_smiles)

        if molobj is None:
            print("error none mol:", raw_smiles)
            continue

        smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        if "." in smiles:
            print("error multi mol:", smiles)
            continue

        atoms = cheminfo.molobj_to_atoms(molobj)

        if is_mol_allowed(atoms):
            canonical_data[smiles] = data[raw_smiles]
        else:
            print("error heavy mol:", smiles)

    output = args.scratch + "molecule_data"
    misc.save_json(output, canonical_data)
    misc.save_obj(output, canonical_data)

    return
def search_molcules(mollist, proplist, conf_scr="_tmp_ensemble_"):
    """Select small carbon-containing molecules and parse their property values.

    A molecule is kept when it has between 3 and 8 carbon atoms (inclusive)
    and at most 10 non-hydrogen atoms in total.

    Args:
        mollist: iterable of molecule objects exposing ``GetAtoms()``, whose
            atoms expose ``GetSymbol()``.
        proplist: iterable of property strings, one per molecule, holding
            either ``"value"`` or ``"value stddev"``.
        conf_scr: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(molecules, values, indices)`` of three parallel lists: the
        kept molecule objects, their property values as floats, and their
        positions in the input.

    Raises:
        ValueError: if a kept molecule's property string is not one or two
            numeric fields.
    """

    sublist_mol = []
    sublist_prop = []
    sublist_idxs = []

    for idx, (molobj, prop) in enumerate(zip(mollist, proplist)):

        symbols = [atom.GetSymbol() for atom in molobj.GetAtoms()]

        n_carbon = symbols.count('C')
        if n_carbon < 3 or n_carbon > 8:
            continue

        # Count everything except hydrogen. The previous code looked up the
        # hydrogen index with 'C', which zeroed the carbon count instead and
        # made hydrogens count towards the heavy-atom limit.
        n_heavy = sum(1 for symbol in symbols if symbol != 'H')
        if n_heavy > 10:
            continue

        try:
            # "value stddev": both fields must be numeric, only value is kept
            value, stddev = prop.strip().split()
            value = float(value)
            float(stddev)
        except ValueError:
            # single-column "value"
            value = float(prop.strip())

        sublist_mol.append(molobj)
        sublist_prop.append(value)
        sublist_idxs.append(idx)

    return sublist_mol, sublist_prop, sublist_idxs
def main():
    """Collect property values per SMILES from one or more SDF files.

    Every file given with ``--sdf`` is parsed with ``parse_ochem``; each
    molecule is converted to a hydrogen-free SMILES and its value is
    appended to the list stored under that SMILES. The resulting dictionary
    is written to ``<scratch>/molecule_data`` with both ``misc.save_json``
    and ``misc.save_obj``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    parser.add_argument('--sdf', action='store', help='', metavar="FILE", nargs="+")
    parser.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # -1 means "use every core"
    if args.procs == -1:
        args.procs = os.cpu_count()
        print("starting", args.procs, "procs")

    mol_val_dict = {}

    for sdf in args.sdf:

        print("reading", sdf)

        molobjs, values = parse_ochem(sdf, debug=True, procs=args.procs)

        for molobj, value in zip(molobjs, values):

            smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

            # Test the SMILES itself. The previous code tested the literal
            # string "smiles", so the list was reset on every molecule and
            # earlier values for the same structure were thrown away.
            if smiles not in mol_val_dict:
                mol_val_dict[smiles] = []
            else:
                print("duplicate", smiles)

            mol_val_dict[smiles].append(value)

    print("TOTAL ITEMS", len(mol_val_dict))

    misc.save_json(args.scratch + "molecule_data", mol_val_dict)
    misc.save_obj(args.scratch + "molecule_data", mol_val_dict)

    return
def parse_molobj(molobj, debug=False, **kwargs):
    """Parse one molecule object together with its property dictionary.

    Returns ``(None, None)`` when ``molobj`` is ``None``; otherwise returns
    whatever ``parse_molandprop`` returns for the molecule and the dictionary
    from ``molobj.GetPropsAsDict()``. ``debug`` and ``kwargs`` are accepted
    but not used here.
    """

    if molobj is not None:
        # The SMILES and the key view are not used afterwards; the calls are
        # kept so the function does exactly what it did before.
        cheminfo.molobj_to_smiles(molobj)
        properties = molobj.GetPropsAsDict()
        properties.keys()
        return parse_molandprop(molobj, properties)

    return None, None
def clean_data(listdata):
    """Group property values by hydrogen-free SMILES.

    Each row supplies a SMILES string at position 1 and a value at
    position 3. Rows whose SMILES cannot be converted to a molecule are
    reported and skipped; rows rejected by ``is_mol_allowed`` are skipped
    silently. A per-element atom count and the number of distinct molecules
    are printed.

    Returns:
        dict mapping SMILES to the list of float values found for it.
    """

    grouped = {}
    seen_atoms = []

    for row in listdata:

        raw_smiles = row[1]
        value = float(row[3])

        molobj, status = cheminfo.smiles_to_molobj(raw_smiles)

        if molobj is None:
            print("error:", raw_smiles)
            continue

        smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
        atoms = cheminfo.molobj_to_atoms(molobj)

        # filter for organic chemistry
        if not is_mol_allowed(atoms):
            continue

        seen_atoms.extend(atoms)
        grouped.setdefault(smiles, []).append(value)

    unique_atoms, counts = np.unique(seen_atoms, return_counts=True)

    for atom, count in zip(unique_atoms, counts):
        print(atom, count)

    print("Total molecules", len(grouped.keys()))

    return grouped
def clean_data(df, scratch):
    """Group values from a data frame by SMILES and save them.

    Args:
        df: frame whose rows expose ``smiles`` and ``mpC`` attributes.
        scratch: path prefix; the result is written to
            ``scratch + "molecule_data"`` with both ``misc.save_obj`` and
            ``misc.save_json``.

    Rows whose SMILES cannot be converted to a molecule are reported and
    skipped. A per-element atom count is printed before saving.
    """

    # The unused ``df.iloc[1]`` lookup that used to sit here was removed: it
    # raised IndexError for frames with fewer than two rows.

    data = {}
    atom_types = []

    for index, row in df.iterrows():

        smi = row.smiles
        # offset by 273.15 (presumably Celsius to Kelvin, going by the
        # column name - confirm against the data source)
        value = row.mpC + 273.15

        molobj, status = cheminfo.smiles_to_molobj(smi)

        if molobj is None:
            print("error:", smi)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # Atoms
        atoms = cheminfo.molobj_to_atoms(molobj)
        atom_types += list(atoms)

        if smi not in data:
            data[smi] = []

        data[smi].append(value)

    atom_types, counts = np.unique(atom_types, return_counts=True)

    for atom, count in zip(atom_types, counts):
        print(atom, count)

    misc.save_obj(scratch + "molecule_data", data)
    misc.save_json(scratch + "molecule_data", data)

    return
def main():
    """Compare Joback estimates with the stored properties.

    Loads ``properties`` and ``structures.sdf.gz`` from the scratch
    directory, runs ``thermo.joback.Joback`` on every molecule and takes the
    ``"Tm"`` entry of its estimate as the prediction. Molecules for which
    the status reports unmatched atoms, or for which the estimate raises
    ``TypeError``, are excluded. Prints the RMSE over the remaining
    molecules and saves a scatter plot, coloured by the number of atoms
    whose entry in the atom array is not 1, to ``_fig_joback``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    cli.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = cli.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Read properties
    properties = misc.load_npy(args.scratch + "properties")
    molecules = list(cheminfo.read_sdffile(args.scratch + "structures.sdf.gz"))

    heavy_atoms = []
    predictions = []
    errors = []

    for mol, prop in zip(molecules, properties):

        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)

        joback = thermo.joback.Joback(smi)
        status = joback.status

        atoms, coord = cheminfo.molobj_to_xyz(mol)
        not_hydrogen = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(not_hydrogen))

        failed = "Did not match all atoms present" in status

        if not failed:
            try:
                estimate = joback.estimate()
            except TypeError:
                failed = True

        if failed:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)

        T_b = estimate["Tb"]
        T_m = estimate["Tm"]

        predictions.append(T_m)

    errors = np.array(errors, dtype=int)
    idx_success, = np.where(errors == 0)

    # Keep only the molecules for which an estimate was obtained.
    heavy_atoms = np.array(heavy_atoms)[idx_success]
    predictions = np.array(predictions)[idx_success]
    properties = np.array(properties)[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    # Diagonal reference line plus the actual predictions
    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.savefig("_fig_joback")
    plt.clf()

    return
def _gamess_worker(conn, reader, *args, **kwargs):
    """Run one GAMESS job and send the parsed properties through ``conn``.

    Sends ``None`` when ``reader`` fails on the GAMESS output.
    """
    stdout, status = gamess.calculate(*args, **kwargs)

    try:
        properties = reader(stdout)
    except Exception:
        # TODO Error reading properties
        properties = None

    conn.send(properties)
    conn.close()


def _run_gamess_jobs(molobj, headers, readers, gmsargs):
    """Run one GAMESS job per header in parallel; return results in order.

    A job whose process ends without sending anything yields ``None``.
    """
    procs = []
    conns = []

    for header, reader in zip(headers, readers):
        parent_conn, child_conn = Pipe()
        proc = Process(
            target=_gamess_worker,
            args=(child_conn, reader, molobj, header),
            kwargs=gmsargs)
        proc.start()
        # Drop the parent's copy of the sending end so recv() raises
        # EOFError, instead of blocking forever, if the child dies.
        child_conn.close()
        procs.append(proc)
        conns.append(parent_conn)

    # Read before joining: a child blocked in send() on a large payload
    # cannot exit until the parent has received it.
    results = []
    for conn in conns:
        try:
            results.append(conn.recv())
        except EOFError:
            results.append(None)

    for proc in procs:
        proc.join()

    return results


def _compute_gamess_properties(molobj, calculation, smiles, gmsargs):
    """Optimize the molecule and fill ``calculation`` with GAMESS results.

    Sets the optimized coordinates on ``molobj``.

    Returns:
        ``(error, charges)``: ``error`` is an error-response dict and
        ``charges`` is ``None`` when a step failed; otherwise ``error`` is
        ``None`` and ``charges`` holds the charges read from the solvation
        job.
    """

    # TODO Add error messages when gamess fails
    # TODO add timeouts for all gamess calls

    # Optimize molecule
    properties = gamess.calculate_optimize(molobj, **gmsargs)

    if properties is None:
        return {
            'error': 'Error g-80 - gamess optimization error',
            'message': "Error. Server was unable to optimize molecule"
        }, None

    print(smiles, list(properties.keys()))

    # Save and set coordinates
    coord = properties["coord"]
    calculation.coordinates = save_array(coord)
    calculation.enthalpy = properties["h"]
    cheminfo.molobj_set_coordinates(molobj, coord)

    # Optimization is finished, do other calculation async-like

    # NOTE(review): the three header templates are reproduced exactly as
    # they appear in the reviewed source; confirm their line layout is what
    # GAMESS expects.
    vibheader = """ $basis gbasis=PM3 $end $contrl scftyp=RHF runtyp=hessian icharg={:} maxit=60 $end """

    orbheader = """ $contrl coord=cart units=angs scftyp=rhf icharg={:} maxit=60 $end $basis gbasis=sto ngauss=3 $end """

    solheader = """ $system mwords=125 $end $basis gbasis=PM3 $end $contrl scftyp=RHF runtyp=energy icharg={:} $end $pcm solvnt=water mxts=15000 icav=1 idisp=1 $end $tescav mthall=4 ntsall=60 $end """

    headers = [vibheader, orbheader, solheader]
    readers = [
        gamess.read_properties_vibration,
        gamess.read_properties_orbitals,
        gamess.read_properties_solvation
    ]

    properties_vib, properties_orb, properties_sol = _run_gamess_jobs(
        molobj, headers, readers, gmsargs)

    if properties_vib is None:
        return {
            'error': 'Error g-104 - gamess vibration error',
            'message': "Error. Server was unable to vibrate molecule"
        }, None

    print(smiles, list(properties_vib.keys()))

    calculation.islinear = properties_vib["linear"]
    calculation.vibjsmol = properties_vib["jsmol"]
    calculation.vibfreq = save_array(properties_vib["freq"])
    calculation.vibintens = save_array(properties_vib["intens"])
    calculation.thermo = save_array(properties_vib["thermo"])

    if properties_orb is None:
        return {
            'error': 'Error g-128 - gamess orbital error',
            'message': "Error. Server was unable to orbital the molecule"
        }, None

    print(smiles, list(properties_orb.keys()))
    calculation.orbitals = save_array(properties_orb["orbitals"])
    calculation.orbitalstxt = properties_orb["stdout"]

    if properties_sol is None:
        return {
            'error': 'Error g-159 - gamess solvation error',
            'message': "Error. Server was unable to run solvation calculation"
        }, None

    # 'charges', 'solvation_total', 'solvation_polar', 'solvation_nonpolar',
    # 'surface', 'total_charge', 'dipole', 'dipole_total'
    print(smiles, list(properties_sol.keys()))

    charges = properties_sol["charges"]
    calculation.charges = save_array(charges)
    calculation.soltotal = properties_sol["solvation_total"]
    calculation.solpolar = properties_sol["solvation_polar"]
    calculation.solnonpolar = properties_sol["solvation_nonpolar"]
    calculation.solsurface = properties_sol["surface"]
    calculation.soldipole = save_array(properties_sol["dipole"])
    calculation.soldipoletotal = properties_sol["dipole_total"]

    return None, charges


def gamess_quantum_pipeline(request, molinfo):
    """Run the GAMESS calculations for one molecule and store the result.

    Assumed that rdkit understands the molecule

    Args:
        request: web request; ``request.dbsession`` is used to store the
            calculation and to update the per-SMILES counter.
        molinfo: dict with the molecule under ``"molobj"`` and its SDF text
            under ``"sdfstr"``.

    Returns:
        ``{"smiles": ..., "hashkey": ...}`` on success, or a dict with
        ``'error'`` and ``'message'`` keys when a GAMESS step fails. The
        hash key is the MD5 hex digest of the SDF text.
    """

    # TODO Read gamess settings from ini

    # Read input
    molobj = molinfo["molobj"]
    sdfstr = molinfo["sdfstr"]

    # Get that smile on your face
    smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

    # hash on sdf (conformer)
    hashkey = hashlib.md5(sdfstr.encode()).hexdigest()

    # Start respond message
    msg = {"smiles": smiles, "hashkey": hashkey}

    # Check if calculation already exists (lookup is switched off)
    if False:
        calculation = request.dbsession.query(models.GamessCalculation) \
            .filter_by(hashkey=hashkey).first()

        if calculation is not None:
            calculation.created = datetime.datetime.now()
            return msg

    # Create new calculation
    calculation = models.GamessCalculation()

    # check if folder exists
    here = os.path.abspath(os.path.dirname(__file__)) + "/"
    datahere = here + "data/"
    workdir = datahere + hashkey

    # exist_ok avoids failing when another request created the same folder
    # between a check and the mkdir
    os.makedirs(workdir, exist_ok=True)

    gmsargs = {
        "scr": workdir,
        "autoclean": True,
        "debug": False,
    }

    # GAMESS DEBUG
    os.chdir(workdir)
    try:
        error, charges = _compute_gamess_properties(
            molobj, calculation, smiles, gmsargs)
    finally:
        # Always leave the scratch folder, also on error returns and
        # exceptions, so later requests do not start in a stale cwd.
        os.chdir(here)

    if error is not None:
        return error

    # Saveable sdf and reset title
    sdfstr = cheminfo.molobj_to_sdfstr(molobj)
    sdfstr = str(sdfstr)
    for _ in range(2):
        i = sdfstr.index('\n')
        sdfstr = sdfstr[i + 1:]
    sdfstr = "\n\n" + sdfstr

    # Save mol2 fmt
    mol2 = cheminfo.molobj_to_mol2(molobj, charges=charges)
    calculation.mol2 = mol2

    # Get a 2D Picture
    # TODO Compute 2D coordinates
    svgstr = cheminfo.molobj_to_svgstr(molobj, removeHs=True)

    # Success, setup database
    calculation.smiles = smiles
    calculation.hashkey = hashkey
    calculation.sdf = sdfstr
    calculation.svg = svgstr
    calculation.created = datetime.datetime.now()

    # Add calculation to the database
    request.dbsession.add(calculation)

    # Add smiles to counter
    countobj = request.dbsession.query(models.Counter) \
        .filter_by(smiles=smiles).first()

    if countobj is None:
        counter = models.Counter()
        counter.smiles = smiles
        counter.count = 1
        request.dbsession.add(counter)
    else:
        countobj.count += 1

    return msg
def main():
    """Filter an SDF file and its property file in lockstep.

    Molecules from ``--sdf`` are paired line by line with ``--properties``.
    Pairs rejected by ``molobjfilter`` or ``valuefilter`` are dropped. Each
    kept molecule is appended to ``<scratch>/structures.sdf.gz`` and its
    property line to ``<scratch>/properties.csv``. A dictionary mapping the
    hydrogen-free SMILES to the floats found after the first field of the
    line is written to ``<scratch>/molecules`` with both ``misc.save_json``
    and ``misc.save_obj``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    parser.add_argument('--sdf', action='store', help='', metavar="FILE")
    parser.add_argument('--properties', action='store', help='', metavar="FILE")
    parser.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    moledict = {}

    # Context managers close all three files even when a filter or a
    # conversion raises part-way through.
    with gzip.open(args.scratch + "structures.sdf.gz", 'w') as fsdf, \
            open(args.scratch + "properties.csv", 'w') as fprop, \
            open(args.properties, 'r') as properties:

        molecules = cheminfo.read_sdffile(args.sdf)

        for molobj, line in zip(molecules, properties):

            if not molobjfilter(molobj):
                continue

            if not valuefilter(line):
                continue

            smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
            print(smiles)

            sdfstr = cheminfo.molobj_to_sdfstr(molobj)
            sdfstr += "$$$$\n"

            # the gzip file is opened in binary mode
            fsdf.write(sdfstr.encode())
            fprop.write(line)

            # the first field of the line is skipped
            values = [float(x) for x in line.split()[1:]]
            moledict[smiles] = values

    misc.save_json(args.scratch + "molecules", moledict)
    misc.save_obj(args.scratch + "molecules", moledict)

    return
# Scan the molecules in `filename` for atoms that sit almost on top of each
# other and, for those molecules, rebuild the structure from its SMILES.
molobjs = cheminfo.read_sdffile(filename)

for i, molobj in enumerate(molobjs):

    # NOTE(review): this overwrites the loop variable with the next item of
    # `molobjs`. If read_sdffile returns an iterator, every other molecule is
    # skipped and `i` no longer matches the molecule's position - confirm
    # whether this is intended.
    molobj = next(molobjs)

    # stat = cheminfo.molobj_optimize(molobj)
    # print(stat)

    dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
    # Put 10.0 on the diagonal so it cannot be picked as the minimum.
    np.fill_diagonal(dist, 10.0)
    min_dist = np.min(dist)

    if min_dist < 0.01:

        print(i, min_dist)

        # Rebuild the molecule from its SMILES and measure the smallest
        # off-diagonal distance again.
        smi = cheminfo.molobj_to_smiles(molobj)
        molobj = cheminfo.conformationalsearch(smi)

        dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
        np.fill_diagonal(dist, 10.0)
        min_dist = np.min(dist)

        print(smi)
        print(min_dist)

    # atoms, coord = cheminfo.molobj_to_xyz(molobj)
    # atoms = list(atoms)
    # many_atoms = [atoms]
    # mbtypes = qml.representations.get_slatm_mbtypes(many_atoms)
def parse_results(molidx, readtemplate, molobjs, dump_results=None, debug=True, **kwargs):
    """Re-optimize the stored conformers of one molecule and merge them.

    Conformers are read from ``readtemplate.format(molidx)`` and each one
    is passed to ``optmize_conformation``. Only conformers whose SMILES
    after optimization equals the hydrogen-free SMILES of the reference
    molecule are kept; the survivors are reduced with ``merge.merge_cost``.

    Args:
        molidx: index of the molecule in ``molobjs``; also used to format
            the file templates.
        readtemplate: format string giving the input file for ``molidx``.
        molobjs: indexable collection of molecule objects.
        dump_results: optional format string giving the output file for
            ``molidx``. When set, the merged result is written there.
        debug: when true, print progress and skip molecules whose output
            file already exists.
        **kwargs: ignored.

    Returns:
        ``(energies, coordinates, costs)`` as three parallel lists, or
        ``None`` when ``debug`` is set and the output file already exists.
    """

    if debug:
        # Only look for an earlier dump when a dump template was given; the
        # previous code called ``None.format`` with the default arguments.
        if dump_results is not None and os.path.exists(dump_results.format(molidx)):
            print("exists", molidx)
            return

        print("parsing", molidx)

    filename = readtemplate.format(molidx)

    molobj = molobjs[molidx]
    reference_smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
    atoms = cheminfo.molobj_to_atoms(molobj)
    n_atoms = len(atoms)

    energies, coordinates, costs = merge.read_txt(filename, n_atoms)

    oenergies = []
    ocoordinates = []
    ocosts = []

    for i, (_energy, coord, cost) in enumerate(zip(energies, coordinates, costs)):

        tmpname = "_tmp_mopac_/_" + str(molidx) + "-" + str(i) + "_"

        try:
            oenergy, ocoord = optmize_conformation(atoms, coord, filename=tmpname)
        except Exception:
            # best effort: a failing conformer is reported and skipped
            print("unconverged", tmpname)
            continue

        # Keep the conformer only if optimization left the SMILES unchanged.
        optimized = get_molobj(atoms, ocoord)
        if cheminfo.molobj_to_smiles(optimized) != reference_smiles:
            continue

        oenergies.append(oenergy)
        ocoordinates.append(ocoord)
        ocosts.append(cost)

    idxs = merge.merge_cost(atoms, oenergies, ocoordinates, ocosts)

    renergies = [oenergies[idx] for idx in idxs]
    rcoords = [ocoordinates[idx] for idx in idxs]
    rcosts = [ocosts[idx] for idx in idxs]

    if dump_results is not None:
        out = merge.dump_txt(renergies, rcoords, rcosts)
        with open(dump_results.format(molidx), 'w') as f:
            f.write(out)

    return renergies, rcoords, rcosts
def main():
    """Merge molecule databases and optionally draw a Venn diagram.

    SDF files (``--sdf``) and saved dictionaries (``--dict``) are each
    reduced to a set of SMILES, and the size of every set is printed. The
    dictionaries are merged by concatenating the value lists of equal keys,
    optionally passed through ``filter_dict`` (``--filter``), and written to
    ``<scratch>/molecule_data`` with both ``misc.save_json`` and
    ``misc.save_obj``. When ``--name`` is given, a Venn diagram is drawn for
    two or three databases and the figure is saved to
    ``<scratch>/venndiagram``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    cli.add_argument('--sdf', action='store', help='', metavar="FILE", nargs="+", default=[])
    cli.add_argument('--dict', action='store', help='', metavar="FILE", nargs="+", default=[])
    cli.add_argument('--name', action='store', help='', metavar="STR", nargs="+")
    cli.add_argument('--filename', action='store', help='', metavar="STR")
    cli.add_argument('--filter', action='store_true', help='')
    cli.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = cli.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    print()

    databases_set = []
    databases_dict = []

    # Databases given as structure files
    for sdf in args.sdf:
        molobjs = list(cheminfo.read_sdffile(sdf))
        smiles = {
            cheminfo.molobj_to_smiles(molobj, remove_hs=True)
            for molobj in molobjs
        }
        databases_set.append(smiles)
        print(sdf, len(smiles))

    # Databases given as saved dictionaries
    for filename in args.dict:
        data = misc.load_obj(filename)
        smiles = set(data.keys())
        databases_set.append(smiles)
        databases_dict.append(data)
        print(filename, len(smiles))

    if args.scratch is not None:

        # Merge databases
        everything = {}
        for data in databases_dict:
            for key, values in data.items():
                everything.setdefault(key, []).extend(values)

        if args.filter:
            everything = filter_dict(everything)

        print("n items", len(everything.keys()))

        # Save
        target = args.scratch + "molecule_data"
        misc.save_json(target, everything)
        misc.save_obj(target, everything)

    if args.name is not None:

        n_db = len(databases_set)

        if n_db == 2:
            venn2(databases_set, set_labels=args.name)
        elif n_db == 3:
            venn3(databases_set, set_labels=args.name)

        plt.savefig(args.scratch + "venndiagram")

    return