def generate_sdf(filename):
    """Load every conformer of an SDF file with its energy and representation.

    The atom list is read from the first molecule only and reused when
    building the representation of every later molecule in the file.

    Returns four parallel lists ``(molobjs, energies, coordinates,
    representations)``.  ``next`` is called on the reader, so an exhausted
    reader raises ``StopIteration`` to the caller.
    """
    reader = cheminfo.read_sdffile(filename)

    # The first molecule provides the atoms shared by all representations.
    head = next(reader)
    atoms, head_coord = cheminfo.molobj_to_xyz(head)
    head_energy = worker.get_energy(head)
    head_representation = sim.get_representation(atoms, head_coord)

    molobjs = [head]
    energies = [head_energy]
    coordinates = [head_coord]
    representations = [head_representation]

    # Later molecules only need their own energy and coordinates.
    for mol in reader:
        mol_energy = worker.get_energy(mol)
        mol_coord = cheminfo.molobj_get_coordinates(mol)
        mol_representation = sim.get_representation(atoms, mol_coord)

        molobjs.append(mol)
        energies.append(mol_energy)
        coordinates.append(mol_coord)
        representations.append(mol_representation)

    return molobjs, energies, coordinates, representations
def merge_sdfs(filenames):
    """Merge the conformers of several SDF files, keeping unmatched ones only.

    The first file that loads successfully seeds the collection.  Every
    later file is compared against the collection with ``merge_asymmetric``
    and only the conformers whose match list is empty are appended.

    Files for which ``generate_sdf`` raises are skipped.  Progress is printed
    when the module-level ``args.debug`` is set, and the merged SDF text is
    printed when ``args.dump`` is set.

    Parameters:
        filenames: iterable of paths passed to ``generate_sdf``.

    Returns:
        None.
    """
    molobjs = []
    energies = []
    coordinates = []
    representations = []
    atoms = []
    n_total = 0

    for filename in filenames:

        try:
            molobjs_next, energies_next, coordinates_next, representations_next = generate_sdf(
                filename)
        except Exception:
            # Best effort: an unreadable or empty file is skipped.  Only
            # Exception is caught so Ctrl-C and SystemExit still stop the run.
            continue

        if not molobjs:
            # Seed the collection; atoms come from the first molecule read.
            atoms, _ = cheminfo.molobj_to_xyz(molobjs_next[0])
            energies += energies_next
            coordinates += coordinates_next
            representations += representations_next
            molobjs += molobjs_next
            n_total += len(molobjs_next)
            continue

        if args.debug:
            print(" {:} = {:} confs".format(filename, len(molobjs_next)))

        idxs = merge_asymmetric(atoms, energies_next, energies,
                                representations_next, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):
            # A non-empty match list means the conformer is already present.
            if len(idxl) > 0:
                continue

            energies.append(energies_next[i])
            coordinates.append(coordinates_next[i])
            representations.append(representations_next[i])
            molobjs.append(molobjs_next[i])
            n_new += 1

        if args.debug:
            n_total += n_new
            print(" - new", n_new)
            print("total", n_total)

    if args.dump:
        sdfstr = "".join(cheminfo.molobj_to_sdfstr(molobj) for molobj in molobjs)
        print(sdfstr)

    return
def molobjs_to_xyzs(molobjs):
    """Convert molecule objects into parallel lists of atoms and coordinates.

    Each molecule is passed to ``cheminfo.molobj_to_xyz``; the first element
    of every result goes into the atoms list and the second into the
    coordinates list.  An empty input yields two empty lists.
    """
    converted = [cheminfo.molobj_to_xyz(molobj) for molobj in molobjs]

    mol_atoms = [atoms for atoms, _ in converted]
    mol_coord = [coord for _, coord in converted]

    return mol_atoms, mol_coord
def get_sdfcontent(sdffile, rtn_atoms=False):
    """Read energies and coordinates for every molecule in an SDF file.

    Returns ``(energies, coordinates)`` by default.  With ``rtn_atoms`` set
    the result is ``(first_molobj, atoms, energies, coordinates)``, where
    ``atoms`` is whatever the last molecule produced (or ``""`` when the
    file held no molecules, in which case indexing the first molecule
    raises ``IndexError``).
    """
    molobjs = list(cheminfo.read_sdffile(sdffile))

    atoms = ""
    energies = []
    coordinates = []

    for mol in molobjs:
        atoms, xyz = cheminfo.molobj_to_xyz(mol)
        mol_energy = get_energy(mol)
        coordinates.append(xyz)
        energies.append(mol_energy)

    if not rtn_atoms:
        return energies, coordinates

    return molobjs[0], atoms, energies, coordinates
def run_jobfile(molobjs, tordbs, filename, threads=0):
    """Run every job line of a file against the given molecules.

    The coordinates of each molecule are extracted up front and handed to
    the job runner as ``origins``.  Lines of ``filename`` are stripped of
    surrounding whitespace.  With ``threads > 0`` the threaded runner is
    used, otherwise the serial one.  Always returns ``True``.
    """
    # Starting coordinates for each molecule; the atoms are not needed here.
    origins = [xyz for _, xyz in map(cheminfo.molobj_to_xyz, molobjs)]

    with open(filename, 'r') as jobfile:
        lines = [raw.strip() for raw in jobfile]

    if threads > 0:
        run_joblines_threads(
            origins, molobjs, tordbs, lines, threads=threads, dump=False)
        return True

    run_joblines(origins, molobjs, tordbs, lines, dump=False)
    return True
def main():
    """Compare Joback ``"Tm"`` estimates with the stored reference properties.

    Reads ``properties`` and ``structures.sdf.gz`` from the scratch
    directory, runs ``thermo.joback.Joback`` on every molecule, drops the
    molecules whose estimate failed, prints the RMSE and saves a scatter
    plot coloured by heavy-atom count to ``_fig_joback``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument(
        '-j', '--procs', action='store', help='pararallize', metavar="int",
        default=0, type=int)
    args = parser.parse_args()

    scratch = args.scratch
    if scratch[-1] != "/":
        scratch += "/"

    # Read properties
    properties = misc.load_npy(scratch + "properties")
    molecules = list(cheminfo.read_sdffile(scratch + "structures.sdf.gz"))

    heavy_atoms = []
    predictions = []
    errors = []

    for mol, _ in zip(molecules, properties):

        smiles = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        joback = thermo.joback.Joback(smiles)
        status = joback.status

        # Count atoms whose value is not 1 (labelled heavy atoms here).
        atoms, _coord = cheminfo.molobj_to_xyz(mol)
        heavy = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(heavy))

        # A molecule fails on an incomplete group match or when the
        # estimate itself raises TypeError.
        failed = "Did not match all atoms present" in status
        estimate = None
        if not failed:
            try:
                estimate = joback.estimate()
            except TypeError:
                failed = True

        if failed:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)
        boiling = estimate["Tb"]  # looked up but not used further
        melting = estimate["Tm"]
        predictions.append(melting)

    errors = np.array(errors, dtype=int)
    idx_success, = np.where(errors == 0)

    # Keep only the molecules with a successful estimate.
    heavy_atoms = np.array(heavy_atoms)[idx_success]
    predictions = np.array(predictions)[idx_success]
    properties = np.array(properties)[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    # Identity line plus predicted-vs-true scatter.
    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.savefig("_fig_joback")
    plt.clf()

    return
def get_clockwork_conformations(molobj, torsions, resolution, atoms=None, debug=False, timings=False):
    """Get all conformations for a specific cost.

    The cost is defined by the torsions and the resolution: every
    resolution combination from
    ``clockwork.generate_clockwork_combinations`` is expanded with
    ``get_conformations``, unconverged results (state != 0) are dropped, the
    remainder is de-duplicated with ``clean_representations`` and then
    merged into the running collection with ``merge.merge_asymmetric``.

    Parameters:
        molobj: molecule object handed to ``get_conformations``.
        torsions: torsions to rotate; only their count is used directly.
        resolution: resolution passed to the combination generator.
        atoms: atom list for the representations; read from ``molobj``
            with ``atom_type="int"`` when ``None``.
        debug: accepted for interface compatibility; not used here.
        timings: when true, print throughput for every combination.

    Returns:
        ``(end_energies, end_coordinates)`` as two parallel lists.
    """
    n_torsions = len(torsions)

    if atoms is None:
        atoms, _ = cheminfo.molobj_to_xyz(molobj, atom_type="int")

    combinations = clockwork.generate_clockwork_combinations(resolution, n_torsions)

    # Collect energies and coordinates
    end_energies = []
    end_coordinates = []
    end_representations = []

    for resolutions in combinations:

        time_start = time.time()

        # Get all conformations
        c_energies, c_coordinates, c_states = get_conformations(molobj, torsions, resolutions)
        n_generated = len(c_energies)

        # Filter unconverged
        success = np.argwhere(c_states == 0).flatten()
        c_energies = c_energies[success]
        c_coordinates = c_coordinates[success]

        # Calculate representations
        c_representations = np.asarray(
            [sim.get_representation(atoms, coordinates) for coordinates in c_coordinates])

        # Clean all new conformers for energies and similarity
        idxs = clean_representations(atoms, c_energies, c_representations)
        c_energies = c_energies[idxs]
        c_coordinates = c_coordinates[idxs]
        c_representations = c_representations[idxs]

        if not end_energies:
            # Seed while the collection is still empty, so merge_asymmetric
            # is never handed an empty reference set.
            end_energies += list(c_energies)
            end_coordinates += list(c_coordinates)
            end_representations += list(c_representations)
        else:
            # Asymmetrically add new conformers
            idxs = merge.merge_asymmetric(atoms, c_energies, end_energies,
                                          c_representations, end_representations)

            # Add new unique conformation to return collection
            for i, idx in enumerate(idxs):

                # if conformation already exists, continue
                if len(idx) > 0:
                    continue

                end_energies.append(c_energies[i])
                end_coordinates.append(c_coordinates[i])
                end_representations.append(c_representations[i])

        if timings:
            # Reported for every combination, the first one included.
            timing = time.time() - time_start
            rate = n_generated / timing if timing > 0 else float("inf")
            print("res time {:8.2f} cnf/sec - {:8.2f} tot sec".format(rate, timing))

    return end_energies, end_coordinates
def converge_clockwork(molobj, tordb, max_cost=2):
    """Run clockwork conformer generation over a hard-coded window of costs.

    For each ``(n_tor, resolution)`` entry of the cost list, every torsion
    combination is expanded with ``get_clockwork_conformations`` and a
    summary line is printed.

    NOTE(review): this reads like work-in-progress code.  The ``max_cost``
    argument is overwritten by the hard-coded window below, the merge of
    later combinations is switched off by an early ``continue``, and the
    function terminates the interpreter with ``quit()`` instead of
    returning.
    """
    atoms, _xyz = cheminfo.molobj_to_xyz(molobj)

    total_torsions = len(tordb)
    print("total torsions", total_torsions)

    # TODO Cache this
    cost_input, cost_cost = clockwork.generate_costlist(total_torsions=total_torsions)

    # TODO cost_cost and costfunc
    # Hard-coded window; these values replace whatever the caller passed.
    offset = 1
    max_cost = 7
    window = slice(offset, offset + max_cost)

    for (n_tor, resolution), cost in zip(cost_input[window], cost_cost[window]):

        start = time.time()

        # Iterate over torsion combinations
        combinations = clockwork.generate_torsion_combinations(total_torsions, n_tor)

        cost_result_energies = []
        cost_result_coordinates = []

        C = 0

        for combination in combinations:

            # TODO Move this to function
            torsions = [tordb[i] for i in combination]
            result_energies, result_coordinates = get_clockwork_conformations(
                molobj, torsions, resolution)

            # The first non-empty result seeds the per-cost collection.
            if not cost_result_energies:
                cost_result_energies.extend(result_energies)
                cost_result_coordinates.extend(result_coordinates)
                continue

            start_merge = time.time()

            # TODO Move this to function
            # NOTE(review): this `continue` disables the merge below, so
            # everything after it in the loop body is currently unreachable.
            continue

            idxs = merge.merge_asymmetric(atoms, result_energies, cost_result_energies,
                                          result_coordinates, cost_result_coordinates,
                                          decimals=2, debug=True)

            for i, idx in enumerate(idxs):
                C += 1
                if len(idx) == 0:
                    cost_result_energies.append(result_energies[i])
                    cost_result_coordinates.append(result_coordinates[i])

            end_merge = time.time()
            print("total confs", len(cost_result_energies),
                  "{:10.2f}".format(end_merge-start_merge))
            continue

        end = time.time()
        elapsed = end - start

        print("conv", n_tor, resolution, cost, len(cost_result_energies),
              "tot: {:5.2f}".format(elapsed),
              "per sec: {:5.2f}".format(cost/elapsed))

    quit()

    return
def main():
    """Fit the properties against the mean pairwise distance of each molecule.

    Reads ``properties`` and ``structures.sdf.gz`` from the scratch
    directory, computes per-molecule heavy-atom count, convex-hull volume
    and mean pairwise distance, fits a degree-3 polynomial of the distances
    to the properties, prints the RMSE and saves the plot as
    ``i_can_member_it``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument(
        '-j', '--procs', action='store', help='pararallize', metavar="int",
        default=0, type=int)
    args = parser.parse_args()

    scratch = args.scratch
    if scratch[-1] != "/":
        scratch += "/"

    properties = misc.load_npy(scratch + "properties")
    molecules = cheminfo.read_sdffile(scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:

        atoms, coord = cheminfo.molobj_to_xyz(mol)

        # Count atoms whose value is not 1 (labelled heavy atoms here).
        heavy = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(heavy))

        # Convex-hull volume; collected but not used by the fit below.
        volumes.append(ConvexHull(coord, qhull_options="QJ").volume)

        # Mean of all pairwise distances.
        distances.append(np.mean(distance.pdist(coord)))

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    # The descriptor the properties are fitted against.
    representation = distances

    # Degree-3 polynomial fit
    poly = np.poly1d(np.polyfit(representation, properties, 3))
    fitted = poly(representation)

    print(rmse(fitted, properties))

    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, poly(x_prop), "k-")
    plt.savefig("i_can_member_it")
    plt.clf()

    return
def mergesdfs(sdflist):
    """Print similarity kernels between conformers of the first two SDF files.

    All files in ``sdflist`` are read, but only the first two are compared.
    Two kernels are printed for the first five conformers of each file: one
    from ``sim.get_kernel`` and one from ``get_kernel_fchl``.  Atoms are
    taken from the first molecule of the first file.
    """
    # Merge sdf0 with sdf1 and etc
    # Load every file first, then extract the coordinates per file.
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]
    coordinates_sdf = [
        [cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs]
        for molobjs in molobjs_list
    ]

    atoms, _xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # JCK
    # Representations are built for every conformer, then cut to five.
    all_reps_x = [sim.get_representation(atoms, xyz) for xyz in coordinates_x]
    all_reps_y = [sim.get_representation(atoms, xyz) for xyz in coordinates_y]

    representations19_x = np.asarray(all_reps_x[:5])
    representations19_y = np.asarray(all_reps_y[:5])

    atoms_x = [atoms] * len(representations19_x)
    atoms_y = [atoms] * len(representations19_y)

    similarity = sim.get_kernel(
        representations19_x, representations19_y, atoms_x, atoms_y)
    print(similarity)

    # JCK
    # fchl18
    max_size = len(atoms)

    representations_x = get_representations_fchl(
        atoms, coordinates_x, max_size=max_size)
    representations_x = np.asarray(representations_x[:5])

    representations_y = get_representations_fchl(
        atoms, coordinates_y, max_size=max_size)
    representations_y = np.asarray(representations_y[:5])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    return
def main_folder():
    """Merge per-cost SDF files cumulatively and dump the merged result.

    The first ``--sdf`` argument is used as a filename prefix; files named
    ``<prefix><a>_<b>.sdf`` are loaded for ``(1, 1)`` and then for the first
    15 cost combinations from ``clockwork.generate_costlist``.  Conformers
    with no match in the running collection (per ``merge_asymmetric``) are
    appended.  Cost files that fail to load are skipped.

    Writes ``all.sdf``, ``costs.csv`` and an ``energies`` plot into the
    hard-coded dump directory.  Returns ``None``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--sdf', nargs="+", action='store', help='', metavar='FILE')
    args = parser.parse_args()

    # TODO Merge results from redis

    if args.sdf is None:
        print("error: actually we need sdfs to merge")
        quit()

    dumpdir = "_tmp_apentane_cum/"

    filename = args.sdf[0] + "{:}_{:}" + ".sdf"

    molobjs, energies, coordinates, representations = generate_sdf(
        filename.format(1, 1))

    atoms, _ = cheminfo.molobj_to_xyz(molobjs[0])

    # costcombos, costs = clockwork.generate_costlist(total_torsions=28)
    costcombos, costs = clockwork.generate_costlist()

    n_total = len(molobjs)
    # One cost combination per collected conformer, parallel to molobjs.
    molcosts = [(1, 1)] * n_total

    print("start", n_total)

    for combo in costcombos[:15]:

        try:
            molobjs_new, energies_new, coordinates_new, representations_new = generate_sdf(
                filename.format(*combo))
        except Exception:
            # Best effort: a missing or unreadable cost file is skipped.
            # Only Exception is caught so Ctrl-C and SystemExit still work.
            continue

        print(" merge", len(molobjs_new))

        idxs = merge_asymmetric(atoms, energies_new, energies,
                                representations_new, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):
            # A non-empty match list means the conformer is already present.
            if len(idxl) > 0:
                continue

            energies.append(energies_new[i])
            coordinates.append(coordinates_new[i])
            representations.append(representations_new[i])
            molobjs.append(molobjs_new[i])
            n_new += 1

        molcosts += [combo] * n_new
        n_total += n_new

        print(" - new", n_new)
        print("total", n_total, combo)

    sdfstr = "".join(cheminfo.molobj_to_sdfstr(molobj) for molobj in molobjs)
    with open(dumpdir + "all.sdf", 'w') as f:
        f.write(sdfstr)

    # One "<a> <b>" line per conformer, each newline-terminated.
    hellodump = "".join("{:} {:}".format(*combo) + "\n" for combo in molcosts)
    with open(dumpdir + "costs.csv", 'w') as f:
        f.write(hellodump)

    plt.plot(energies, 'k.')
    plt.yscale("log")
    plt.savefig(dumpdir + "energies")

    return
def overview_properties_pca():
    """Plot a 2-component ``kpca`` projection of a local kernel, coloured by property.

    Reads one float per line from ``data/sdf/subset_properties.csv`` and up
    to 500 molecules from ``data/sdf/subset_structures.sdf``.  A
    representation is generated for each molecule with
    ``generate_fchl_acsf``, a local kernel is built with
    ``qml.kernels.get_local_kernel``, and both the projection and the raw
    kernel matrix are saved to ``_tmp_pca_prop.png``.  Returns ``None``.
    """
    with open('data/sdf/subset_properties.csv', 'r') as f:
        properties = np.array([float(line) for line in f])

    molobjs = cheminfo.read_sdffile("data/sdf/subset_structures.sdf")

    elements = []
    mols_atoms = []
    mols_coord = []

    n_atoms = 0      # largest atom count seen, used as padding size
    n_items = 500    # maximum number of molecules to read

    for count, molobj in enumerate(molobjs, start=1):

        atoms, coord = cheminfo.molobj_to_xyz(molobj)

        mols_atoms.append(atoms)
        mols_coord.append(coord)

        # Keep a sorted, duplicate-free list of every element seen so far.
        elements = list(np.unique(elements + list(np.unique(atoms))))

        n_atoms = max(n_atoms, len(atoms))

        if count == n_items:
            break

    # Keep one property per molecule actually read; the file may hold fewer
    # than n_items molecules, and the scatter colours must match the kernel.
    properties = properties[:len(mols_atoms)]

    print(elements)
    print(n_atoms)
    print(len(mols_atoms))

    distance_cut = 20.0

    parameters = {
        "pad": n_atoms,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": elements,
    }

    representations = np.array([
        generate_fchl_acsf(atoms, coord, **parameters)
        for atoms, coord in zip(mols_atoms, mols_coord)
    ])

    sigma = 10.

    kernel = qml.kernels.get_local_kernel(
        representations, representations, mols_atoms, mols_atoms, sigma)

    print(kernel.shape)

    pca = kpca(kernel, n=2)

    fig, axs = plt.subplots(2, 1, figsize=(5, 10))

    # Top: projection coloured by property.  Bottom: the kernel matrix.
    sc = axs[0].scatter(*pca, c=properties)
    fig.colorbar(sc, ax=axs[0])

    im = axs[1].imshow(kernel)
    fig.colorbar(im, ax=axs[1])

    fig.savefig("_tmp_pca_prop.png")

    return
def mergesdfs(sdflist):
    """Print three similarity kernels between conformers of the first two SDF files.

    All files in ``sdflist`` are read, but only the first two are compared,
    using the first three conformers of the first file and the first four
    of the second.  The kernels come from
    ``workkernel.get_kernel_from_objs``, ``get_kernel_fchl`` and
    ``get_kernel_fchl_``.  Atoms are taken from the first molecule of the
    first file.
    """
    # Merge sdf0 with sdf1 and etc
    # Load every file first, then extract the coordinates per file.
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]
    coordinates_sdf = [
        [cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs]
        for molobjs in molobjs_list
    ]

    atoms, _xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # JCK
    sigmas = [0.8]
    kernel_args = {"sigma": sigmas}
    parameters = {
        'alchemy': 'off',
        "cut_distance": 10**6,
        'kernel_args': kernel_args,
    }

    repObjs_x = []
    for xyz in coordinates_x[:3]:
        repObjs_x.append(workkernel.FchlRepresentation(atoms, xyz, **parameters))

    repObjs_y = []
    for xyz in coordinates_y[:4]:
        repObjs_y.append(workkernel.FchlRepresentation(atoms, xyz, **parameters))

    similarity = workkernel.get_kernel_from_objs(repObjs_x, repObjs_y, **parameters)
    print("qml obj kernel:")
    print(similarity)

    # JCK
    # Energy are the same, compare with FCHL
    max_size = len(atoms)

    representations_x = get_representations_fchl(
        atoms, coordinates_x, max_size=max_size)
    representations_x = np.asarray(representations_x[:3])

    representations_y = get_representations_fchl(
        atoms, coordinates_y, max_size=max_size)
    representations_y = np.asarray(representations_y[:4])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    similarity = get_kernel_fchl_(representations_x, representations_y)
    print("qml fst kernel:")
    print(similarity)

    return