def generate_sdf(filename):
    """Read all conformers from an SDF file and compute per-conformer data.

    Returns four parallel lists: molobjs, energies, coordinates and
    representations, one entry per molecule in the file.
    """
    suppl = cheminfo.read_sdffile(filename)

    # Seed all four lists from the first conformer; its atom list is reused
    # for every later conformer (assumes all entries share one topology —
    # TODO confirm against callers).
    molobj = next(suppl)
    atoms, coord = cheminfo.molobj_to_xyz(molobj)
    energy = worker.get_energy(molobj)
    representation = sim.get_representation(atoms, coord)

    molobjs = [molobj]
    energies = [energy]
    coordinates = [coord]
    representations = [representation]

    # Collect the remaining conformers.
    for molobj in suppl:
        energy = worker.get_energy(molobj)
        coord = cheminfo.molobj_get_coordinates(molobj)
        representation = sim.get_representation(atoms, coord)
        molobjs.append(molobj)
        energies.append(energy)
        coordinates.append(coord)
        representations.append(representation)

    return molobjs, energies, coordinates, representations
def conformation(filename, procs=0):
    """Generate conformations for every molecule in *filename*.

    With procs == 0 the work runs serially; otherwise it is fanned out
    through misc.parallel. Results are written to the scratch directory
    as a side effect; nothing is returned.
    """
    scr = "_tmp_ensemble_/"
    molecules = cheminfo.read_sdffile(filename)

    if procs == 0:
        # Serial path.
        for im, molecule in enumerate(molecules):
            get_conformations((im, molecule), scr=scr)
    else:
        # Parallel path: lazily yield (index, molecule) work packages.
        def workpackages():
            for im, molecule in enumerate(molecules):
                yield im, molecule

        lines = workpackages()
        results = misc.parallel(lines, get_conformations, [], {"scr": scr},
                                procs=procs)
        # Drain the result iterator so all workers actually run.
        for result in results:
            pass

    return
def get_avg_repr(idx, scr="_tmp_ensemble_/", **kwargs):
    """Boltzmann-average the representations of conformer ensemble *idx*.

    Loads <scr><idx>.energies and <scr><idx>.sdf, weights each conformer
    representation by exp(-E) (normalized), and either stores the average
    into kwargs["array"][idx] or returns (idx, avgrep).
    """
    name = "slatm"
    energies = misc.load_npy(scr + str(idx) + ".energies")
    molobjs = cheminfo.read_sdffile(scr + str(idx) + ".sdf")
    molobjs = [mol for mol in molobjs]
    xyzs = molobjs_to_xyzs(molobjs)
    reprs = xyzs_to_representations(*xyzs, **kwargs)

    # Boltzmann factors (energies presumably already in units of kT —
    # TODO confirm).
    factors = np.exp(-energies)
    factors /= np.sum(factors)

    length = reprs.shape[1]
    avgrep = np.zeros(length)
    for rep, factor in zip(reprs, factors):
        avgrep += factor * rep

    print(idx, avgrep.shape)

    if "array" in kwargs:
        # Shared-memory output mode: write in place, return nothing.
        results = kwargs["array"]
        results[idx, :] = avgrep
    else:
        return idx, avgrep
def main():
    """Extract a subset of molecules/properties and copy their conformers."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="tmp2/")
    parser.add_argument('--randomseed', action='store', help='random seed',
                        metavar="int", default=666)
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j', '--cpu', action='store', help='pararallize',
                        metavar="int", default=0)
    args = parser.parse_args()

    molecules = cheminfo.read_sdffile('data/sdf/structures.sdf.gz')
    properties = open('data/sdf/properties.csv', 'r')
    sub_mol, sub_prop, idxs = search_molcules(molecules, properties)
    properties.close()

    # Write the selected structures and properties side by side.
    fm = open('data/sdf/subset_structures.sdf', 'w')
    fp = open('data/sdf/subset_properties.csv', 'w')
    for mol, prop in zip(sub_mol, sub_prop):
        sdf = cheminfo.molobj_to_sdfstr(mol)
        fm.write(sdf)
        fm.write("$$$$\n")
        fp.write(str(prop) + "\n")
    fm.close()
    fp.close()

    # Copy the matching conformer ensembles, renumbering them 0..N-1.
    for i, idx in enumerate(idxs):
        from_dir = "_tmp_ensemble_/"
        to_dir = "_tmp_subset_/conformers/"
        cmd = "cp {:}{:}.sdf {:}{:}.sdf".format(from_dir, str(idx), to_dir,
                                                str(i))
        list(misc.shell(cmd))
        cmd = "cp {:}{:}.energies.npy {:}{:}.energies.npy".format(
            from_dir, str(idx), to_dir, str(i))
        list(misc.shell(cmd))
        print(cmd)

    return
def merge_results_cumulative(sdffile, filenames, debug=True, molid=0):
    """Accumulate unique conformer results over several result files.

    For each result file, new conformers are compared against everything
    collected so far (merge_asymmetric); only entries with no match
    (empty index list) are appended.
    """
    energies = []
    coordinates = []
    representations = []
    atoms = []
    n_total = 0

    molobjs = cheminfo.read_sdffile(sdffile[0])
    molobjs = [molobj for molobj in molobjs]
    atoms_list = [cheminfo.molobj_to_atoms(molobj) for molobj in molobjs]

    for filename in filenames:
        energies_next, coordinates_next, atoms = read_resulttxt(
            atoms_list, filename)
        representations_next = [
            sim.get_representation(atoms, coord)
            for coord in coordinates_next
        ]

        # First file: take everything.
        if len(energies) == 0:
            energies += energies_next
            coordinates += coordinates_next
            representations += representations_next
            n_total += len(energies_next)
            continue

        idxs = merge_asymmetric(atoms, energies_next, energies,
                                representations_next, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):
            # Non-empty idxl means this conformer matched an existing one.
            if len(idxl) > 0:
                continue
            energies.append(energies_next[i])
            coordinates.append(coordinates_next[i])
            representations.append(representations_next[i])
            n_new += 1

        # NOTE(review): in the original layout it is ambiguous whether the
        # n_total update was inside the debug guard; counting is kept
        # unconditional here — confirm against the reference run.
        n_total += n_new
        if debug:
            print(" - new", n_new)

    print("total", n_total)
    return
def main_redis(args):
    """Run the redis-backed worker loop over molecules from args.sdf."""
    redis_task = args.redis_task

    # NOTE(review): the guard tests args.redis_connect but the value read is
    # args.redis_connection_str — looks like mismatched attribute names;
    # confirm against the argparse definition before changing.
    if args.redis_connect is not None:
        redis_connection = args.redis_connection_str
    else:
        if not os.path.exists(args.redis_connect_file):
            print("error: redis connection not set and file does not exists")
            print("error: path", args.redis_connect_file)
            quit()
        with open(args.redis_connect_file, 'r') as f:
            redis_connection = f.read().strip()

    if args.debug:
        print("redis: connecting to", redis_connection)

    tasks = rediscomm.Taskqueue(redis_connection, redis_task)

    # Prepare moldb
    molecules = cheminfo.read_sdffile(args.sdf)
    molecules = [molobj for molobj in molecules]

    # Prepare tordb
    if args.sdftor is None:
        tordb = [cheminfo.get_torsions(molobj) for molobj in molecules]
    else:
        tordb = read_tordb(args.sdftor)

    # Make origin coordinates, one per molecule.
    origins = []
    for molobj in molecules:
        xyz = cheminfo.molobj_get_coordinates(molobj)
        origins.append(xyz)

    # TODO if threads is > 0 then make more redis_workers
    do_work = lambda x: redis_worker(origins, molecules, tordb, x,
                                     debug=args.debug)
    tasks.main_loop(do_work)

    return
def main_file(args):
    """File-based entry point: either run a job file or generate jobs."""
    suppl = cheminfo.read_sdffile(args.sdf)
    molobjs = [molobj for molobj in suppl]

    # Torsion database: from file when provided, otherwise computed.
    if args.sdftor:
        tordb = read_tordb(args.sdftor)
    else:
        tordb = [cheminfo.get_torsions(molobj) for molobj in molobjs]

    if args.jobfile:
        run_jobfile(molobjs, tordb, args.jobfile, threads=args.threads)
    else:
        # TODO Base on tordb
        generate_jobs(molobjs, args, tordb=tordb)

    return
def get_sdfcontent(sdffile, rtn_atoms=False):
    """Collect energies and coordinates for every molecule in *sdffile*.

    When rtn_atoms is True, additionally returns the first molobj and the
    atom list (taken from the last molecule read — all entries are assumed
    to share one topology; TODO confirm).
    """
    coordinates = []
    energies = []

    reader = cheminfo.read_sdffile(sdffile)
    molobjs = [molobj for molobj in reader]

    atoms = ""
    for molobj in molobjs:
        atoms, coordinate = cheminfo.molobj_to_xyz(molobj)
        energy = get_energy(molobj)
        coordinates.append(coordinate)
        energies.append(energy)

    if rtn_atoms:
        return molobjs[0], atoms, energies, coordinates

    return energies, coordinates
def parse_ochem(filename, procs=0, debug=False):
    """Parse an OChem SDF, returning (molobjs, properties) for valid entries.

    With procs > 0 a multiprocessing pool parses (molobj, props) pairs;
    otherwise each molobj is parsed serially.
    """
    molobjs = cheminfo.read_sdffile(filename)

    success_properties = []
    success_molobjs = []

    if procs > 0:
        # NOTE(review): the parallel path calls parse_molandprop while the
        # serial path calls parse_molobj — confirm these are equivalent.
        def generate_input(molobjs):
            for molobj in molobjs:
                if molobj is None:
                    continue
                props = molobj.GetPropsAsDict()
                yield molobj, props

        pool = mp.Pool(processes=procs)
        results = pool.map(parse_molandprop, generate_input(molobjs))

        for molobj, value in results:
            if molobj is None:
                continue
            success_properties.append(value)
            success_molobjs.append(molobj)
    else:
        for molobj in molobjs:
            molobj, value = parse_molobj(molobj, debug=debug)
            if molobj is None:
                continue
            success_properties.append(value)
            success_molobjs.append(molobj)

    return success_molobjs, success_properties
def test_kernel():
    """Benchmark the Python vs Fortran Jaccard kernel on 5000 molecules."""
    smiles = ['c1ccccn1']
    smiles += ['c1ccco1']
    smiles += ['Oc1ccccc1']
    smiles += ['Nc1ccccc1']
    smiles += ['CCO']
    smiles += ['CCN']
    molobjs = [cheminfo.smiles_to_molobj(x)[0] for x in smiles]
    # NOTE(review): the smiles-built list above is immediately discarded and
    # replaced by the SDF read below — likely leftover toggling code.
    molobjs = cheminfo.read_sdffile("_tmp_bing_bp_/structures.sdf.gz")
    molobjs = [next(molobjs) for _ in range(5000)]

    init = time.time()
    vectors = molobjs_to_fps(molobjs)
    print("init", time.time() - init)

    # Pure-python kernel timing.
    time_pykernel = time.time()
    kernel = bitmap_jaccard_kernel(vectors)
    print("pykernel", time.time() - time_pykernel)
    print(kernel)
    del kernel

    n_items = vectors.shape[0]
    vectors = vectors.T
    vectors = np.array(vectors, dtype=int)

    # Fortran kernel timing.
    time_fkernel = time.time()
    kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, vectors)
    print("fokernel", time.time() - time_fkernel)
    print(kernel)

    return
def main():
    """Print the torsion list of every molecule in an SDF, one line each."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--sdf', type=str, help='SDF file', metavar='file',
                        default="~/db/qm9s.sdf.gz")
    parser.add_argument('--debug', action="store_true", help="",
                        default=False)
    args = parser.parse_args()

    max_mols = 500  # NOTE(review): unused while the break below is commented

    molobjs = cheminfo.read_sdffile(args.sdf)
    tordb = []
    for i, molobj in enumerate(molobjs):
        # if i > max_mols: break
        torsions = cheminfo.get_torsions(molobj)
        tordb.append(torsions)

    # One output line per molecule: "<idx> : a b c d,a b c d,..."
    for i, torsions in enumerate(tordb):
        torsions_str = []
        for torsion in torsions:
            fmt = " ".join(["{:}"] * 4).format(*torsion)
            torsions_str.append(fmt)
        state = ",".join(torsions_str)
        print(i, ":", state)

    return
def main():
    """Filter molecules/properties and write the subset to scratch.

    Reads args.sdf and args.properties in lockstep, keeps entries passing
    molobjfilter and valuefilter, writes a gzipped SDF plus CSV, and dumps
    a smiles -> values dict as json and pickle.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="DIR",
                        default="_tmp_")
    parser.add_argument('--sdf', action='store', help='', metavar="FILE")
    parser.add_argument('--properties', action='store', help='',
                        metavar="FILE")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    moledict = {}

    # Context managers so the output files are closed (and flushed) even if
    # filtering raises — the original leaked handles on error.
    with gzip.open(args.scratch + "structures.sdf.gz", 'w') as fsdf, \
            open(args.scratch + "properties.csv", 'w') as fprop, \
            open(args.properties, 'r') as properties:

        molecules = cheminfo.read_sdffile(args.sdf)

        for molobj, line in zip(molecules, properties):
            if not molobjfilter(molobj):
                continue
            if not valuefilter(line):
                continue

            smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
            print(smiles)

            sdfstr = cheminfo.molobj_to_sdfstr(molobj)
            sdfstr += "$$$$\n"
            # gzip file is binary: encode the SDF text.
            fsdf.write(sdfstr.encode())
            fprop.write(line)

            # First column is an id; the rest are float property values.
            values = [float(x) for x in line.split()[1:]]
            moledict[smiles] = values

    misc.save_json(args.scratch + "molecules", moledict)
    misc.save_obj(args.scratch + "molecules", moledict)

    return
def main():
    """Predict melting points with the Joback method and plot vs truth."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Read properties
    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")
    molecules = list(molecules)

    heavy_atoms = []
    predictions = []
    errors = []  # 1 = Joback failed for this molecule, 0 = success

    for mol, prop in zip(molecules, properties):
        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        J = thermo.joback.Joback(smi)
        # J = thermo.joback.Joback('CC(=O)C')
        # J = thermo.joback.Joback('CCC(=O)OC(=O)CC')
        status = J.status

        # Count heavy atoms (drop hydrogens).
        atoms, coord = cheminfo.molobj_to_xyz(mol)
        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        heavy_atoms.append(len(atoms))

        if "Did not match all atoms present" in status:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        try:
            estimate = J.estimate()
        except TypeError:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)
        T_b = estimate["Tb"]  # boiling point, currently unused
        T_m = estimate["Tm"]
        predictions.append(T_m)

    errors = np.array(errors, dtype=int)
    idx_success, = np.where(errors == 0)

    heavy_atoms = np.array(heavy_atoms)
    predictions = np.array(predictions)
    properties = np.array(properties)

    # Keep only molecules Joback could handle.
    predictions = predictions[idx_success]
    properties = properties[idx_success]
    heavy_atoms = heavy_atoms[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.savefig("_fig_joback")
    plt.clf()

    return
def mergesdfs(sdflist):
    """Compare two conformer SDFs with FCHL kernels (object and array APIs).

    Reads the first two files of *sdflist*, builds FCHL representations for
    slices of each, and prints the cross-kernels for comparison.
    """
    # Load every molecule of every file.
    molobjs_list = []
    for sdf in sdflist:
        sdfs = cheminfo.read_sdffile(sdf)
        molobjs = [molobj for molobj in sdfs]
        molobjs_list.append(molobjs)

    coordinates_sdf = []
    for molobjs in molobjs_list:
        coordinates = [
            cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs
        ]
        coordinates_sdf.append(coordinates)

    # Atom list taken from the first molecule of the first file; all
    # conformers are assumed to share it.
    atoms, xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # Object-based kernel path.
    sigmas = [0.8]
    parameters = {
        'alchemy': 'off',
        "cut_distance": 10**6,
        'kernel_args': {
            "sigma": sigmas
        },
    }

    repObjs_x = [
        workkernel.FchlRepresentation(atoms, coordinates, **parameters)
        for coordinates in coordinates_x[:3]
    ]
    repObjs_y = [
        workkernel.FchlRepresentation(atoms, coordinates, **parameters)
        for coordinates in coordinates_y[:4]
    ]

    similarity = workkernel.get_kernel_from_objs(repObjs_x, repObjs_y,
                                                 **parameters)
    print("qml obj kernel:")
    print(similarity)

    # Array-based FCHL path; energies are the same, compare with FCHL.
    representations_x = get_representations_fchl(atoms, coordinates_x,
                                                 max_size=len(atoms))
    representations_x = np.asarray(representations_x[:3])
    representations_y = get_representations_fchl(atoms, coordinates_y,
                                                 max_size=len(atoms))
    representations_y = np.asarray(representations_y[:4])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    similarity = get_kernel_fchl_(representations_x, representations_y)
    print("qml fst kernel:")
    print(similarity)

    return
import qml
from chemhelp import cheminfo
import numpy as np
from rdkit import Chem
import sys

# Script: scan an SDF for molecules with near-zero interatomic distances
# (collapsed geometries) and redo a conformational search for those.

args = sys.argv[1:]
filename = args[0]

molobjs = cheminfo.read_sdffile(filename)

for i, molobj in enumerate(molobjs):
    # NOTE(review): next() here skips every other molecule of the generator
    # being iterated — confirm this interleaving is intended.
    molobj = next(molobjs)

    # stat = cheminfo.molobj_optimize(molobj)
    # print(stat)

    dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
    # Mask the zero diagonal so the minimum reflects real atom pairs.
    np.fill_diagonal(dist, 10.0)
    min_dist = np.min(dist)

    if min_dist < 0.01:
        print(i, min_dist)
        # Geometry is degenerate: rebuild from smiles.
        smi = cheminfo.molobj_to_smiles(molobj)
        molobj = cheminfo.conformationalsearch(smi)
        dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
        np.fill_diagonal(dist, 10.0)
def mergesdfs(sdflist):
    """Compare two conformer SDFs using fchl19 and fchl18 kernels.

    Reads the first two files of *sdflist* and prints the 5x5 cross-kernel
    for both representation flavours.
    """
    # Load every molecule of every file.
    molobjs_list = []
    for sdf in sdflist:
        sdfs = cheminfo.read_sdffile(sdf)
        molobjs = [molobj for molobj in sdfs]
        molobjs_list.append(molobjs)

    coordinates_sdf = []
    for molobjs in molobjs_list:
        coordinates = [
            cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs
        ]
        coordinates_sdf.append(coordinates)

    # Shared atom list from the first molecule of the first file.
    atoms, xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # fchl19 path.
    representations19_x = [
        sim.get_representation(atoms, coordinates)
        for coordinates in coordinates_x
    ]
    representations19_y = [
        sim.get_representation(atoms, coordinates)
        for coordinates in coordinates_y
    ]
    representations19_x = np.asarray(representations19_x[:5])
    representations19_y = np.asarray(representations19_y[:5])

    nx = len(representations19_x)
    ny = len(representations19_y)

    similarity = sim.get_kernel(representations19_x, representations19_y,
                                [atoms] * nx, [atoms] * ny)
    print(similarity)

    # fchl18 path.
    representations_x = get_representations_fchl(atoms, coordinates_x,
                                                 max_size=len(atoms))
    representations_x = np.asarray(representations_x[:5])
    representations_y = get_representations_fchl(atoms, coordinates_y,
                                                 max_size=len(atoms))
    representations_y = np.asarray(representations_y[:5])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    return
def overview_properties_pca():
    """Kernel-PCA overview plot of the subset, colored by property value."""
    elements = []

    with open('data/sdf/subset_properties.csv', 'r') as f:
        properties = f.readlines()
    properties = [float(x) for x in properties]
    properties = np.array(properties)

    representations = []
    molobjs = cheminfo.read_sdffile("data/sdf/subset_structures.sdf")

    mols_atoms = []
    mols_coord = []
    n_atoms = 0     # running max atom count, used as the rep padding
    n_items = 500   # cap on molecules read

    for i, molobj in enumerate(molobjs):
        atoms, coord = cheminfo.molobj_to_xyz(molobj)
        mols_atoms.append(atoms)
        mols_coord.append(coord)
        elements += list(np.unique(atoms))
        elements = list(np.unique(elements))
        if len(atoms) > n_atoms:
            n_atoms = len(atoms)
        i += 1
        if i == n_items:
            break

    properties = properties[:n_items]

    print(elements)
    print(n_atoms)
    print(len(mols_atoms))

    distance_cut = 20.0
    parameters = {
        "pad": n_atoms,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": elements
    }

    for atoms, coord in zip(mols_atoms, mols_coord):
        representation = generate_fchl_acsf(atoms, coord, **parameters)
        representations.append(representation)

    representations = np.array(representations)

    sigma = 10.
    kernel = qml.kernels.get_local_kernel(representations, representations,
                                          mols_atoms, mols_atoms, sigma)
    print(kernel.shape)

    # Project to 2D and plot PCA next to the raw kernel.
    pca = kpca(kernel, n=2)

    fig, axs = plt.subplots(2, 1, figsize=(5, 10))
    sc = axs[0].scatter(*pca, c=properties)
    fig.colorbar(sc, ax=axs[0])
    im = axs[1].imshow(kernel)
    fig.colorbar(im, ax=axs[1])
    fig.savefig("_tmp_pca_prop.png")

    return
def main():
    """Fit properties against a simple geometric descriptor and plot."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:
        # atoms = cheminfo.molobj_to_atoms(mol)
        atoms, coord = cheminfo.molobj_to_xyz(mol)

        # Heavy atom count (hydrogens excluded).
        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        heavy_atoms.append(len(atoms))

        # Convex-hull volume of the geometry ("QJ" joggles degenerate input).
        hull = ConvexHull(coord, qhull_options="QJ")
        volumes.append(hull.volume)

        # Mean pairwise interatomic distance.
        avgdist = distance.pdist(coord)
        distances.append(np.mean(avgdist))

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    # Cubic polynomial fit of property vs mean distance.
    representation = distances
    p = np.polyfit(representation, properties, 3)
    p = np.poly1d(p)
    results = p(representation)

    rmse_error = rmse(results, properties)
    print(rmse_error)

    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, p(x_prop), "k-")
    plt.savefig("i_can_member_it")
    plt.clf()

    return
def main():
    """Parse result files for a range of molecule ids, optionally parallel."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--method', action='store',
                        help='What QM program to use', metavar='str',
                        default="PM6")
    parser.add_argument('--txtfmt', help='format for cost mergeing',
                        metavar="STR")
    parser.add_argument('--sdf', nargs="+", action='store', help='',
                        metavar='FILE')
    parser.add_argument('--molid', action='store',
                        help='What molid from sdf should be used for txt',
                        metavar='int(s)')
    parser.add_argument('--debug', action='store_true',
                        help='debug statements')
    parser.add_argument('-j', '--procs', type=int,
                        help='Use multiple processes (default=0)', default=0)
    args = parser.parse_args()

    if args.sdf is None:
        print("error: actually we need sdfs information")
        quit()

    # "--molid A" selects one id; "--molid A-B" selects the inclusive range.
    molidxs = [int(molidx) for molidx in args.molid.split("-")]
    if len(molidxs) > 1:
        molidxs = range(molidxs[0], molidxs[1] + 1)

    # molobj db
    molobjs = [molobj for molobj in cheminfo.read_sdffile(args.sdf[0])]

    dump_results = "_tmp_results_data1/{:}.results"

    if args.procs > 0:
        parallel_parse_results(args.txtfmt, molobjs, molidxs, dump_results,
                               procs=args.procs)
        return

    for idx in molidxs:
        # NOTE(review): the dump filename uses args.molid (the raw argument)
        # rather than idx — every iteration writes to the same file; confirm.
        parse_results(idx, args.txtfmt, molobjs,
                      dump_results="_tmp_results_data1/{:}.results".format(
                          args.molid))

    return
def main():
    """Merge molecule databases from SDFs and dicts; optionally Venn plot."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="DIR",
                        default="_tmp_")
    parser.add_argument('--sdf', action='store', help='', metavar="FILE",
                        nargs="+", default=[])
    parser.add_argument('--dict', action='store', help='', metavar="FILE",
                        nargs="+", default=[])
    parser.add_argument('--name', action='store', help='', metavar="STR",
                        nargs="+")
    parser.add_argument('--filename', action='store', help='', metavar="STR")
    parser.add_argument('--filter', action='store_true', help='')
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    print()

    databases_set = []   # one smiles-set per input source
    databases_dict = []  # raw dicts (only from --dict inputs)

    # Smiles sets from SDF files.
    for sdf in args.sdf:
        molobjs = cheminfo.read_sdffile(sdf)
        molobjs = list(molobjs)
        smiles = [
            cheminfo.molobj_to_smiles(molobj, remove_hs=True)
            for molobj in molobjs
        ]
        smiles = set(smiles)
        databases_set.append(smiles)
        print(sdf, len(smiles))

    # Smiles sets (and data) from pickled dicts.
    for filename in args.dict:
        data = misc.load_obj(filename)
        smiles = data.keys()
        smiles = set(smiles)
        databases_set.append(smiles)
        databases_dict.append(data)
        print(filename, len(smiles))

    if args.scratch is not None:
        # Merge dict databases key-wise, concatenating value lists.
        everything = {}
        for data in databases_dict:
            for key in data.keys():
                if key not in everything:
                    everything[key] = []
                everything[key] += data[key]

        if args.filter:
            everything = filter_dict(everything)

        keys = everything.keys()
        print("n items", len(keys))

        # Save
        misc.save_json(args.scratch + "molecule_data", everything)
        misc.save_obj(args.scratch + "molecule_data", everything)

    # Venn diagram only when labels were given (2 or 3 sets supported).
    if args.name is not None:
        n_db = len(databases_set)
        if n_db == 2:
            venn2(databases_set, set_labels=args.name)
        elif n_db == 3:
            venn3(databases_set, set_labels=args.name)
        plt.savefig(args.scratch + "venndiagram")

    return
def main():
    """OLS learning-curve driver: 5-fold CV over cached features."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed',
                        metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Features: load from cache when present, otherwise extract and cache.
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)
    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train: 5-fold CV, learning curve over the predefined n_train sizes.
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45, shuffle=True)

    scores = []
    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):
        # Reshuffle the (ordered) train split deterministically per fold.
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]
            # fit_model returns signed prediction errors on the test split.
            sign_diff = fit_model(features, idxs, idxs_test)
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)

    misc.save_npy(args.scratch + "score.ols", scores)

    return
def main():
    """Random-forest learning-curve driver over RDKit fingerprints.

    L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    Loads (or computes and uses) the rdkit fingerprint matrix, runs the
    predefined CV splits over the n_train learning-curve sizes, and saves
    the per-fold RMSE curves to score.rfr.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed',
                        metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    X = []
    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
        print("loaded")
    except Exception:
        # Cache miss (or unreadable cache): recompute fingerprints.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit — narrowed to Exception.
        for molobj in molobjs:
            bitmap = fingerprints.get_rdkitfp(molobj)
            X.append(bitmap)
        X = np.asarray(X)

    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")

    # CV over index splits.
    idxs = np.array(list(range(len(properties))), dtype=int)

    scores = []
    for idxs_train, idxs_test in cv.cross_view(idxs):

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]
            clf = get_best_rfr(X[idxs], y[idxs])

            # Test-set RMSE for this training size.
            predictions = clf.predict(X[idxs_test])
            diff = predictions - y[idxs_test]
            diff = diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)

    misc.save_npy(args.scratch + "score.rfr", scores)
def main():
    """Compute and cache molecular representations for an SDF dataset.

    Coordinate-based representations (cm, fchl18, fchl19, slatm, bob) and
    molecule-based fingerprints (morgan, rdkitfp) are computed per the
    --representations selection and saved under --scratch.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir",
                        default="_tmp_")
    parser.add_argument('--conformers', action='store_true', help='')
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)
    parser.add_argument('-r', '--representations', action='store', help='',
                        metavar="STR", nargs="+")
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # -1 means "use every core".
    if args.procs == -1:
        args.procs = int(os.cpu_count())
        print("set procs", args.procs)

    representation_names_coordbased = [
        "cm", "fchl18", "fchl19", "slatm", "bob"
    ]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if args.representations is None:
        representation_names = ["slatm", "bob", "cm", "rdkitfp", "morgan"]
    else:
        representation_names = args.representations

    molobjs = cheminfo.read_sdffile(args.sdf)
    molobjs = [mol for mol in molobjs]
    xyzs = molobjs_to_xyzs(molobjs)
    mol_atoms, mol_coords = xyzs
    misc.save_obj(args.scratch + "atoms", mol_atoms)

    # Collect the unique atom types across the dataset.
    unique_atoms = []
    for atoms in mol_atoms:
        unique_atoms += list(np.unique(atoms))
    unique_atoms = np.array(unique_atoms)
    unique_atoms = unique_atoms.flatten()
    unique_atoms = np.unique(unique_atoms)

    # Largest molecule sets the representation padding.
    max_atoms = [len(atoms) for atoms in mol_atoms]
    max_atoms = max(max_atoms)

    n_items = len(mol_coords)

    print("total mols:", n_items)
    print("atom types:", unique_atoms)
    print("max atoms: ", max_atoms)
    print()
    print("representations:", representation_names)
    print()

    misc.save_txt(args.scratch + "n_items", n_items)

    # Gas phase: coordinate-based representations.
    for name in representation_names:
        if name not in representation_names_coordbased:
            continue
        representations = xyzs_to_representations(mol_atoms, mol_coords,
                                                  name=name,
                                                  scr=args.scratch,
                                                  max_atoms=max_atoms,
                                                  procs=args.procs)
        # ndarray -> .npy, anything else -> pickle.
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(args.scratch + "repr." + name, representations)
        else:
            misc.save_obj(args.scratch + "repr." + name, representations)
        # Free the (possibly large) array before the next representation.
        representations = None
        del representations

    # Molecule-based fingerprints.
    for name in representation_names:
        if name not in representation_names_molbased:
            continue
        representations = molobjs_to_representations(molobjs, name=name,
                                                     procs=args.procs)
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(args.scratch + "repr." + name, representations)
        else:
            misc.save_obj(args.scratch + "repr." + name, representations)
        representations = None
        del representations

    quit()

    # Ensemble
    # if args.conformers:
    #     generate_conformer_representation(scr=args.scratch, procs=args.procs)

    return
def main():
    """Merge conformer result files (txt) against SDF molecule databases."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--txt', nargs="+",
                        help='Read results from txt file (require sdf)',
                        metavar="FILE")
    parser.add_argument('--txtfmt', help='format for cost mergeing',
                        metavar="STR")
    parser.add_argument('--sdf', nargs="+", action='store', help='',
                        metavar='FILE')
    parser.add_argument('--sdfstdin', action='store_true',
                        help='Read sdf files from stdin')
    parser.add_argument('--txtstdin', action='store_true',
                        help='Read txt files from stdin')
    parser.add_argument('--molid', action='store',
                        help='What molid from sdf should be used for txt',
                        metavar='int')
    parser.add_argument('--format', action='store',
                        help='What output format? (sdf, txt)', metavar='str')
    parser.add_argument('--dump', action='store_true',
                        help='dump sdf str to stdout')
    parser.add_argument('--debug', action='store_true',
                        help='debug statements')
    parser.add_argument('-j', '--procs', type=int,
                        help='Merge using multiprocessing', default=0)
    args = parser.parse_args()

    if args.sdf is None:
        print("error: actually we need sdfs to merge")
        quit()

    molobjs = [molobj for molobj in cheminfo.read_sdffile(args.sdf[0])]

    # Parse txt files individually.
    if args.txtstdin:
        filenames = easyusage.stdin()
        merge_individual(molobjs, filenames, procs=args.procs)
    elif args.txt:
        filenames = [txt for txt in args.txt]
        merge_individual(molobjs, filenames, procs=args.procs)

    # Cost merge using txtfilenames format.
    # Should have three {:} expandables.
    if args.txtfmt:
        molidxs = args.molid
        molidxs = molidxs.split("-")
        if len(molidxs) == 1:
            molidxs = [int(molidx) for molidx in molidxs]
        else:
            molidxs = [int(molidx) for molidx in molidxs]
            molidxs = range(molidxs[0], molidxs[1] + 1)

        merge_cost_multiple(molidxs, molobjs, args.txtfmt)

        # molobj = molobjs[args.molid]
        # energies, coordinates, costs = merge_results_cumulative_prime(args.molid, molobj, args.txtfmt)

        # NOTE(review): molobj/energies/coordinates/costs are never assigned
        # on this path (the assignments above are commented out), so the
        # dump below raises NameError if reached — confirm intent.
        if args.format == "sdf":
            dump_sdf(molobj, energies, coordinates, costs)
        else:
            out = dump_txt(energies, coordinates, costs)
            print(out)

        quit()

    if args.txt is None:
        merge_sdfs(args.sdf)
    else:
        # TODO Need flags
        # merge_results(args.sdf, args.txt)
        # merge_results_cumulative(args.sdf, args.txt)
        # molobj, energies, coordinates, costs = merge_results_cumulative_prime(args.sdf, args.txt)

        # NOTE(review): same undefined-name hazard as above on this path.
        if args.format == "sdf":
            dump_sdf(molobj, energies, coordinates, costs)
        else:
            out = dump_txt(energies, coordinates, costs)
            print(out)

    return