def __init__(self, final_matrix_name, word_dict_name, train_test_split):
    self.final_matrix = load_obj(final_matrix_name).astype('float')
    self.word_dict = load_obj(word_dict_name)
    self.num_words = len(self.word_dict)
    self.train_test_split = train_test_split
    # Rows up to the split index are training samples, the rest are test samples
    self.X_train = self.final_matrix[:self.train_test_split, :]
    self.X_test = self.final_matrix[self.train_test_split:, :]
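# Hypothetical usage of the constructor above; the class name, pickle names
# and split index are placeholders, not taken from the project:
#
#   prep = MatrixPreparer("final_matrix", "word_dict", train_test_split=100000)
#   print(prep.X_train.shape, prep.X_test.shape, prep.num_words)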
def create_indices_matrix(self, is_dict_existing):
    """
    :param is_dict_existing: if a vocabulary dictionary already exists
        (from previous calls), load it instead of creating a new one
    """
    if is_dict_existing:
        word_dict = load_obj(self.dict_name)
    else:
        # Create a new vocabulary
        word_dict = self.voc2index()

    # Replace every word in the cell with its matching vocabulary index
    word_2_num_sentence = lambda t: [word_dict[word] for word in t.split()]
    # Treat the whole cell content as a single token and replace it with its
    # matching vocabulary index
    word_2_num_one_word = lambda t: [word_dict[t]]

    # For each column of the table, replace (where needed) the words or
    # sentence with the matching indices from the vocabulary
    names_indices = np.array(
        [word_2_num_sentence(t) for t in self.short_sentences_table[:, 0]])
    item_conditions = np.expand_dims(
        self.short_sentences_table[:, 1].astype('float'), axis=1)
    category_names_indices = np.array(
        [word_2_num_one_word(t) for t in self.short_sentences_table[:, 2]])
    brand_names_indices = np.array(
        [word_2_num_one_word(t) for t in self.short_sentences_table[:, 3]])
    price = np.expand_dims(
        self.short_sentences_table[:, 4].astype('float'), axis=1)
    is_shipping = np.expand_dims(
        self.short_sentences_table[:, 5].astype('float'), axis=1)

    indices_matrix = np.concatenate(
        (names_indices, item_conditions, category_names_indices,
         brand_names_indices, price, is_shipping),
        axis=1)
    save_obj(indices_matrix, self.final_matrix_name)
    return indices_matrix
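# A minimal sketch of the two mappings used above, with a toy vocabulary
# (word_dict contents and the example strings are hypothetical):
#
#   word_dict = {"red": 0, "shirt": 1, "Nike": 2}
#   word_2_num_sentence = lambda t: [word_dict[word] for word in t.split()]
#   word_2_num_one_word = lambda t: [word_dict[t]]
#   word_2_num_sentence("red shirt")  # -> [0, 1]
#   word_2_num_one_word("Nike")       # -> [2]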
def dump_distances_and_kernels(scr, name, procs=0):

    # TODO Properties should be read by scr!!
    # properties
    # print("Saving properties")
    # with open(scr + 'properties.csv', 'r') as f:
    #     properties = f.readlines()
    # properties = [x.split()[0] for x in properties]
    # properties = [float(x) for x in properties]
    # properties = np.array(properties)
    # print(properties.shape)
    # misc.save_npy(scr + "properties", properties)

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # Prepare fchl kernels
    if name == "fchl18":
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)
        del reps
        del kernels

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        del dist

    elif name in representation_names_molbased:
        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        representations_fp = np.asarray(representations_fp, dtype=float)

        # Alternative Jaccard kernels, kept for reference:
        # t = time.time()
        # print("jaccard numpy")
        # kernel = fingerprints.bitmap_jaccard_kernel(representations_fp)
        # print("time", time.time() - t)

        # FORTRAN KERNEL
        # n_items = representations_fp.shape[0]
        # t = time.time()
        # print("jaccard fortran")
        # representations_fp = np.array(representations_fp, dtype=int).T
        # kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, representations_fp)
        # print("time", time.time() - t)

        # kernel = fingerprints.fingerprints_to_kernel(representations_fp, representations_fp, procs=procs)
        # misc.save_npy(scr + "kernel." + name, kernel)

        print(os.environ.get("OMP_NUM_THREADS"))

        # DISTANCE
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")
        print(dist.shape)

    else:
        print("error: unknown representation", name)
        quit()

    return
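# generate_l2_distances is defined elsewhere in this project; a minimal
# sketch of what it is assumed to compute (the full symmetric matrix of
# pairwise Euclidean distances between representation vectors):
import numpy as np
from scipy.spatial.distance import pdist, squareform


def generate_l2_distances(representations):
    """Return the n x n matrix of pairwise L2 distances."""
    return squareform(pdist(representations, metric="euclidean"))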
from showing import ploting
from misc import load_obj

# dict = load_obj("all_favourite_methods_x1000", resolution=0.7)
# dict = load_obj("exp-ep-greedy-Dolinar_x500", resolution=0.7)
name = "all_methods_x12_ep100"
dict = load_obj(name, resolution=0.1, layers=2)
# dict = load_obj("ep-greedy-Dolinar_x1", resolution=0.33)

# for i in dict.keys():
#     print(i, dict[i]["label"])

# interesting = ["run_10", "run_16", "run_2"]
interesting = ["run_1", "run_2", "run_3", "run_4", "run_5"]
# interesting = dict.keys()

dict_plot = {}
for i in interesting:
    dict_plot[i] = dict[i]

ploting(dict_plot, mode_log="off", save=True, show=True,
        particular_name=name, mode="stds")
def dump_distances_and_kernels(scr):

    # TODO Properties should be read by scr!!

    # properties
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        properties = f.readlines()
    properties = [x.split()[0] for x in properties]
    properties = [float(x) for x in properties]
    properties = np.array(properties)
    print("properties", properties.shape)
    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"]  # + ["avgslatm"]

    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        del dist

    # Prepare fchl kernels
    if False:
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)
        del reps
        del kernels

    if False:
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    if True:
        print("Generating fingerprint kernel")
        representations_fp = misc.load_obj(scr + "repr.fp")
        kernel = get_fp_kernel(representations_fp)
        misc.save_npy(scr + "kernel.fp", kernel)

    return
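# get_fp_kernel is defined elsewhere; a minimal sketch under the assumption
# that it computes a Tanimoto (Jaccard) similarity kernel over bit-vector
# fingerprints:
import numpy as np


def get_fp_kernel(fingerprints):
    fps = np.asarray(fingerprints, dtype=bool)
    n = len(fps)
    kernel = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            intersection = np.logical_and(fps[i], fps[j]).sum()
            union = np.logical_or(fps[i], fps[j]).sum()
            kernel[i, j] = kernel[j, i] = intersection / union if union else 0.0
    return kernel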
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="DIR", default="_tmp_")
    parser.add_argument('--sdf', action='store', help='', metavar="FILE", nargs="+", default=[])
    parser.add_argument('--dict', action='store', help='', metavar="FILE", nargs="+", default=[])
    parser.add_argument('--name', action='store', help='', metavar="STR", nargs="+")
    parser.add_argument('--filename', action='store', help='', metavar="STR")
    parser.add_argument('--filter', action='store_true', help='')
    parser.add_argument('-j', '--procs', action='store', help='parallelize', metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    print()

    databases_set = []
    databases_dict = []

    for sdf in args.sdf:
        molobjs = cheminfo.read_sdffile(sdf)
        molobjs = list(molobjs)
        smiles = [cheminfo.molobj_to_smiles(molobj, remove_hs=True) for molobj in molobjs]
        smiles = set(smiles)
        databases_set.append(smiles)
        print(sdf, len(smiles))

    for filename in args.dict:
        data = misc.load_obj(filename)
        smiles = set(data.keys())
        databases_set.append(smiles)
        databases_dict.append(data)
        print(filename, len(smiles))

    if args.scratch is not None:

        # Merge databases
        everything = {}
        for data in databases_dict:
            for key in data.keys():
                if key not in everything:
                    everything[key] = []
                everything[key] += data[key]

        if args.filter:
            everything = filter_dict(everything)

        keys = everything.keys()
        print("n items", len(keys))

        # Save
        misc.save_json(args.scratch + "molecule_data", everything)
        misc.save_obj(args.scratch + "molecule_data", everything)

    if args.name is not None:
        n_db = len(databases_set)
        if n_db == 2:
            venn2(databases_set, set_labels=args.name)
        elif n_db == 3:
            venn3(databases_set, set_labels=args.name)
        plt.savefig(args.scratch + "venndiagram")

    return
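# venn2/venn3 above are assumed to come from the matplotlib-venn package and
# are used roughly like this (toy SMILES sets, hypothetical labels):
#
#   from matplotlib_venn import venn2
#   venn2([{"CC", "CCO"}, {"CCO", "c1ccccc1"}], set_labels=("db A", "db B"))
#   plt.savefig("venndiagram_example")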
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1, type=int)
    parser.add_argument('-j', '--procs', action='store', help='parallelize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)
    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits, random_state=45, shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # Shuffle the training indices so truncation below gives a random subset
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
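# fit_model is defined elsewhere in this project; below is a minimal sketch
# under the assumption that it fits an ordinary-least-squares model on the
# selected training rows and returns the signed test errors. The module-level
# `properties` target used here is an assumption made for illustration.
import numpy as np


def fit_model(features, idxs_train, idxs_test):
    X_train = np.asarray(features)[idxs_train]
    X_test = np.asarray(features)[idxs_test]
    y_train = properties[idxs_train]  # assumed global target values
    y_test = properties[idxs_test]
    # Least-squares fit: minimize ||X_train @ coef - y_train||
    coef, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)
    # Signed difference between prediction and reference
    return X_test @ coef - y_test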
def main(datafile, procs=0, scr="_tmp_"):

    db = misc.load_obj(datafile)
    keys = db.keys()
    print("total keys:", len(keys))

    if procs == 0:

        def get_results():
            for i, key in enumerate(keys):
                smi = key
                kelvin = db[key]
                result = prepare_sdf_and_csv(smi, kelvin)
                if result is None:
                    continue
                yield result

        results = get_results()

    else:

        def workpackages():
            for i, key in enumerate(keys):
                # if i > 5000: break
                smi = key
                kelvin = db[key]
                yield smi, kelvin

        lines = workpackages()
        results = misc.parallel(lines, prepare_sdf_and_csv_procs, [], {}, procs=procs)
        print("streaming results")

    # Write results
    fsdf = gzip.open("data/sdf/structures.sdf.gz", 'w')
    fprop = open("data/sdf/properties.csv", 'w')

    for i, result in enumerate(results):

        if result is None:
            continue

        molobj, values = result

        sdfstr = cheminfo.molobj_to_sdfstr(molobj)
        fsdf.write(sdfstr.encode())

        valuesstr = " ".join(values)
        # propstr = "{:} {:}\n".format(mean, standard_deviation)
        propstr = f"{i} " + valuesstr + "\n"
        fprop.write(propstr)

    fsdf.close()
    fprop.close()

    return
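# misc.parallel is this project's helper; a minimal sketch of the assumed
# behaviour (stream func(item, *args, **kwargs) for every item, computed by
# a pool of `procs` workers). The helper name _apply is introduced here only
# for illustration.
from functools import partial
from multiprocessing import Pool


def _apply(item, func, args, kwargs):
    return func(item, *args, **kwargs)


def parallel(iterable, func, args, kwargs, procs=1):
    worker = partial(_apply, func=func, args=args, kwargs=kwargs)
    with Pool(procs) as pool:
        yield from pool.imap(worker, iterable)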
if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--datadict', action='store', help='', metavar='FILE')
    parser.add_argument('--data', action='store', help='', metavar='FILE')
    parser.add_argument('--sdf', action='store', help='', metavar='FILE')
    parser.add_argument('--scratch', action='store', help='', metavar='DIR')
    parser.add_argument('-j', '--procs', action='store', help='', type=int, metavar='int', default=0)
    args = parser.parse_args()

    # --scratch has no default here, so guard against None before normalizing
    if args.scratch and args.scratch[-1] != "/":
        args.scratch += "/"

    if args.datadict:
        data = misc.load_obj(args.datadict)
        set_structures(data, args.scratch, procs=args.procs)

    if args.data:
        main(args.data, procs=args.procs)

    if args.sdf:
        conformation("data/sdf/structures.sdf.gz", procs=args.procs)
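# Example invocations of this script (the script name and data paths are
# placeholders):
#
#   python prepare_data.py --datadict data/measurements.pkl --scratch _tmp_/
#   python prepare_data.py --data data/measurements.pkl -j 4
#   python prepare_data.py --sdf data/sdf/structures.sdf.gz -j 4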
def get_representations_slatm(atoms,
                              structures,
                              scr="_tmp_/",
                              mbtypes=None,
                              debug=True,
                              procs=0,
                              **kwargs):
    """
    atoms -- list of nuclear-charge arrays, one per molecule
    structures -- list of coordinate arrays, one per molecule
    """

    # Reference usage from the qml documentation, where qm7 is a list of
    # Compound() objects:
    #   mbtypes = get_slatm_mbtypes([mol.nuclear_charges for mol in qm7])
    #   for compound in qm7:
    #       compound.generate_slatm(mbtypes, local=True, rcut=2.7)

    if mbtypes is None:

        filename_mbtypes = scr + "slatm.mbtypes"

        try:
            mbtypes = misc.load_obj(filename_mbtypes)
        except FileNotFoundError:
            print("Generate slatm mbtypes")
            mbtypes = qml.representations.get_slatm_mbtypes(atoms)
            misc.save_obj(filename_mbtypes, mbtypes)

    if debug:
        print("Generate slatm representations")

    replist = []

    if procs > 1:
        # Avoid nested parallelism: one OMP thread per worker process
        os.environ["OMP_NUM_THREADS"] = "1"
        workargs = list(zip(structures, atoms))
        pool = Pool(processes=procs)
        funcname = partial(procs_representation_slatm, mbtypes=mbtypes)
        replist = pool.map(funcname, workargs)
    else:
        for coord, atom in zip(structures, atoms):
            rep = qml.representations.generate_slatm(coord, atom, mbtypes)
            replist.append(rep)

    replist = np.array(replist)

    return replist
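# Hypothetical call of get_representations_slatm; the nuclear charges and
# coordinates below are toy stand-ins (a methane- and a water-sized system):
#
#   atoms = [np.array([6, 1, 1, 1, 1]), np.array([8, 1, 1])]
#   structures = [np.random.rand(5, 3), np.random.rand(3, 3)]
#   reps = get_representations_slatm(atoms, structures, scr="_tmp_/", procs=0)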
def generate_conformer_representation(scr="_tmp_ensemble_/", procs=0):

    name = "slatm"  # one of "cm", "slatm", "bob"
    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # TODO Calculate max_size
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    # n_total = 1285
    n_total = 3456
    idxs = range(n_total)
    avgreps = [0] * n_total

    if procs == 0:
        for idx in idxs:
            idx, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[idx] = avgrep
    else:
        # Probe one representation to learn its size
        idx, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        # Shared array living in a manager process, writable from the workers
        m = MyManager()
        m.start()
        results = m.np_zeros((n_total, rep_size))

        pool = Pool(32)  # TODO Hardcoded, puuuha

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)
        pool.map(func, idxs)
        avgreps = results

        # results = misc.parallel(idxs, get_avg_repr, [], kwargs, procs=nprocs)
        # for result in results:
        #     idx, avgrep = result
        #     avgreps[idx] = avgrep
        #     print(idx, avgrep.mean())

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return
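# MyManager is defined elsewhere in this project. One common way to obtain a
# numpy array that worker processes can write to is a BaseManager with
# np.zeros registered; this is a sketch of that pattern, not necessarily the
# project's exact implementation:
from multiprocessing.managers import BaseManager

import numpy as np


class MyManager(BaseManager):
    pass


# The proxy returned by m.np_zeros(...) forwards the exposed methods
# (including item assignment) to the array held by the manager process.
MyManager.register("np_zeros", np.zeros, exposed=("__getitem__", "__setitem__"))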