def test_chembl(self):
    """
    Run the random explorer against tabulated ChEMBL property values.

    Problem with fixed-prop testing: almost all of the results (<10% for an
    init_pool of 50, and even fewer for a smaller pool) seem to be outside
    of the database. Their score therefore cannot be looked up for testing,
    and setting them to zero leads to slow exploration.
    """
    all_smiles, prop_by_smiles = get_chembl_prop()

    # Building the molecules with conversions enabled (conv_enabled=True)
    # takes about 8 minutes, so conversions are switched off here.
    molecules = [Molecule(smi, conv_enabled=False) for smi in all_smiles]
    seed_pool = list(np.random.choice(molecules, size=100, replace=False))

    def score(mol):
        # Property lookup keyed by SMILES string.
        return prop_by_smiles[mol.smiles]

    def score_first(mol_list):
        # The explorer hands over a list; only its first molecule is scored.
        return score(mol_list[0])

    def print_props(pool):
        values = [prop_by_smiles[mol.smiles] for mol in pool]
        print("Props of pool", len(pool),
              np.min(values), np.mean(values), np.max(values))

    for group in (molecules, seed_pool):
        print_props(group)

    exp = RandomExplorer(score_first, initial_pool=seed_pool)

    print("Starting ChEMBL score 1 optimization")
    started = time()
    exp.run(30)
    print("Completed ChEMBL score 1 optimization, time elapsed: %.3fs"
          % (time() - started))

    best = exp.get_best(1)[0]
    print(best.get_synthesis_path())
    print("Best achieved score: %.3f" % score(best))

    # Upper bound for comparison: the best tabulated value over all molecules.
    all_values = [prop_by_smiles[mol.smiles] for mol in molecules]
    print("Best possible score: %.3f" % np.max(all_values))
def _test_len(self):
    """Smoke-run the random explorer using SMILES string length as objective."""

    def smiles_length(mol):
        return len(mol.smiles)

    smiles_strings = [
        "CC",
        "O=C=O",
        "C#N",
        "CCN(CC)CC",
        "CC(=O)O",
        "C1CCCCC1",
        "c1ccccc1",
    ]
    molecules = [Molecule(smi) for smi in smiles_strings]

    exp = RandomExplorer(smiles_length, initial_pool=molecules)
    print("Starting len of SMILES optimization")
    exp.run(2)

    # check: nothing is asserted; the resulting pool is printed for inspection
    print(exp.pool)
def _test_sas(self):
    """Smoke-run the random explorer using the SA score as objective."""

    def sa_score(mol):
        # Score is computed on the RDKit-style mol parsed from the SMILES.
        return calculateSAScore(Chem.MolFromSmiles(mol.smiles))

    # Sanity print of the objective on a single small molecule.
    print(sa_score(Molecule("CC")))

    smiles_strings = [
        "CC",
        "O=C=O",
        "C#N",
        "CCN(CC)CC",
        "CC(=O)O",
        "C1CCCCC1",
        "c1ccccc1",
    ]
    molecules = [Molecule(smi) for smi in smiles_strings]
    exp = RandomExplorer(sa_score, initial_pool=molecules)

    print("Starting SA score optimization")
    started = time()
    exp.run(10)

    # check: nothing is asserted; timing, pool and best path are printed
    print("Completed SA score optimization, time elapsed: %.3fs"
          % (time() - started))
    print(exp.pool)
    best = exp.get_best(1)[0]
    print(best.get_synthesis_path())
def explore_and_validate_synth(init_pool_size, seed, budget, objective,
                               dataset, max_pool_size, reporter):
    """
    Run a random-explorer optimization and report on the outcome.

    This experiment is equivalent to unlimited-evaluation optimization.
    It compares optimal found vs optimal over pool, and checks if
    synthesizeability is improved.

    Args:
        init_pool_size: number of molecules drawn from the sampler for the
            starting pool; it is also subtracted from ``budget`` to obtain
            the budget handed to the explorer.
        seed: forwarded to ``MolSampler`` as ``sampling_seed``.
        budget: total budget, initial pool included.
        objective: objective name, resolved with ``get_objective_by_name``
            and used in the report lines and the plot title.
        dataset: forwarded to ``MolSampler``.
        max_pool_size: forwarded to ``RandomExplorer``.
        reporter: object exposing ``writeln(str)``; receives all report lines.

    Side effects:
        Writes the pickled synthesis path of the best point to
        ``SYN_PATH_FILE``, the objective curve to ``PLOT_FILE`` (EPS) and the
        space-separated objective values to ``OPT_VALS_FILE``.
    """
    obj_func = get_objective_by_name(objective)
    sampler = MolSampler(dataset, sampling_seed=seed)
    pool = sampler(init_pool_size)
    exp = RandomExplorer(obj_func,
                         initial_pool=pool,
                         max_pool_size=max_pool_size)
    # The initial pool counts against the overall budget.
    real_budget = budget - init_pool_size

    props = [obj_func(mol) for mol in pool]
    reporter.writeln(
        f"Properties of pool: quantity {len(pool)}, min {np.min(props)}, avg {np.mean(props)}, max {np.max(props)}"
    )
    reporter.writeln(f"Starting {objective} optimization")

    t0 = time.time()
    top_value, top_point, history = exp.run(real_budget)
    elapsed_minutes = (time.time() - t0) / 60
    reporter.writeln(f"Finished run in {elapsed_minutes:.3f} minutes")

    reporter.writeln(f"Is a valid molecule: {check_validity(top_point)}")
    reporter.writeln(f"Resulting molecule: {top_point}")
    reporter.writeln(f"Top score: {obj_func(top_point)}")
    reporter.writeln(
        f"Minimum synthesis score over the path: {compute_min_sa_score(top_point)}"
    )

    with open(SYN_PATH_FILE, 'wb') as f:
        pkl.dump(top_point.get_synthesis_path(), f)

    # Five highest-scoring molecules of the pool, for comparison with the
    # explorer's result. The objective is deliberately re-evaluated on `pool`
    # as it is *now* rather than reusing `props`: whether the explorer
    # extends the list it was given cannot be told from here.
    sorted_by_prop = sorted(pool, key=obj_func)[-5:]
    for opt_mol in sorted_by_prop:
        min_sa_score = compute_min_sa_score(opt_mol)
        reporter.writeln(
            f"Minimum synthesis score of optimal molecules: {min_sa_score}")

    vals = history['objective_vals']

    # Draw on a dedicated figure and always close it: going through pyplot's
    # implicit current figure would overlay the curves of earlier calls in
    # the same process onto this plot and keep every figure alive.
    fig, ax = plt.subplots()
    try:
        ax.set_title(f'Optimizing {objective} with random explorer')
        ax.plot(range(len(vals)), vals)
        fig.savefig(PLOT_FILE, format='eps', dpi=1000)
    finally:
        plt.close(fig)

    with open(OPT_VALS_FILE, 'w') as f:
        f.write(' '.join([str(v) for v in vals]))
def _opt_method_optimise_initalise(self):
    """
    Important setup: create the acquisition optimizer object.

    The optimizer is seeded with the first element of every entry in
    ``self.history.query_points`` and stored on ``self.acq_optimizer``.

    Raises:
        NotImplementedError: if ``self.acq_opt_method`` is anything other
            than ``'rand_explorer'``.
    """
    seed_pool = [queried[0] for queried in self.history.query_points]
    pool_size_msg = (
        f'Length of initial pool {len(seed_pool)}, should be equal to init_capital.'
    )
    logging.info(pool_size_msg)

    # Guard clause: only the random explorer is supported.
    if self.acq_opt_method != 'rand_explorer':
        raise NotImplementedError(
            "Acq opt method {} not implemented.".format(self.acq_opt_method))

    self.acq_optimizer = RandomExplorer(
        initial_pool=seed_pool,
        max_pool_size=self.options.max_pool_size,
    )