def generate_optimized_molecules(
        self,
        scoring_function: ScoringFunction,
        number_molecules: int,
        starting_population: Optional[List[str]] = None) -> List[str]:
    """
    Stream the reference SMILES and return the best-scoring ones.

    Every SMILES yielded by ``self.smiles_reader`` is scored with
    ``scoring_function.score``. A min-heap capped at ``number_molecules``
    entries keeps only the current best candidates, so memory use stays
    proportional to ``number_molecules`` rather than to the reference set.

    :param scoring_function: object exposing ``score(smiles)``
    :param number_molecules: maximum number of SMILES to return
    :param starting_population: accepted for interface compatibility; not used here
    :return: at most ``number_molecules`` SMILES, best score first
    """
    top_molecules: List[Tuple[float, str]] = []

    for smiles in self.smiles_reader:
        # (score, smiles) tuples compare on the score first, which is what the heap needs.
        item = (scoring_function.score(smiles), smiles)

        if len(top_molecules) < number_molecules:
            heapq.heappush(top_molecules, item)
        else:
            # Push followed by pop of the smallest entry, in one step: the
            # lowest-scoring candidate is discarded.
            heapq.heappushpop(top_molecules, item)

    # A heap is only partially ordered; rank explicitly so the caller
    # receives the molecules best-first instead of in internal heap order.
    top_molecules.sort(reverse=True)
    return [smiles for _, smiles in top_molecules]
def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int,
                                 starting_population: Optional[List[str]] = None) -> List[str]:
    """
    Optimise the pretrained SMILES RNN with PPO and return the best molecules.

    The pretrained model is loaded from ``self.pretrained_model_path`` (its
    definition is read from the same path with a ``.json`` suffix), wrapped in
    an actor-critic and optimised against ``scoring_function``. Candidates are
    the molecules returned by the optimisation (unless
    ``self.sample_final_model_only`` is set) plus fresh samples from the final
    model; they are canonicalised, scored and ranked.

    :param scoring_function: objective passed to the optimiser and used for the final ranking
    :param number_molecules: number of SMILES to return
    :param starting_population: not used; the optimiser is always started from an empty population
    :return: the ``number_molecules`` highest-scoring SMILES
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_definition = Path(self.pretrained_model_path).with_suffix('.json')
    pretrained_rnn = load_rnn_model(model_definition, self.pretrained_model_path, device, copy_to_cpu=True)
    actor_critic = SmilesRnnActorCritic(smiles_rnn=pretrained_rnn).to(device)
    generator = PPOMoleculeGenerator(model=actor_critic, max_seq_length=self.max_seq_len, device=device)

    molecules = generator.optimise(objective=scoring_function, start_population=[], **self.model_args)

    # Candidates: molecules seen during optimisation, plus samples from the final model.
    candidates = [molecule.smiles for molecule in molecules]
    if self.sample_final_model_only:
        candidates.clear()
    candidates.extend(generator.sample(max(number_molecules, self.number_final_samples)))

    # Score the canonical forms and keep the best ones.
    candidates = canonicalize_list(candidates)
    candidate_scores = scoring_function.score_list(candidates)

    def ranking_key(pair):
        smiles, score = pair
        # NOTE: str hashes are salted per interpreter process unless
        # PYTHONHASHSEED is fixed, so ties may be ordered differently between runs.
        return score, hash(smiles)

    ranked = sorted(zip(candidates, candidate_scores), key=ranking_key, reverse=True)
    return [smiles for smiles, _ in ranked[:number_molecules]]
def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int,
                                 starting_population: Optional[List[str]] = None,
                                 get_history=False) -> List[str]:
    """
    Fine-tune the pretrained SMILES RNN on its own best samples and return the best molecules.

    When no starting population is given, one is selected: empty if
    ``self.random_start`` is set, otherwise the ``self.mols_to_sample`` best
    molecules of ``self.smi_file`` according to ``self.top_k``.

    :param scoring_function: objective used for optimisation and for the final ranking
    :param number_molecules: number of SMILES to return
    :param starting_population: optional SMILES to start the optimisation from
    :param get_history: if true, return the SMILES history produced by
        ``generator.optimise`` instead of the final top molecules
    :return: the ``number_molecules`` highest-scoring SMILES, or the SMILES
        history when ``get_history`` is true
    """
    # fetch initial population?
    if starting_population is None:
        print('selecting initial population...')
        if self.random_start:
            starting_population = []
        else:
            all_smiles = self.load_smiles_from_file(self.smi_file)
            starting_population = self.top_k(all_smiles, scoring_function, self.mols_to_sample)

    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"
    model_def = Path(self.pretrained_model_path).with_suffix('.json')

    model = load_rnn_model(model_def, self.pretrained_model_path, device, copy_to_cpu=True)
    generator = SmilesRnnMoleculeGenerator(model=model, max_len=self.max_len, device=device)

    molecules, smiles_history = generator.optimise(
        objective=scoring_function,
        start_population=starting_population,
        n_epochs=self.n_epochs,
        mols_to_sample=self.mols_to_sample,
        keep_top=self.keep_top,
        optimize_batch_size=self.optimize_batch_size,
        optimize_n_epochs=self.optimize_n_epochs,
        pretrain_n_epochs=self.pretrain_n_epochs)

    # take the molecules seen during the hill-climbing, and also sample from the final model
    samples = [m.smiles for m in molecules]
    if self.sample_final_model_only:
        samples.clear()
    samples += generator.sample(max(number_molecules, self.number_final_samples))

    # calculate the scores and return the best ones
    samples = canonicalize_list(samples)
    scores = scoring_function.score_list(samples)

    scored_molecules = zip(samples, scores)
    sorted_scored_molecules = sorted(scored_molecules, key=lambda x: (x[1], hash(x[0])), reverse=True)
    top_scored_molecules = sorted_scored_molecules[:number_molecules]

    # The history is only returned on request; by default the caller gets the
    # ranked SMILES promised by the signature.
    if get_history:
        return smiles_history
    return [x[0] for x in top_scored_molecules]
def pretrain_on_initial_population(self, scoring_function: ScoringFunction,
                                   start_population, pretrain_epochs) -> List[OptResult]:
    """
    Fine-tune the model on the start population and return it as scored results.

    :param scoring_function: MPO used to score the canonicalised start population
    :param start_population: Initial compounds (list of smiles); may be empty
    :param pretrain_epochs: number of epochs to finetune with start_population
    :return: one OptResult per canonicalised start molecule; an empty list when
        nothing usable remains after canonicalisation (no training is run then)
    """
    seed: List[OptResult] = []

    start_population_size = len(start_population)

    training = canonicalize_list(start_population, include_stereocenters=True)

    if len(training) != start_population_size:
        logger.warning("Some entries for the start population are invalid or duplicated")
        start_population_size = len(training)

    if start_population_size == 0:
        return seed

    logger.info("finetuning with {} molecules for {} epochs".format(start_population_size, pretrain_epochs))

    scores = scoring_function.score_list(training)
    seed.extend(OptResult(smiles=smiles, score=score) for smiles, score in zip(training, scores))

    train_seqs, _ = load_smiles_from_list(training, max_len=self.max_len)
    train_set = get_tensor_dataset(train_seqs)

    # training is non-empty here, so batch_size >= 1 and print_every >= 1.
    batch_size = min(len(training), 32)

    # Integer number of batches: the reporting interval handed to the trainer
    # must be a whole number, not the float that true division produces.
    print_every = len(training) // batch_size

    # The training data doubles as the validation set.
    losses = self.trainer.fit(train_set, train_set,
                              batch_size=batch_size,
                              n_epochs=pretrain_epochs,
                              print_every=print_every,
                              valid_every=print_every)
    logger.info(losses)

    return seed
def optimise(self, objective: ScoringFunction, start_population, keep_top, n_epochs, mols_to_sample,
             optimize_n_epochs, optimize_batch_size, pretrain_n_epochs) -> List[OptResult]:
    """
    Takes an objective and tries to optimise it by iterative sampling and fine-tuning.

    Each epoch samples molecules from the model, scores the ones not seen
    before, keeps the best results and fine-tunes the model on the current
    top ``keep_top`` molecules (3/4 for training, 1/4 for validation).

    :param objective: MPO
    :param start_population: Initial compounds (list of smiles) used for pretraining; may be empty
    :param keep_top: number of molecules to keep at each iterative finetune step
    :param n_epochs: number of sample/score/finetune iterations
    :param mols_to_sample: number of molecules to sample at each iterative finetune step
    :param optimize_n_epochs: number of episodes to finetune; no fine-tuning when not positive
    :param optimize_batch_size: batch size for fine-tuning
    :param pretrain_n_epochs: number of epochs to pretrain on start population
    :return: a 2-tuple ``(results, smiles_history)``: all retained results
        sorted best first, and one list of canonicalised sampled SMILES per epoch.
        NOTE(review): the return annotation still says ``List[OptResult]``.
    """
    int_results = self.pretrain_on_initial_population(objective, start_population,
                                                      pretrain_epochs=pretrain_n_epochs)

    results: List[OptResult] = []
    seen: Set[str] = set()

    for k in int_results:
        if k.smiles not in seen:
            results.append(k)
            seen.add(k.smiles)

    smiles_history = []

    for epoch in range(1, 1 + n_epochs):

        t0 = time.time()
        samples = self.sampler.sample(self.model, mols_to_sample, max_seq_len=self.max_len)
        t1 = time.time()

        canonicalized_samples = set(canonicalize_list(samples, include_stereocenters=True))
        smiles_history.append(list(canonicalized_samples))

        # Only molecules that have never been scored before are sent to the objective.
        payload = list(canonicalized_samples.difference(seen))
        payload.sort()  # necessary for reproducibility between different runs

        seen.update(canonicalized_samples)

        scores = objective.score_list(payload)
        int_results = [OptResult(smiles=smiles, score=score) for smiles, score in zip(payload, scores)]

        t2 = time.time()

        results.extend(sorted(int_results, reverse=True)[0:keep_top])
        results.sort(reverse=True)
        subset = [i.smiles for i in results][0:keep_top]

        np.random.shuffle(subset)

        # 3/4 of the shuffled top molecules train the model, the rest validate it.
        split = int(3 * len(subset) / 4)
        sub_train = subset[0:split]
        sub_test = subset[split:]

        train_seqs, _ = load_smiles_from_list(sub_train, max_len=self.max_len)
        valid_seqs, _ = load_smiles_from_list(sub_test, max_len=self.max_len)

        train_set = get_tensor_dataset(train_seqs)
        valid_set = get_tensor_dataset(valid_seqs)

        opt_batch_size = min(len(sub_train), optimize_batch_size)

        # An empty training split (e.g. fewer than two molecules kept) gives a
        # batch size of zero: skip fine-tuning instead of dividing by zero.
        if optimize_n_epochs > 0 and opt_batch_size > 0:
            print_every = len(sub_train) // opt_batch_size
            self.trainer.fit(train_set, valid_set,
                             n_epochs=optimize_n_epochs,
                             batch_size=opt_batch_size,
                             print_every=print_every,
                             valid_every=print_every)

        t3 = time.time()

        logger.info(f'Generation {epoch} --- timings: '
                    f'sample: {(t1 - t0):.3f} s, '
                    f'score: {(t2 - t1):.3f} s, '
                    f'finetune: {(t3 - t2):.3f} s')

        top4 = '\n'.join(f'\t{result.score:.3f}: {result.smiles}' for result in results[:4])
        logger.info(f'Top 4:\n{top4}')
        print(f'Top 4:\n{top4}')

    return sorted(results, reverse=True), smiles_history
def generate_optimized_molecules(
        self,
        scoring_function: ScoringFunction,
        number_molecules: int,
        starting_population: Optional[List[str]] = None) -> List[str]:
    """
    Evolve SMILES with a grammar-based genetic algorithm and return the fittest.

    The population is grown to ``number_molecules`` if the caller asks for more
    than ``self.population_size``. Without a starting population, one of size
    ``population_size + n_mutations`` is drawn from ``self.all_smiles`` (at
    random when ``self.random_start`` is set, otherwise via ``self.top_k``).
    Each generation mutates randomly chosen genes, merges and de-duplicates,
    and keeps the ``population_size`` best molecules. Evolution stops after
    ``self.generations`` generations, or earlier once the population scores
    have been unchanged for ``self.patience`` generations in a row.

    :param scoring_function: objective used to score molecules
    :param number_molecules: number of SMILES to return
    :param starting_population: optional initial SMILES
    :return: SMILES of the best ``number_molecules`` molecules of the final population
    """

    def by_score(molecule):
        return molecule.score

    if number_molecules > self.population_size:
        self.population_size = number_molecules
        print(f'Benchmark requested more molecules than expected: new population is {number_molecules}')

    # fetch initial population?
    if starting_population is None:
        print('selecting initial population...')
        init_size = self.population_size + self.n_mutations
        reference_smiles = copy.deepcopy(self.all_smiles)
        if self.random_start:
            starting_population = np.random.choice(reference_smiles, init_size)
        else:
            starting_population = self.top_k(reference_smiles, scoring_function, init_size)

    # The smiles GA cannot deal with '%' in SMILES strings (used for two-digit ring numbers).
    starting_population = [smi for smi in starting_population if '%' not in smi]

    # Encode every starting molecule as a gene, score it, and bundle all three together.
    initial_genes = [cfg_to_gene(cfg_util.encode(smi), max_len=self.gene_size) for smi in starting_population]
    initial_scores = scoring_function.score_list(starting_population)
    population = [
        Molecule(score, smi, gene)
        for score, smi, gene in zip(initial_scores, starting_population, initial_genes)
    ]
    population = sorted(population, key=by_score, reverse=True)[:self.population_size]
    population_scores = [member.score for member in population]

    # evolution: go go go!!
    t0 = time()
    patience = 0

    for generation in range(self.generations):
        previous_scores = population_scores

        # pick, with replacement, the genes that will be mutated
        gene_pool = [member.genes for member in population]
        picked = np.random.choice(len(gene_pool), self.n_mutations, replace=True)
        genes_to_mutate = [gene_pool[idx] for idx in picked]

        # evolve genes
        mutants = self.pool(delayed(mutate)(gene, scoring_function) for gene in genes_to_mutate)

        # join and dedup
        population += mutants
        population = deduplicate(population)

        # survival of the fittest
        population = sorted(population, key=by_score, reverse=True)[:self.population_size]

        # stats
        gen_time = time() - t0
        mol_sec = (self.population_size + self.n_mutations) / gen_time
        t0 = time()

        population_scores = [member.score for member in population]

        # early stopping
        if population_scores != previous_scores:
            patience = 0
        else:
            patience += 1
            print(f'Failed to progress: {patience}')
            if patience >= self.patience:
                print(f'No more patience, bailing...')
                break

        print(f'{generation} | '
              f'max: {np.max(population_scores):.3f} | '
              f'avg: {np.mean(population_scores):.3f} | '
              f'min: {np.min(population_scores):.3f} | '
              f'std: {np.std(population_scores):.3f} | '
              f'{gen_time:.2f} sec/gen | '
              f'{mol_sec:.2f} mol/sec')

    # finally
    return [member.smiles for member in population[:number_molecules]]
def generate_optimized_molecules(
        self,
        scoring_function: ScoringFunction,
        number_molecules: int,
        starting_population: Optional[List[str]] = None) -> List[str]:
    """
    Entry point called by the benchmarking software; drives the whole backend.

    Seeds the deriver, then for up to ``self.generations`` generations builds a
    mating pool, derives children, scores them, merges them into the population
    and keeps the best ``current_population`` entries. Optionally enables a
    structural-alert filter part-way through ("filter annealing") and filters
    the final population before handing it to ``save_and_exit``.

    :param scoring_function: objective used to score molecules
    :param number_molecules: number of molecules requested by the benchmark
    :param starting_population: optional seed SMILES; only a single-element list is handled
    :return: whatever ``save_and_exit`` returns; ``['CCC']`` for skipped tasks

    NOTE(review): the nesting of this method was reconstructed from a
    whitespace-collapsed source -- confirm the indentation against the
    original file, in particular the placement of ``set_seeds`` and of the
    ``anneal_counter`` updates.
    """
    # starting population is provided for some benchmarks.
    if number_molecules > self.population:
        self.population = number_molecules
        print(
            f'Benchmark requested more molecules than expected: new population is {number_molecules}'
        )

    # Every call counts as one task; calls before self.start_task are skipped
    # and answered with a fixed placeholder molecule.
    self.task += 1
    if self.task < self.start_task:
        return ['CCC']

    self.init_deriver()

    # scored_population holds (smiles, score) pairs.
    scored_population = []
    if starting_population is None:
        print('selecting initial population...')
        if self.random_start:
            all_smiles = self.load_smiles_from_file(self.smi_file)
            selected_smiles = np.random.choice(all_smiles, self.population)
            scored_population = [(s, scoring_function.score(s))
                                 for s in selected_smiles]
        else:
            # we are just going to get the top scoring mols, and we've checked before so we'll just load a file
            scored_population = self.get_precomputed_scores(
                self.population)
        self.deriver.set_seeds([s[0] for s in scored_population])
    elif len(starting_population) == 1:
        self.deriver.set_seeds(starting_population)
        scored_population = [(s, scoring_function.score(s))
                             for s in starting_population]

    # allow self-mating in deriver for new methods
    #if len(scored_population) == 1:
    #    scored_population = [scored_population[0], scored_population[0]]

    # NOTE(review): a starting_population whose length is not 1 leaves
    # scored_population empty, in which case max() below raises ValueError.
    best = max([s[1] for s in scored_population])
    p_max = best
    no_progess_counter = 0
    old_avg = 0
    mean_scores_by_gen = []
    best_scores_by_gen = []
    worst_scores_by_gen = []
    temperature = self.initial_temperature
    early_stop_annealing = False
    anneal_counter = 0
    filter_enabled = False
    current_population = self.population
    if self.derive_population:
        self.derive_size = self.population

    for generation in range(self.generations):
        # filter annealing #######################################################################
        # Entered once the configured fraction of generations has elapsed (and
        # delayed filtering is on), or as soon as an early stop was converted
        # into annealing further down.
        if ((generation >= (self.generations * self.delayed_filtering)) and
                self.delayed_filtering) or early_stop_annealing:
            if (not anneal_counter) and (not filter_enabled):
                # One-off: switch the filter on, load the alert patterns and
                # double the number of population entries that are kept.
                filter_enabled = True
                print(f'Enabling filter at generation {generation}')
                self.deriver.enable_and_expand_filter()
                alerts = pd.read_csv('data/alert_collection.csv')
                sure_chembl = set(
                    alerts.loc[alerts['rule_set_name'] == 'SureChEMBL',
                               'smarts'])
                bai = set(alerts.loc[alerts['rule_set_name'] == 'BAI',
                                     'smarts'])
                self.deriver.set_must_not_have_patterns(
                    list(sure_chembl.union(bai)))
                print(
                    'Expanding population to introduce filtered candidates'
                )
                current_population *= 2
                print('Generating filtered candidates...')
            anneal_counter += 1
        ###########################################################################################

        scored_seeds = make_mating_pool(
            scored_population=scored_population,
            selection_size=self.selection_size,
            method=self.selection_method,
            best=best,
            temperature=temperature)
        # we want the best score from the previous generation, else there can only be ties
        best = p_max

        good_children = derive(
            deriver=self.deriver,
            seeds=[s[0] for s in scored_seeds],
            mut_rate=self.mut_rate,
            n_brics=self.derive_size * self.brics_proportion,
            n_selfies=self.derive_size * self.selfies_proportion,
            n_smiles_gb=self.derive_size * self.smiles_gb_proportion,
            n_selfies_gb=self.derive_size * self.selfies_gb_proportion,
            scanner=self.enable_scanner)

        scored_children = self.rank_and_score(good_children,
                                              scoring_function)

        # Merge children and parents through a set union (drops exact
        # duplicates), then keep the current_population best entries.
        scored_population = list(
            (set(scored_children)).union(set(scored_population)))
        scored_population = sorted(scored_population,
                                   key=lambda x: x[1],
                                   reverse=True)[:current_population]

        # Statistics are computed on the top max(100, number_molecules) scores only.
        relevant_scores = [s[1] for s in scored_population
                           ][:max([100, number_molecules])]

        # summarization
        p_max, p_avg, p_min, p_std, p_sum = summarize_results(
            generation, relevant_scores)
        mean_scores_by_gen.append(p_avg)
        best_scores_by_gen.append(p_max)
        worst_scores_by_gen.append(p_min)

        if early_stop_annealing:
            # While annealing after an early stop, progress is tracked on the
            # mean of the whole population instead of the summary mean.
            p_avg = np.mean([s[1] for s in scored_population])
            print(f'Population mean: {p_avg}')
        else:
            print(
                f'Population mean: {np.mean([s[1] for s in scored_population])}'
            )

        # early stopping
        if p_avg == old_avg:
            no_progess_counter += 1
        else:
            no_progess_counter = 0
            anneal_counter = 0

        # Tasks numbered below 4 stop as soon as a score of exactly 1 is
        # reached -- unless delayed filtering asks for annealing first.
        if self.task < 4 and max(relevant_scores) == 1:
            if self.delayed_filtering:
                early_stop_annealing = True
            else:
                print('Finished early on a rediscovery benchmark!')
                break

        if (no_progess_counter >= self.patience) or (
                p_avg == 1 and len(scored_population) > 1):
            if self.delayed_filtering:
                early_stop_annealing = True
            else:
                print("Finished early!")
                break

        if (anneal_counter == self.patience) and self.delayed_filtering:
            print("Converged after filtering!")
            break

        old_avg = p_avg
        temperature *= self.temperature_decay

    self.clean_up()

    if self.delayed_filtering:
        scored_population = self.filter_mols(scored_population,
                                             number_molecules)

    if self.counterscreen:
        # Same alert patterns as in the annealing step, applied to the final population.
        self.deriver.enable_and_expand_filter()
        alerts = pd.read_csv('data/alert_collection.csv')
        sure_chembl = set(
            alerts.loc[alerts['rule_set_name'] == 'SureChEMBL', 'smarts'])
        bai = set(alerts.loc[alerts['rule_set_name'] == 'BAI', 'smarts'])
        self.deriver.set_must_not_have_patterns(
            list(sure_chembl.union(bai)))
        scored_population = self.filter_mols(scored_population,
                                             number_molecules,
                                             add_bad=False)

    return save_and_exit(scored_population, number_molecules,
                         mean_scores_by_gen, best_scores_by_gen,
                         worst_scores_by_gen)
def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int,
                                 starting_population: Optional[List[str]] = None,
                                 get_history=False) -> List[str]:
    """
    Evolve molecules with a graph-based genetic algorithm.

    The population is grown to ``number_molecules`` if the caller asks for more
    than ``self.population_size``. Without a starting population, one is drawn
    from ``self.all_smiles`` (at random when ``self.random_start`` is set,
    otherwise via ``self.top_k``). Each generation breeds
    ``self.population_size`` offspring from a mating pool, sanitises the merged
    population and keeps the ``population_size`` best. Evolution stops after
    ``self.generations`` generations, or earlier once the scores have been
    unchanged for ``self.patience`` generations in a row.

    :param scoring_function: objective used to score molecules
    :param number_molecules: number of SMILES to return
    :param starting_population: optional initial SMILES
    :param get_history: if true, return the recorded populations instead of the final molecules
    :return: the first ``number_molecules`` SMILES of the final population, or,
        when ``get_history`` is true, a list of SMILES lists (the initial
        population followed by one entry per completed generation)
    """

    def as_smiles(mols):
        return [Chem.MolToSmiles(mol) for mol in mols]

    if number_molecules > self.population_size:
        self.population_size = number_molecules
        print(f'Benchmark requested more molecules than expected: new population is {number_molecules}')

    # fetch initial population?
    if starting_population is None:
        print('selecting initial population...')
        if self.random_start:
            starting_population = np.random.choice(self.all_smiles, self.population_size)
        else:
            starting_population = self.top_k(self.all_smiles, scoring_function, self.population_size)

    # Order the initial population by descending score.
    starting_scores = scoring_function.score_list(starting_population)
    ranked_start = sorted(zip(starting_scores, starting_population),
                          key=lambda pair: pair[0],
                          reverse=True)
    population_mol = [Chem.MolFromSmiles(smiles) for _, smiles in ranked_start]

    # Scores are recomputed on the SMILES written back from the parsed molecules.
    population_scores = scoring_function.score_list(mols2smiles(population_mol))

    # evolution: go go go!!
    t0 = time()
    patience = 0
    population_history = [as_smiles(population_mol)]

    for generation in range(self.generations):
        # breed the offspring
        mating_pool = make_mating_pool(population_mol, population_scores, self.offspring_size)
        offspring_mol = self.pool(
            delayed(reproduce)(mating_pool, self.mutation_rate)
            for _ in range(self.population_size))

        # merge parents and offspring
        population_mol += offspring_mol
        population_mol = sanitize(population_mol)

        # stats
        gen_time = time() - t0
        mol_sec = self.population_size / gen_time
        t0 = time()

        previous_scores = population_scores
        population_scores = scoring_function.score_list(as_smiles(population_mol))

        # keep the best population_size molecules together with their scores
        survivors = sorted(zip(population_scores, population_mol),
                           key=lambda entry: entry[0],
                           reverse=True)[:self.population_size]
        population_mol = [mol for _, mol in survivors]
        population_scores = [score for score, _ in survivors]

        # early stopping
        if population_scores != previous_scores:
            patience = 0
        else:
            patience += 1
            print(f'Failed to progress: {patience}')
            if patience >= self.patience:
                print(f'No more patience, bailing...')
                break

        res_time = time() - t0

        print(f'{generation} | '
              f'max: {np.max(population_scores):.3f} | '
              f'avg: {np.mean(population_scores):.3f} | '
              f'min: {np.min(population_scores):.3f} | '
              f'std: {np.std(population_scores):.3f} | '
              f'sum: {np.sum(population_scores):.3f} | '
              f'{gen_time:.2f} sec/gen | '
              f'{mol_sec:.2f} mol/sec | '
              f'{res_time:.2f} rest ')

        population_history.append(as_smiles(population_mol))

    # finally
    if get_history:
        return population_history
    return as_smiles(population_mol)[:number_molecules]
def generate_optimized_molecules(
        self,
        scoring_function: ScoringFunction,
        number_molecules: int,
        starting_population: Optional[List[str]] = None) -> List[str]:
    """
    Run a copy of the configured PopAlg instance against a GuacaMol objective.

    Each call advances ``self.curr_benchmark_id``, derives the benchmark name
    from it, plugs ``scoring_function`` into the instance's evaluation
    strategy, initialises the population and runs the optimisation.

    :param scoring_function: GuacaMol objective of the current benchmark
    :param number_molecules: number of SMILES to return
    :param starting_population: accepted for interface compatibility; not used here
    :return: aromatic SMILES of the ``number_molecules`` individuals with the
        highest recorded objective value, best first
    """
    instance = self.pop_alg.copy_instance_with_parameters()

    # Updating benchmark id
    self.curr_benchmark_id += 1

    # Extracting benchmark name
    curr_benchmark_name = self._get_benchmark_name(self.curr_benchmark_id)

    # Setting folder to save the results
    instance.output_folder_path = join(self.output_save_path, curr_benchmark_name)

    # Extracting GuacaMol evaluation function
    guacamol_evaluation_strategy = GuacamolEvaluationStrategy(scoring_function, curr_benchmark_name)

    # Merging the evaluation strategy of the PopAlg instance to the GuacaMol objective
    if isinstance(instance.evaluation_strategy, UndefinedGuacaMolEvaluationStrategy):
        instance.evaluation_strategy = guacamol_evaluation_strategy
    else:
        define_GuacaMol_evaluation_strategies(instance.evaluation_strategy, guacamol_evaluation_strategy)

    # Updating mutation strategy evaluator
    instance.mutation_strategy.evaluation_strategy = instance.evaluation_strategy

    # Setting additional stop criterion, stopping the execution when best possible score is obtained
    instance.kth_score_to_record_key = curr_benchmark_name
    additional_stop_criterion = KthScoreMaxValue(1, round=3)
    instance.stop_criterion_strategy.set_additional_strategy(additional_stop_criterion)
    instance.stop_criterion_strategy.set_pop_alg_instance(instance)

    # Setting kth score to record
    instance.kth_score_to_record = number_molecules

    # PopAlg instance initialization
    instance.initialize()

    # Population initialization
    if self.guacamol_init_top_100:

        # Extracting the top 100 SMILES for the property from ChEMBL and setting it at initial population
        # From https://github.com/BenevolentAI/guacamol_baselines/blob/master/graph_ga/goal_directed_generation.py
        with open(self.init_pop_path, "r") as f:
            # Strip line terminators and skip blank lines so that only clean
            # SMILES strings are scored and loaded into the population.
            smiles_list = [line.strip() for line in f]
        smiles_list = [smiles for smiles in smiles_list if smiles]

        scores = [scoring_function.score(s) for s in smiles_list]
        top_100_smiles = np.array(smiles_list)[np.argsort(scores)[::-1][:100]]
        instance.load_pop_from_smiles_list(smiles_list=top_100_smiles)
    else:
        # Default start: a single-molecule population.
        instance.load_pop_from_smiles_list(smiles_list=["C"])

    # Running EvoMol
    instance.run()

    # Extracting the vector containing the guacamol objective property value for all individuals
    if instance.kth_score_to_record_key == "total":
        obj_prop_vector = instance.curr_total_scores
    else:
        obj_prop_vector = instance.curr_scores[instance.kth_score_to_record_idx]

    # Extracting best individuals (indices sorted by descending objective value)
    ind_to_return_indices = np.argsort(obj_prop_vector)[::-1].flatten()[:number_molecules]

    # Returning optimized population
    return [instance.pop[ind_idx].to_aromatic_smiles() for ind_idx in ind_to_return_indices]