def predict_outcome(self, reaction, k=1):
    """
    Predict the top-k most likely products of a reaction.

    The reaction's input string is passed to the core finder, whose output
    is then ranked by the candidate ranker. Every ranked candidate that
    carries at least one SMILES is wrapped into a Molecule (using the first
    SMILES only) with its synthesis set to the reaction inputs.

    Params:
        reaction {Reaction} - reaction to predict; the kept predictions
            are handed to `reaction.set_products`
        k {int} - how many top predictions to set and return
    Returns:
        {list[Molecule]} - at most k predicted products
    Raises:
        RuntimeError - logged and re-raised when a predictor fails
    """
    reactant_str = reaction.get_input_str()
    try:
        core_output = self.directcorefinder.predict(reactant_str)
        # The fourth element returned by the core finder is not used here.
        reactant_str, bond_preds, bond_scores, _ = core_output
        outcomes = self.directcandranker.predict(
            reactant_str, bond_preds, bond_scores)
    except RuntimeError as e:
        logging.error(f"Error occured in DirectCandRanker.predict: {e}")
        raise e

    products = []
    for candidate in outcomes:
        candidate_smiles = candidate["smiles"]
        if not candidate_smiles:
            # Candidates may come back without any SMILES; skip those.
            continue
        product = Molecule(candidate_smiles[0])
        product.set_synthesis(reaction.inputs)
        products.append(product)

    top_products = products[:k]
    reaction.set_products(top_products)
    return top_products
def _draw_node(self, node: Molecule):
    """
    Add `node` to the graph, rendered according to the current draw mode.

    The node counter is incremented on every call. The node is always named
    by its SMILES; the label is the SMILES ("smiles" mode), the formula
    ("formula" mode), or empty with an attached image ("plot" mode). Any
    other mode only bumps the counter.

    :param node: molecule to add as a graph node
    """
    import os

    self._node_counter += 1
    mode = self._draw_mode

    if mode == "smiles":
        self._dot.node(name=node.to_smiles(), label=node.to_smiles())
        return

    if mode == "formula":
        self._dot.node(name=node.to_smiles(), label=node.to_formula())
        return

    if mode == "plot":
        # One image file per node, named after the running counter.
        image_name = "{}.png".format(self._node_counter)
        mol_img_path = os.path.join(self._sub_dir, image_name)
        visualize_mol(node, path=mol_img_path)
        self._dot.node(name=node.to_smiles(), label="",
                       image=mol_img_path, shape="plaintext")
def _test_sas(self):
    """
    Manual check: run a RandomExplorer that optimizes the SA score.

    Prints the SA score of "CC", runs the explorer for 10 steps on a small
    hand-picked pool, then prints the timing, the resulting pool and the
    synthesis path of the best molecule.
    """
    def sas_func(mol):
        return calculateSAScore(Chem.MolFromSmiles(mol.smiles))

    print(sas_func(Molecule("CC")))

    seed_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O",
                   "C1CCCCC1", "c1ccccc1"]
    test_pool = [Molecule(smiles) for smiles in seed_smiles]
    exp = RandomExplorer(sas_func, initial_pool=test_pool)

    print("Starting SA score optimization")
    t0 = time()
    exp.run(10)
    elapsed = time() - t0
    print("Completed SA score optimization, time elapsed: %.3fs" % elapsed)

    print(exp.pool)
    best = exp.get_best(1)[0]
    print(best.get_synthesis_path())
def get_chembl(n_mols=None, as_mols=True, option='', max_size=1000):
    """
    Return list of molecules read from ChEMBL.txt.
    NOTE: this function should be located in the same directory
    as data files.

    Arguments:
        n_mols {int} -- number of leading lines to read; None reads the
            whole file. Asking for more lines than the file holds simply
            returns all of them.
        as_mols {bool} -- whether to wrap SMILES into the Molecule class;
            when False the raw SMILES strings are returned
        option {str} -- '', 'small_qed' (qed < 0.6) or
            'large_qed' (qed >= 0.6)
        max_size {int} -- number of molecules to sample with a fixed seed
            when at least that many were read

    Raises:
        ValueError -- if `option` is not supported (only checked when
            sampling takes place)
    """
    from itertools import islice

    path = os.path.join(__location__, "ChEMBL.txt")
    with open(path, "r") as f:
        # islice stops at EOF, so no empty SMILES are produced when n_mols
        # exceeds the number of lines; islice(f, None) reads everything.
        smiles_list = [line.strip() for line in islice(f, n_mols)]

    mols = [Molecule(smile) for smile in smiles_list] if as_mols else smiles_list

    # Fewer entries than requested: returned as-is, without filtering.
    if len(mols) < max_size:
        return mols

    # Fixed seed, so the sampled subset is the same on every call.
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))

    if option == '':
        return mols
    elif option == 'small_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) < 0.6]
    elif option == 'large_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
def test_chembl(self):
    """
    Run a RandomExplorer against properties looked up from the ChEMBL table.

    Problem with fixed-prop testing: almost all of the results (<10% for
    an initial pool of 50, and even fewer for a smaller pool) seem to fall
    outside of the database, so their score cannot be looked up for
    testing; setting them to zero leads to slow exploration.
    """
    all_smiles, dd = get_chembl_prop()
    # Loading with mol conversions enabled takes about 8 minutes, hence
    # conv_enabled=False below.
    pool_all = [Molecule(smiles, conv_enabled=False) for smiles in all_smiles]
    start_pool = list(np.random.choice(pool_all, size=100, replace=False))

    def func(mol):
        # Property lookup by SMILES.
        return dd[mol.smiles]

    def print_props(pool):
        props = [func(mol) for mol in pool]
        print("Props of pool", len(pool),
              np.min(props), np.mean(props), np.max(props))

    def score_first(mol_list):
        # The explorer is given a callable that takes a list of molecules.
        return func(mol_list[0])

    print_props(pool_all)
    print_props(start_pool)

    exp = RandomExplorer(score_first, initial_pool=start_pool)
    print("Starting ChEMBL score 1 optimization")
    t0 = time()
    exp.run(30)
    elapsed = time() - t0
    print("Completed ChEMBL score 1 optimization, time elapsed: %.3fs" % elapsed)

    top = exp.get_best(1)[0]
    print(top.get_synthesis_path())
    print("Best achieved score: %.3f" % func(top))

    all_props = [func(mol) for mol in pool_all]
    print("Best possible score: %.3f" % np.max(all_props))
def get_min_score(syn):
    """
    Return the smallest SA score over all leaf molecules of a synthesis graph.

    `syn` maps a molecule key to either a string (a leaf entry) or a nested
    mapping of the same shape. Leaves are scored with `sa_score` after
    wrapping the key in `Molecule`; both names come from the enclosing
    scope. Keys of nested (non-leaf) entries are not scored themselves.

    Returns float('inf') for an empty mapping.
    """
    res = float('inf')
    for mol, syn_graph in syn.items():
        if isinstance(syn_graph, str):
            # Leaf: fold it into the running minimum and keep scanning.
            # Returning here would ignore every remaining branch.
            res = min(res, sa_score(Molecule(mol)))
        else:
            res = min(res, get_min_score(syn_graph))
    return res
def get_graph_data_for_distance_computation(mol):
    """
    Build the graph description of a molecule used for distance computation.

    A string argument is first wrapped into a Molecule. The RDKit molecule
    is taken with hydrogens added, and the returned Namespace bundles its
    adjacency matrix, bond list and bond types, per-atom indices, atomic
    numbers, symbols and masses, plus the per-atom neighbour/bond
    information and bond-type counts computed by the helper functions.
    """
    if isinstance(mol, str):
        from mols.molecule import Molecule
        mol = Molecule(mol)

    rdk_mol = Chem.AddHs(mol.to_rdkit())
    adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(rdk_mol)

    # Bonds as (begin atom index, end atom index) pairs.
    bonds = []
    for bond in rdk_mol.GetBonds():
        bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
    bond_types = [
        rdk_mol.GetBondBetweenAtoms(begin, end).GetBondType()
        for begin, end in bonds
    ]

    # Per-atom properties, all listed in atom-index order.
    atom_idxs = list(range(len(rdk_mol.GetAtoms())))
    atoms = [rdk_mol.GetAtomWithIdx(idx) for idx in atom_idxs]
    atomic_numbers = [atom.GetAtomicNum() for atom in atoms]
    atomic_symbols = [atom.GetSymbol() for atom in atoms]
    atomic_masses = [atom.GetMass() for atom in atoms]
    num_atoms = len(atom_idxs)

    bonds_of_each_atom = []
    for idx in range(num_atoms):
        bonds_of_each_atom.append(
            get_neighbors_and_bond_types(idx, bonds, atomic_symbols, bond_types))
    bond_type_counts_of_each_atom = [
        get_bond_type_counts(atom_bonds) for atom_bonds in bonds_of_each_atom
    ]

    return Namespace(
        rdk_mol=rdk_mol,
        adj_matrix=adj_matrix,
        bonds=bonds,
        bond_types=bond_types,
        atom_idxs=atom_idxs,
        atomic_numbers=atomic_numbers,
        atomic_symbols=atomic_symbols,
        atomic_masses=atomic_masses,
        num_atoms=num_atoms,
        bonds_of_each_atom=bonds_of_each_atom,
        bond_type_counts_of_each_atom=bond_type_counts_of_each_atom,
    )
def draw_molecule(mol: Molecule) -> PIL.Image.Image:
    """
    Render a single molecule as an image.

    :param mol: molecule to draw
    :return: image produced by `Draw.MolToImage` for the RDKit form of `mol`
    """
    rdk_mol = mol.to_rdkit()
    return Draw.MolToImage(rdk_mol)
def compute_synthesizability(exp_path):
    """
    Return the SA score of the molecule reported in an experiment log.

    Scans `<exp_path>/exp_log` for lines containing 'Resulting molecule'
    and takes the third whitespace-separated token of the last such line
    as the SMILES of the result.

    Arguments:
        exp_path {str} -- directory that contains the `exp_log` file

    Returns:
        The value of the "sascore" objective for the resulting molecule,
        or None when the log reports no resulting molecule.

    Raises:
        IndexError -- if a matching line has fewer than three tokens
    """
    smiles = None
    with open(os.path.join(exp_path, 'exp_log'), 'r') as f:
        for line in f:
            if 'Resulting molecule' in line:
                # Later matches override earlier ones; only the last one
                # is turned into a Molecule below.
                smiles = line.split()[2]

    # Explicit None check: whether a result was found must not depend on
    # the truthiness of a Molecule instance.
    if smiles is None:
        return None

    # Objective is looked up only once there is something to score.
    sas = get_objective_by_name("sascore")
    return sas(Molecule(smiles=smiles))
def _test_len(self):
    """
    Manual check: run a RandomExplorer whose objective is the SMILES length.

    Runs the explorer for 2 steps on a small hand-picked pool and prints
    the resulting pool.
    """
    def dummy_func(mol):
        return len(mol.smiles)

    seed_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O",
                   "C1CCCCC1", "c1ccccc1"]
    test_pool = [Molecule(smiles) for smiles in seed_smiles]
    exp = RandomExplorer(dummy_func, initial_pool=test_pool)
    print("Starting len of SMILES optimization")
    exp.run(2)
    print(exp.pool)
def draw_synthesis_path(target_smiles: str, synth_path: str, out_path: str) -> None:
    """Draw the synthesis path and save to provided location.

    Previously `synth_path` was ignored and a hard-coded pickle was always
    loaded; it is now honoured.

    :param target_smiles: SMILES of the molecule being synthesized
    :param synth_path: either a dictionary of format SMILES m -> synpath of m,
        or the path of a pickle file holding such a dictionary. When empty
        or None, the legacy default "./mols/best_molecule.pkl" is used.
    :param out_path: where to save the resulting pdf
    """
    legacy_pickle_path = "./mols/best_molecule.pkl"

    if isinstance(synth_path, dict):
        synpath = synth_path
    else:
        pickle_path = synth_path or legacy_pickle_path
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this with pickle files from a trusted source.
        with open(pickle_path, "rb") as f:
            synpath = pickle.load(f)

    synpath = smile_synpath_to_mols(Molecule(smiles=target_smiles), synpath)
    drawer = SynPathDrawer(synpath, "plot")
    drawer.render(out_path)
def get_chembl(option='', max_size=1000, as_mols=True):
    """
    Return list of Molecules.
    NOTE: this function should be located in the same directory
    as data files.

    Arguments:
        option {str} -- either empty or of format
            '{small,large}_{objective name}'; the objective name may
            itself contain underscores
        max_size {int} -- number of molecules to sample; if None or -1,
            returns all, else randomly samples a subset.
            Attention: the sampler uses a fixed random seed, so the subset
            will always be the same.
        as_mols {bool} -- whether to wrap SMILES into the Molecule class

    Raises:
        ValueError -- if `option` is not supported (only checked when
            sampling takes place)
    """
    path = os.path.join(__location__, "ChEMBL.txt")
    with open(path, "r") as f:
        mols = [line.strip() for line in f]
    if as_mols:
        mols = [Molecule(smile) for smile in mols]

    # None is the documented "return everything" value; -1 is kept as an
    # equivalent spelling for existing callers.
    if max_size is None or max_size == -1:
        max_size = len(mols)
    if len(mols) <= max_size:
        # Nothing to sample: returned as-is, without applying `option`.
        return mols

    # TODO: this logic is off, if filtering afterwards,
    # we get less than max_size molecules in the end.
    # Fix this if needed.
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))

    if option == '':
        return mols
    elif option.startswith('small_'):
        # maxsplit=1 keeps objective names that contain underscores intact.
        obj_name = option.split("_", 1)[1]
        obj_func = get_objective_by_name(obj_name)
        small_thresh = get_threshold(obj_name, mode='low')
        return [mol for mol in mols if obj_func(mol) < small_thresh]
    elif option.startswith('large_'):
        obj_name = option.split("_", 1)[1]
        obj_func = get_objective_by_name(obj_name)
        large_thresh = get_threshold(obj_name, mode='high')
        return [mol for mol in mols if obj_func(mol) >= large_thresh]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
def compute_min_sa_score(mol):
    """
    Compute the minimum SA score over the leaves of the synthesis path
    of a molecule.

    If `mol.get_synthesis_path()` returns a dict, the minimum is taken over
    all of its leaf entries (those whose value is a string), at any nesting
    depth. Otherwise the returned value itself is wrapped in a Molecule and
    scored.

    Returns the minimum "sascore" objective value; float('inf') if the
    synthesis path is an empty dict.
    """
    sa_score = get_objective_by_name("sascore")

    def get_min_score(syn):
        res = float('inf')
        for node, syn_graph in syn.items():
            if isinstance(syn_graph, str):
                # Leaf: fold it into the running minimum and keep scanning.
                # Returning here would ignore every remaining branch.
                res = min(res, sa_score(Molecule(node)))
            else:
                res = min(res, get_min_score(syn_graph))
        return res

    synthesis_path = mol.get_synthesis_path()
    if isinstance(synthesis_path, dict):
        min_sa_score = get_min_score(synthesis_path)
    else:
        min_sa_score = sa_score(Molecule(synthesis_path))
    return min_sa_score
def __init__(self, mol: Molecule, draw_mode: str):
    """
    Set up a graphviz digraph (pdf output) for the synthesis path of `mol`.

    :param mol: the molecule to draw synthesis path for
    :param draw_mode: "smiles" | "formula" | "plot"
        way of plotting each single molecule

    Examples::

        >>> drawer = SynPathDrawer(root_mol, "smiles")  # or "formula" or "plot"
        >>> drawer.render("some_output_dir/some_file_name")  # please, no file extension
    """
    assert draw_mode in ["smiles", "formula", "plot"]
    from graphviz import Digraph

    graph_comment = "Synthesis path for {}".format(mol.to_smiles())

    self._mol = mol
    self._draw_mode = draw_mode
    self._node_counter = 0  # incremented for every node drawn
    self._sub_dir = None
    self._dot = Digraph(comment=graph_comment, format="pdf")
def get_zinc250(option='', max_size=1000):
    """
    Return list of Molecules read from zinc250k.csv.
    NOTE: this function should be located in the same directory
    as data files.

    Arguments:
        option {str} -- '', 'small_qed' (qed < 0.6) or
            'large_qed' (qed >= 0.6)
        max_size {int} -- number of molecules to sample with a fixed seed
            when at least that many are available

    Raises:
        ValueError -- if `option` is not supported (only checked when
            sampling takes place)
    """
    path = os.path.join(__location__, "zinc250k.csv")
    zinc_df = pd.read_csv(path)
    # other columns are logP, qed, and sas
    list_of_smiles = [smile.strip() for smile in zinc_df.smiles.values]

    # Previously iterated over an undefined name `res`, which raised
    # NameError on every call.
    mols = [Molecule(smile) for smile in list_of_smiles]

    # Fewer entries than requested: returned as-is, without filtering.
    if len(mols) < max_size:
        return mols

    # Fixed seed, so the sampled subset is the same on every call.
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))

    if option == '':
        return mols
    elif option == 'small_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) < 0.6]
    elif option == 'large_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
def get_zinc250(option='', max_size=1000, as_mols=True):
    """
    Return list of Molecules.
    NOTE: this function should be located in the same directory
    as data files.

    Arguments:
        option {str} -- either empty or of format
            '{small,large}_{objective name}'; the objective name may
            itself contain underscores
        max_size {int} -- number of molecules to sample; if None or -1,
            returns all, else randomly samples a subset.
            Attention: the sampler uses a fixed random seed, so the subset
            will always be the same.
        as_mols {bool} -- whether to wrap SMILES into the Molecule class;
            when False the raw SMILES strings are returned

    Raises:
        ValueError -- if `option` is not supported (only checked when
            sampling takes place)
    """
    path = os.path.join(__location__, "zinc250k.csv")
    zinc_df = pd.read_csv(path)
    # other columns are logP, qed, and sas
    mols = [smile.strip() for smile in zinc_df.smiles.values]
    if as_mols:
        mols = [Molecule(smile) for smile in mols]

    # None is the documented "return everything" value; -1 is kept as an
    # equivalent spelling for existing callers.
    if max_size is None or max_size == -1:
        max_size = len(mols)
    if len(mols) <= max_size:
        # Nothing to sample: returned as-is, without applying `option`.
        return mols

    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))

    # NOTE(review): the 0.6 cut-off is applied to whichever objective is
    # named in `option` -- confirm it is meaningful beyond qed.
    if option == '':
        return mols
    elif option.startswith('small_'):
        # maxsplit=1 keeps objective names that contain underscores intact.
        obj_func = get_objective_by_name(option.split("_", 1)[1])
        return [mol for mol in mols if obj_func(mol) < 0.6]
    elif option.startswith('large_'):
        obj_func = get_objective_by_name(option.split("_", 1)[1])
        return [mol for mol in mols if obj_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
except RuntimeError as e: logging.error(f"Error occured in DirectCandRanker.predict: {e}") raise e res = [] for out in outcomes: if out["smiles"]: # may be empty for some reason? smiles = out["smiles"][0] mol = Molecule(smiles) mol.set_synthesis(reaction.inputs) res.append(mol) else: continue res = res[:k] # setting predicted products, if not already set: reaction.set_products(res) return res if __name__=="__main__": list_of_mols = ["[CH3:26][c:27]1[cH:28][cH:29][cH:30][cH:31][cH:32]1", "[Cl:18][C:19](=[O:20])[O:21][C:22]([Cl:23])([Cl:24])[Cl:25]", "[NH2:1][c:2]1[cH:3][cH:4][c:5]([Br:17])[c:6]2[c:10]1[O:9][C:8]"+ "([CH3:11])([C:12](=[O:13])[O:14][CH2:15][CH3:16])[CH2:7]2" ] list_of_mols = [Molecule(smiles) for smiles in list_of_mols] t = RexgenForwardSynthesizer() reaction = Reaction(list_of_mols) t.predict_outcome(reaction)
def setUp(self):
    """Create the three fixture molecules shared by the tests."""
    fixture_smiles = (
        "Cc1ccccc1",
        "C1OC1",
        "CCOC(=O)C1=C[C@@H](OC(CC)CC)[C@H](NC(C)=O)[C@@H](N)C1",
    )
    self.mols = [Molecule(smiles) for smiles in fixture_smiles]
def members_are_equal(cls, point_1, point_2):
    """
    Return True if both points have the same SMILES string.

    Technically, because SMILES are not unique,
    this may sometimes give false negatives.
    TODO: graph structure matching?
    """
    # Compare the actual arguments; the previous body referenced the
    # undefined names `mol1`/`mol2` and raised NameError on every call.
    return point_1.to_smiles() == point_2.to_smiles()


def __str__(self):
    """
    Returns a string representation: the mol type followed by the
    constraint-checker attributes (empty when no checker is set).
    """
    cc_attrs = ""
    if hasattr(self, "constraint_checker") and self.constraint_checker is not None:
        cc_attrs = {key: getattr(self.constraint_checker, key)
                    for key in self.constraint_checker.constraint_names}
    return 'Mol(%s):%s' % (self.mol_type, cc_attrs)


# Different constraint checker functions(Molecule -> bool) --------------------
def has_carbon(mol):
    """
    Return True if the RDKit form of `mol` contains an atom with symbol 'C'.

    Also prints the list of atomic symbols and the result.
    """
    rdk = mol.to_rdkit()
    atomic_symbols = [rdk.GetAtomWithIdx(idx).GetSymbol()
                      for idx in range(len(rdk.GetAtoms()))]
    mol_has_carbon = ('C' in atomic_symbols)
    print(atomic_symbols, mol_has_carbon)
    return mol_has_carbon


if __name__ == "__main__":
    mol = Molecule("C=C1NC(N(C)C)=NC12CCN(CC(C)c1ccccc1)CC2")
    has_carbon(mol)
def setUp(self):
    """Create the single fixture molecule (SMILES "CC") used by the tests."""
    fixture_mol = Molecule("CC")
    self.mol = fixture_mol
# Unfinished draft, kept commented out: lay the synthesis path out as rows
# of molecule images and stack them into a single picture.
#
# def draw_synthesis_path(mol):
#     def compute_depth(syn_path):
#         depth = 1
#         if not mol.begin_flag:
#             for inp, inp_syn_path in syn_path:
#                 inp_depth = compute_depth(inp_syn_path)
#                 depth = max(depth, inp_depth)
#         return depth
#
#     syn_path = mol.get_syn_path()
#     depth = compute_depth(syn_path)  # number of rows to allocate for plotting
#     imgs_per_row = []
#     min_shape = None
#
#     # traverse the synthesis path and append images to imgs_per_row
#     # each row should be concatenated: see
#     # https://stackoverflow.com/questions/30227466/combine-several-images-horizontally-with-python
#
#     # TODO
#
#     imgs_comb = np.vstack([np.asarray(img.resize(min_shape))
#                            for img in imgs_per_row])
#     result_img = PIL.Image.fromarray(imgs_comb)
#     return result_img


if __name__ == "__main__":
    # Smoke run: draw one molecule and write the image to disk.
    demo_mol = Molecule("CCCC")
    draw_molecule(demo_mol).save('./experiments/results/test.png')
def _draw_edge(self, tail: Molecule, head: Molecule):
    """
    Add a directed edge from `tail` to `head`; both ends are referenced
    by their SMILES strings.

    :param tail: molecule the edge starts from
    :param head: molecule the edge points to
    """
    tail_name = tail.to_smiles()
    head_name = head.to_smiles()
    self._dot.edge(tail_name=tail_name, head_name=head_name)