def extract_shingles(self, individual):
    """
    Collect the fragment SMILES ("shingles") of the atom environments of the given individual.

    For every atom index and every radius from 1 up to self.radius, the bonds returned by
    FindAtomEnvironmentOfRadiusN are converted into a fragment SMILES. The fragment is rooted at
    the central atom when self.rooted is truthy, otherwise -1 is passed as root.
    :param individual: object exposing to_aromatic_smiles() and a mol_graph with GetNumAtoms()
    :return: set of fragment SMILES
    """
    shingles = set()
    max_radius = self.radius

    # Reloading the molecule from its aromatic SMILES to make it aromatic
    mol = MolFromSmiles(individual.to_aromatic_smiles())

    for center_idx in range(individual.mol_graph.GetNumAtoms()):
        for radius in range(1, max_radius + 1):
            env_bonds = AllChem.FindAtomEnvironmentOfRadiusN(mol, radius, center_idx)

            # No environment at this radius: larger radii are not tried for this atom
            if not env_bonds:
                break

            # Gathering the atoms located at both ends of every bond of the environment
            env_atoms = set()
            for bond_idx in env_bonds:
                env_bond = mol.GetBondWithIdx(bond_idx)
                env_atoms.update((env_bond.GetBeginAtomIdx(), env_bond.GetEndAtomIdx()))

            root_idx = center_idx if self.rooted else -1
            shingles.add(Chem.rdmolfiles.MolFragmentToSmiles(
                mol, list(env_atoms), env_bonds, 0, 0, False, False, root_idx, True, False, False))

    return shingles
def rdkit_mmff94_xyz(smiles, **kwargs):
    """
    Returns the string of the XYZ file obtained performing the MMFF94 molecular mechanics optimization of the given
    SMILES using RDKit.
    Writing temporary files in $MM_WORKING_DIR if defined or otherwise in /tmp
    :param smiles: input_SMILES
    :param max_iterations: max number of iterations (default 500)
    :return : XYZ string of optimized geometry, success (whether the MM optimization was successful and the smiles has
    stayed identical after optimization). (None, False) is returned if any exception is raised while computing the
    geometry.
    """

    working_dir = os.environ.get("MM_WORKING_DIR", "/tmp")

    # Applying the documented default. Indexing kwargs directly raised a KeyError that was swallowed by the
    # except clause below, so every call without max_iterations silently reported a failure.
    max_iterations = kwargs.get("max_iterations", 500)

    # Converting the molecule to RDKit object
    mol = MolFromSmiles(smiles)
    smi_canon = MolToSmiles(mol)

    # Setting paths (the PID is part of the name so that concurrent processes use distinct files)
    filename_smiles = str(os.getpid()) + "_" + smi_to_filename(smi_canon)
    xyz_path = join(working_dir, filename_smiles + '.xyz')
    post_MM_smi_path = join(working_dir, filename_smiles + '.smi')

    # Computing geometry
    try:

        # Adding implicit hydrogens
        mol = AddHs(mol)

        # MM optimization
        EmbedMolecule(mol)
        value = MMFFOptimizeMolecule(mol, maxIters=max_iterations)

        # Success if returned value is null
        success_RDKIT_output = value == 0

        # Computing XYZ from optimized molecule
        xyz_str = MolToXYZBlock(mol)

        # Writing optimized XYZ to file
        with open(xyz_path, "w") as f:
            f.write(xyz_str)

        # Success if the optimization has converged and the post MM smiles is identical the pre MM smiles
        success = success_RDKIT_output and check_identical_geometries(xyz_path, smi_canon, post_MM_smi_path)

    except Exception:
        # Best effort: any failure is reported through the success flag
        success = False
        xyz_str = None
    finally:
        # Removing files
        remove_files([post_MM_smi_path, xyz_path])

    return xyz_str, success
def get_mol_from_smiles(smiles: str) -> Mol:
    """
    Build a molecule with coordinates from the given SMILES.

    A conformer is embedded (up to 5000 attempts) and UFF-optimized on the molecule with explicit
    hydrogens, which are removed afterwards. If a ValueError is raised along the way, the SMILES
    is parsed again and 2D coordinates are computed instead.
    :param smiles: input SMILES
    :return: the molecule carrying the computed coordinates
    """
    parsed = MolFromSmiles(smiles)
    try:
        with_hs = Chem.AddHs(parsed)
        AllChem.EmbedMolecule(with_hs, maxAttempts=5000)
        AllChem.UFFOptimizeMolecule(with_hs)
        return Chem.RemoveHs(with_hs)
    except ValueError:
        # Falling back to a 2D depiction of a freshly parsed molecule
        flat = MolFromSmiles(smiles)
        AllChem.Compute2DCoords(flat)
        return flat
def load_pop_from_smiles_list(self, smiles_list, atom_mutability=True):
    """
    Loading the population from the given smiles list.
    Setting the internal variables to their values
    :param smiles_list: list of SMILES (shuffled in place if self.shuffle_init_pop is set)
    :param atom_mutability: whether the core of the molecules of the starting population can be modified
    :return:
    """

    if self.shuffle_init_pop:
        np.random.shuffle(smiles_list)

    # Filling the population slots in the order of the given SMILES
    for slot, smiles in enumerate(smiles_list):
        self.pop[slot] = MolGraph(MolFromSmiles(smiles), sanitize_mol=True, mutability=atom_mutability)
        individual = self.pop[slot]

        # The aromatic SMILES initializes both the tabu list and the actions history
        self.pop_tabu_list[slot] = individual.to_aromatic_smiles()
        self.actions_history[slot] = individual.to_aromatic_smiles()

    # Evaluation of the population (not recording the count of calls to the objective function)
    print("Computing scores at initialization...")
    strategy = self.evaluation_strategy
    strategy.set_params(**self.evaluation_strategy_parameters["evaluate_init_pop"])
    strategy.disable_calls_count()
    strategy.compute_record_scores_init_pop(self.pop)
    strategy.enable_calls_count()

    # Switching to the parameters used for the evaluation of new solutions
    strategy.set_params(**self.evaluation_strategy_parameters["evaluate_new_solution"])
def compute_mol_legend(action_history_k, smi, action_history_scores, legend_scores_keys_strat=None):
    """
    Build the legend of a molecule from the requested scores.

    The first score is written alone and the following ones are written between square brackets,
    separated by commas (e.g. "1.00 [2.50, 3.14]"). All scores are formatted with two decimals.
    :param action_history_k: key of the molecule in action_history_scores
    :param smi: SMILES of the molecule (used when a score is computed by an EvaluationStrategy)
    :param action_history_scores: dictionary mapping a molecule key to a dictionary of its scores
    :param legend_scores_keys_strat: iterable of score keys (str) or EvaluationStrategy instances, or None
    :return: (legend string, list of the score values in the given order)
    """
    scores_float = []
    if legend_scores_keys_strat is None:
        return "", scores_float

    formatted_scores = []
    for key_strat in legend_scores_keys_strat:
        if isinstance(key_strat, str):
            # Recorded score
            score = action_history_scores[action_history_k][key_strat]
        elif isinstance(key_strat, EvaluationStrategy):
            # Score computed from the molecule
            score = key_strat.evaluate_individual(MolGraph(MolFromSmiles(smi), sanitize_mol=True))
        else:
            score = None

        scores_float.append(score)
        formatted_scores.append("{:.2f}".format(score))

    if not formatted_scores:
        return "", scores_float

    legend = formatted_scores[0]
    if len(formatted_scores) > 1:
        legend += " [" + ", ".join(formatted_scores[1:]) + "]"

    return legend, scores_float
def evaluate_individual(self, individual):
    """
    Compute the QED score of the given individual.
    :param individual: object exposing to_aromatic_smiles(), or None
    :return: None if the individual is None, otherwise the tuple (score, [score])
    """
    if individual is None:
        return None

    # qed() is applied to the molecule rebuilt from the aromatic SMILES
    score = qed(MolFromSmiles(individual.to_aromatic_smiles()))
    return score, [score]
def compute_mol_attributes(graph, labels_dict, actions_history_smi_pop, actions_history_smi_removed,
                           actions_history_scores_pop, actions_history_scores_removed,
                           legend_scores_keys_strat=None):
    """
    Set the "image" and "score_label" attributes of the nodes of the given graph.

    For every key of labels_dict, the SMILES and scores are read from the *_pop dictionaries if the key
    is in actions_history_smi_pop, and from the *_removed dictionaries otherwise. The molecule is drawn
    as an 800x800 image that is cropped with crop_image_with_transparency, and its legend is computed
    with compute_mol_legend.
    :param graph: graph whose node attributes are set
    :param labels_dict: dictionary whose keys are the nodes to process
    :param actions_history_smi_pop: SMILES of the individuals found in the population
    :param actions_history_smi_removed: SMILES of the other individuals
    :param actions_history_scores_pop: scores matching actions_history_smi_pop
    :param actions_history_scores_removed: scores matching actions_history_smi_removed
    :param legend_scores_keys_strat: forwarded to compute_mol_legend
    :return:
    """
    images_by_node = {}
    legends_by_node = {}

    drawing_options = DrawingOptions()
    drawing_options.coordScale = 0.9
    drawing_options.dotsPerAngstrom = 30

    for node_key in labels_dict:

        # Selecting the dictionaries describing the current node
        if node_key in actions_history_smi_pop:
            smi_source, scores_source = actions_history_smi_pop, actions_history_scores_pop
        else:
            smi_source, scores_source = actions_history_smi_removed, actions_history_scores_removed

        smi = smi_source[node_key]
        image = MolToImage(MolFromSmiles(smi), size=(800, 800), options=drawing_options)
        images_by_node[node_key] = crop_image_with_transparency(image)

        # Only the legend string is kept, the score values are discarded
        legends_by_node[node_key] = compute_mol_legend(node_key, smi, scores_source,
                                                       legend_scores_keys_strat)[0]

    nx.set_node_attributes(graph, images_by_node, "image")
    nx.set_node_attributes(graph, legends_by_node, "score_label")
def test_react(reactant, expected_products):
    """
    Check that reacting the given reactant yields the expected products.
    :param reactant: SMILES of the reactant
    :param expected_products: expected output of mols2smiles on the products
    """
    reactor = Reactor()

    # The reactant is embedded with the ETKDG parameters before the reaction
    mol = MolFromSmiles(reactant)
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())

    product_smiles = mols2smiles(reactor.react(mol))
    assert product_smiles == expected_products
def _transform(self, x):
    """
    Return the identifiers of the Morgan fingerprint bits set for the given record.

    The molecule is built from the 'standard_inchi' field, falling back to the 'Compound_SMILES'
    field when the InChI cannot be read or parsed.
    :param x: mapping with the 'standard_inchi' and/or 'Compound_SMILES' fields
    :return: list of the keys of the bitInfo dictionary filled by the fingerprint computation
    """
    try:
        mol = MolFromInchi(x['standard_inchi'])
    except Exception:
        # Missing field or value of an unsupported type (a bare except would also trap KeyboardInterrupt)
        mol = None

    # MolFromInchi returns None rather than raising when the InChI cannot be parsed,
    # so the SMILES fallback must also cover that case
    if mol is None:
        mol = MolFromSmiles(x['Compound_SMILES'])

    info = {}
    AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, self.dim, bitInfo=info)
    return list(info.keys())
def get_score_components(smiles):
    '''
    Compute the non-normalized score components of a molecule given as SMILES.
    Thin wrapper parsing the SMILES and delegating to get_score_components_from_mol.
    :param smiles: a VALID smiles string
    :return: a tuple of floats
    '''
    return get_score_components_from_mol(MolFromSmiles(smiles))
def _set_target_fps(self, pickaxe: Pickaxe):
    """
    Append the fingerprint of every target SMILES of the given pickaxe to self.target_fps.

    A Morgan bit vector built with self.fingerprint_args is used when self.fingerprint_method is
    "Morgan", RDKFingerprint otherwise.
    :param pickaxe: object exposing the target_smiles iterable
    """
    for target_smiles in pickaxe.target_smiles:
        target_mol = MolFromSmiles(target_smiles)
        use_morgan = self.fingerprint_method == "Morgan"
        fingerprint = (
            AllChem.GetMorganFingerprintAsBitVect(target_mol, **self.fingerprint_args)
            if use_morgan
            else RDKFingerprint(target_mol)
        )
        self.target_fps.append(fingerprint)
def evaluate_individual(self, individual):
    """
    Compute logP minus the synthetic accessibility score minus a large ring penalty.

    The ring penalty is the size of the largest ring (self.get_largest_ring_size) beyond 6 atoms.
    :param individual: object exposing to_aromatic_smiles()
    :return: tuple (score, [score])
    """
    mol = MolFromSmiles(individual.to_aromatic_smiles())

    log_p = Descriptors.MolLogP(mol)
    sa_score = sascorer.calculateScore(mol)

    # Rings of at most 6 atoms are not penalized
    ring_penalty = max(self.get_largest_ring_size(mol) - 6, 0)

    score = log_p - sa_score - ring_penalty
    return score, [score]
def draw_mol_labels(labels_dict, actions_history_smi_pop, actions_history_smi_removed,
                    actions_history_scores_pop, actions_history_scores_removed,
                    legend_scores_keys_strat=None, problem_type="max", mols_per_row=4, draw_n_mols=None):
    """
    Draw a grid image of the labelled molecules, sorted by their first score.

    Only the entries of labels_dict with a non-empty label are drawn. The SMILES and scores are read from
    the *_pop dictionaries if the key is in actions_history_smi_pop, from the *_removed ones otherwise.
    :param labels_dict: dictionary mapping a molecule key to its label ("" for molecules not to be drawn)
    :param actions_history_smi_pop: SMILES of the individuals found in the population
    :param actions_history_smi_removed: SMILES of the other individuals
    :param actions_history_scores_pop: scores matching actions_history_smi_pop
    :param actions_history_scores_removed: scores matching actions_history_smi_removed
    :param legend_scores_keys_strat: forwarded to compute_mol_legend; the first score is the sorting key
    :param problem_type: "max" to draw the highest scores first, any other value for ascending order
    :param mols_per_row: number of molecules per row of the grid
    :param draw_n_mols: number of molecules to draw after sorting (all of them if None)
    :return: image returned by MolsToGridImage
    """
    smi_to_draw = {}
    legends_to_draw = {}
    scores_float = {}

    for action_history_k, label in labels_dict.items():
        if label == "":
            continue

        # Selecting the dictionaries describing the current molecule
        if action_history_k in actions_history_smi_pop:
            smi_source, scores_source = actions_history_smi_pop, actions_history_scores_pop
        else:
            smi_source, scores_source = actions_history_smi_removed, actions_history_scores_removed

        smi = smi_source[action_history_k]
        legend, scores = compute_mol_legend(action_history_k, smi, scores_source, legend_scores_keys_strat)
        smi_to_draw[label] = smi
        legends_to_draw[label] = legend
        scores_float[label] = scores

    labels = list(smi_to_draw)
    mols = [MolFromSmiles(smi_to_draw[label]) for label in labels]
    legends = [legends_to_draw[label] for label in labels]

    # Sorting molecules on their first score. Without any score (e.g. legend_scores_keys_strat is None)
    # there is no sorting key, so the insertion order is kept instead of failing on scores[0].
    if all(len(scores_float[label]) > 0 for label in labels):
        sorted_order = np.argsort([scores_float[label][0] for label in labels])
        if problem_type == "max":
            sorted_order = sorted_order[::-1]
    else:
        sorted_order = list(range(len(labels)))

    # Filtering molecules if necessary. The sorted order is truncated (rather than the unsorted
    # molecules) so that the best molecules are kept and no index falls out of range.
    if draw_n_mols is not None:
        sorted_order = sorted_order[:draw_n_mols]

    legends = [legends[i] for i in sorted_order]
    mols = [mols[i] for i in sorted_order]

    img = MolsToGridImage(mols, legends=legends, molsPerRow=mols_per_row, subImgSize=(200, 200))

    return img
def get_all_metrics(smiles):
    """
    Compute a table of metrics for the given SMILES.

    The columns are, in order: sum of the scores, sum of the normalized scores, score of index 1,
    normalized score of index 1, number of aromatic rings. Scores come from
    NormalizedScorer.get_scores_from_mols.
    :param smiles: iterable of SMILES
    :return: tuple (smiles, metrics array with one row per molecule)
    """
    mols = [MolFromSmiles(one_smiles) for one_smiles in smiles]
    scores, norm_scores = NormalizedScorer().get_scores_from_mols(mols)
    aromatic_ring_counts = np.array([Descriptors.NumAromaticRings(mol) for mol in mols])

    columns = [
        scores.sum(axis=1),
        norm_scores.sum(axis=1),
        scores[:, 1],
        norm_scores[:, 1],
        aromatic_ring_counts,
    ]

    # Each 1D column is turned into a column vector before being stacked side by side
    metrics = np.concatenate([column[:, None] for column in columns], axis=1)
    return (smiles, metrics)
def extract_shingles(smiles, level, as_list=False):
    """
    Extracting the rooted shingles up to the given level from the given smiles
    see https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8
    :param smiles: input SMILES
    :param level: maximum radius of the atom environments
    :param as_list: whether to return a list (keeping duplicates) instead of a set
    :return: list or set of fragment SMILES
    """
    mol = MolFromSmiles(smiles)
    found_shingles = []

    for center_idx in range(mol.GetNumAtoms()):
        for radius in range(1, level + 1):
            env_bonds = AllChem.FindAtomEnvironmentOfRadiusN(mol, radius, center_idx)

            # No environment at this radius: larger radii are not tried for this atom
            if not env_bonds:
                break

            # Gathering the atoms located at both ends of every bond of the environment
            env_atoms = set()
            for bond_idx in env_bonds:
                env_bond = mol.GetBondWithIdx(bond_idx)
                env_atoms.update((env_bond.GetBeginAtomIdx(), env_bond.GetEndAtomIdx()))

            # Shingle rooted at the central atom
            found_shingles.append(Chem.rdmolfiles.MolFragmentToSmiles(
                mol, list(env_atoms), env_bonds, 0, 0, False, False, center_idx, True, False, False))

    return found_shingles if as_list else set(found_shingles)
def extract_descriptors(self, individual):
    """
    Returning the descriptor(s) extracted from the given individual, depending on self.descriptor_key:
    "gen_scaffolds" (SMILES of the generic Murcko scaffold), "ifg" (functional groups), "atoms" (atom types),
    "shg_1" (shingles of level 1) or "checkmol" (output of extract_checkmol)
    :param individual: individual to describe
    :return: list of descriptors, or None if self.descriptor_key is none of the keys above
    """
    key = self.descriptor_key

    if key == "gen_scaffolds":
        generic_scaffold = MurckoScaffold.MakeScaffoldGeneric(MolFromSmiles(individual.to_smiles()))
        return [MolToSmiles(generic_scaffold)]

    if key == "ifg":
        groups = ifg.identify_functional_groups(MolFromSmiles(individual.to_smiles()))
        # Keeping the distinct values of the field of index 2 of every functional group
        return list({group[2] for group in groups})

    if key == "atoms":
        return list(set(individual.get_atom_types()))

    if key == "shg_1":
        return list(extract_shingles(individual, 1))

    if key == "checkmol":
        return list(set(extract_checkmol(individual)))

    return None
def obabel_mmff94_xyz(smiles, **kwargs):
    """
    Returns the string of the XYZ file obtained performing the MMFF94 molecular mechanics optimization of the given
    SMILES using obabel.
    Writing temporary files in $MM_WORKING_DIR if defined or otherwise in /tmp
    The obabel executable is looked up in $OPT_LIBS/obabel/openbabel-2.4.1/bin/obabel
    :param smiles : input SMILES
    :return : XYZ string of optimized geometry, success (whether the MM optimization was successful and the smiles has
    stayed identical after optimization). (None, False) is returned if any exception is raised while computing the
    geometry.
    """
    import subprocess

    working_dir = os.environ.get("MM_WORKING_DIR", "/tmp")

    # Computing RDKIT canonical SMILES
    smi_canon = MolToSmiles(MolFromSmiles(smiles))
    filename_smiles = str(os.getpid()) + "_" + smi_to_filename(smi_canon)

    # Computing files paths
    smi_path = join(working_dir, filename_smiles + ".smi")
    xyz_path = join(working_dir, filename_smiles + ".xyz")
    post_MM_smi_path = join(working_dir, filename_smiles + ".post_MM.smi")

    try:

        # Writing smiles to file
        with open(smi_path, "w") as f:
            f.write(smi_canon)

        # Converting SMILES to XYZ after computing MM (Obabel MMFF94).
        # The command is given as an argument list so that no shell parses the paths: with a concatenated
        # command string, spaces or shell metacharacters in a path would break (or alter) the command.
        obabel_bin = join(os.getenv("OPT_LIBS"), "obabel/openbabel-2.4.1/bin/obabel")
        subprocess.run([obabel_bin, "-ismi", smi_path, "-oxyz", "-O", xyz_path, "--gen3d"],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        # Reading XYZ string (fails, and thus reports a failure, if obabel did not write the file)
        with open(xyz_path, "r") as f:
            xyz_str = f.read()

        # Success if the post MM smiles is identical the pre MM smiles
        success = check_identical_geometries(xyz_path, smi_canon, post_MM_smi_path)

    except Exception:
        # Best effort: any failure is reported through the success flag
        success = False
        xyz_str = None
    finally:
        # Removing files
        remove_files([smi_path, xyz_path, post_MM_smi_path])

    return xyz_str, success
def load_obabel_smi(smi_path):
    """
    Converting the SMILES found on the first line of the given file into a RDKit SMILES, after removing the
    stereochemistry information and the hydrogens
    :param smi_path: path of the SMILES file
    :return: SMILES obtained from a RDKit round trip (SMILES -> molecule -> SMILES)
    """
    # Only the first line of the file is read
    with open(smi_path, "r") as smi_file:
        first_line = smi_file.readline()

    # RemoveStereochemistry works in place, RemoveHs returns a new molecule
    mol = MolFromSmiles(first_line)
    RemoveStereochemistry(mol)
    mol_without_hs = RemoveHs(mol)

    # The SMILES is parsed and written a second time
    first_pass_smi = MolToSmiles(mol_without_hs)
    return MolToSmiles(MolFromSmiles(first_pass_smi))
def load_obabel_smi(smi_path, sanitize_mol):
    """
    Converting the SMILES found on the first line of the given file into the aromatic SMILES of the
    corresponding MolGraph, after removing the stereochemistry information and the hydrogens.
    Both the read SMILES and the resulting SMILES are printed.
    :param smi_path: path of the SMILES file
    :param sanitize_mol: forwarded to the MolGraph constructor
    :return: aromatic SMILES
    """
    # Only the first line of the file is read
    with open(smi_path, "r") as smi_file:
        obabel_smi = smi_file.readline()

    print("obabel new smi : " + obabel_smi)

    # RemoveStereochemistry works in place, RemoveHs returns a new molecule
    mol = MolFromSmiles(obabel_smi)
    RemoveStereochemistry(mol)
    graph = MolGraph(RemoveHs(mol), sanitize_mol=sanitize_mol)

    rdkit_smi = graph.to_aromatic_smiles()
    print("rdkit new smi : " + rdkit_smi)

    return rdkit_smi
def load_pop_from_smiles_list(self, smiles_list, atom_mutability=True):
    """
    Loading the population from the given smiles list.
    Setting the internal variables to their values
    :param smiles_list: list of SMILES
    :param atom_mutability: whether the core of the molecules of the starting population can be modified
    :return:
    """

    # Filling the population slots in the order of the given SMILES
    for slot, smiles in enumerate(smiles_list):
        self.pop[slot] = MolGraph(MolFromSmiles(smiles), sanitize_mol=True, mutability=atom_mutability)
        individual = self.pop[slot]

        # The aromatic SMILES initializes both the tabu list and the actions history
        self.pop_tabu_list[slot] = individual.to_aromatic_smiles()
        self.actions_history[slot] = individual.to_aromatic_smiles()

    # Evaluation of the population
    print("Computing descriptors at initialization...")
    self.evaluation_strategy.compute_record_scores(self.pop)
def evaluate_individual(self, individual):
    """
    Sum of the normalized logP, normalized (negated) synthetic accessibility score and normalized
    (negated) large ring penalty of the given individual.
    from https://github.com/bowenliu16/rl_graph_generation/blob/master/gym-molecule/gym_molecule/envs/molecule.py
    :param individual: object exposing to_aromatic_smiles()
    :return: tuple (score, [score])
    """

    # normalization constants, statistics from 250k_rndm_zinc_drugs_clean.smi
    logP_mean, logP_std = 2.4570953396190123, 1.434324401111988
    SA_mean, SA_std = -3.0525811293166134, 0.8335207024513095
    cycle_mean, cycle_std = -0.0485696876403053, 0.2860212110245455

    mol = MolFromSmiles(individual.to_aromatic_smiles())

    log_p = Descriptors.MolLogP(mol)
    SA = -sascorer.calculateScore(mol)

    # cycle score: opposite of the number of atoms beyond 6 in the largest cycle of the cycle basis
    cycles = nx.cycle_basis(nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
    largest_cycle = max((len(cycle) for cycle in cycles), default=0)
    cycle_score = -max(largest_cycle - 6, 0)

    normalized_log_p = (log_p - logP_mean) / logP_std
    normalized_SA = (SA - SA_mean) / SA_std
    normalized_cycle = (cycle_score - cycle_mean) / cycle_std

    score = normalized_log_p + normalized_SA + normalized_cycle
    return score, [score]
def __init__(self, smiles):
    """
    Store the molecule parsed from the given SMILES in self._mol.
    :param smiles: input SMILES
    """
    parsed_mol = MolFromSmiles(smiles)
    self._mol = parsed_mol
def get_scores(self, smiles):
    """
    Compute the scores of the molecules given as SMILES.
    :param smiles: iterable of SMILES
    :return: output of self.get_scores_from_mols on the parsed molecules
    """
    parsed_mols = [MolFromSmiles(one_smiles) for one_smiles in smiles]
    scores = self.get_scores_from_mols(parsed_mols)
    return scores
def initialize(self):
    """
    Initialization of EvoMol with starting values.
    This method MUST BE CALLED BEFORE running the algorithm.
    Reads self.pop_max_size, self.external_tabu_list, self.evaluation_strategy and
    self.kth_score_to_record_key, and (re)sets all the other internal variables.
    :return:
    """

    # Initialization of population
    self.pop = list(np.full((self.pop_max_size,), None))

    # Initialization of the dictionaries containing the smiles of former and current individuals as keys
    self.pop_tabu_list = list(np.full((self.pop_max_size,), None))

    # Intialization of the list of all individual ever inserted in the population, the list of their
    # corresponding number of calls to the objective function at insertion and the list of the corresponding steps.
    # Also recording the values of the objective function
    self.all_generated_individuals_smiles = []
    self.all_generated_individuals_n_obj_calls = []
    self.all_generated_individuals_step = []
    self.all_generated_individuals_obj_value = []
    self.all_generated_individuals_improver = []
    self.all_generated_individuals_success_obj_computation = []

    # Insuring the SMILES of the external tabu list are canonical
    if self.external_tabu_list is not None:
        self.external_tabu_list = [MolGraph(MolFromSmiles(smi)).to_aromatic_smiles()
                                   for smi in self.external_tabu_list]

    # Initialization of the dictionary containing the traces of steps of the algorithm
    self.step_traces = {
        'scores': {},
        'n_replaced': [],
        'additional_values': {},
        'timestamps': []
    }

    # Initialization of keys in the self.step_traces dict declared by the evaluation strategy instance
    for k in self.evaluation_strategy.keys() + ["total"]:
        for stat in ["mean", "med", "min", "max", "std"]:
            self.step_traces["scores"][k + "_" + stat] = []

    # Initialization of keys in the self.step_traces dict for additional population scores
    for k in self.evaluation_strategy.get_additional_population_scores().keys():
        print(k)
        self.step_traces['additional_values'][k] = []

    # Initialization of the step counter.
    self.curr_step_id = 0

    # Initialization of errors list
    self.errors = []

    self.curr_total_scores = None
    self.curr_scores = None
    self.timestamp_start = None

    # Computing idx of kth score to be recorded vector
    # (self.kth_score_to_record_idx is left untouched if no key matches)
    for i, k in enumerate(self.evaluation_strategy.keys()):
        if k == self.kth_score_to_record_key:
            self.kth_score_to_record_idx = i

    self.kth_score_history = deque(maxlen=500)

    # The builtin int is used as dtype: the np.int alias was removed from NumPy and raises an AttributeError
    self.n_success_mut = np.zeros(self.pop_max_size, dtype=int)
    self.n_fail_mut = np.zeros(self.pop_max_size, dtype=int)
    self.actions_history = list(np.full(self.pop_max_size, None))
    self.removed_actions_score_smi_tuple = {}

    # Computing start timestamp
    self.timestamp_start = time.time()
def mol_from_smiles(smiles):
    """
    Convert a SMILES, or a collection of SMILES, into RDKit molecule(s).
    :param smiles: a single SMILES string or an iterable of SMILES strings
    :return: the output of MolFromSmiles for a single string, a list of such outputs for an iterable
    """
    # isinstance is required here: comparing type(smiles) with the string 'str' is always False,
    # which made a single SMILES be parsed character by character
    if isinstance(smiles, str):
        return MolFromSmiles(smiles)

    # assume we have a list-like
    return [MolFromSmiles(s) for s in smiles]
def edit_smiles(request):
    """
    Render the chemical editor for the molecule given by the 'SMILES' GET parameter.
    :param request: request whose GET parameters are read (an empty SMILES is used if 'SMILES' is absent)
    :return: rendering of the 'cspace/chemical-editor.html' template with the mol block as 'molblock'
    """
    query_smiles = request.GET.get('SMILES', '')
    context = {'molblock': MolToMolBlock(MolFromSmiles(query_smiles))}
    return render(request, 'cspace/chemical-editor.html', context)
if pre_parser(t) is not None])) print(len(these_smiles)) these_actions = my_model.strings_to_actions(these_smiles) action_seq_length = my_model.action_seq_length(these_actions) onehot = my_model.actions_to_one_hot(these_actions) append_data = { 'smiles': np.array(these_smiles, dtype=dt), 'indices': np.array(these_indices), 'actions': these_actions, 'valid': np.ones((len(these_smiles))), 'seq_len': action_seq_length, 'data': onehot } if molecules: from rdkit.Chem.rdmolfiles import MolFromSmiles mols = [MolFromSmiles(s) for s in these_smiles] raw_scores = np.array([get_score_components_from_mol(m) for m in mols]) append_data['raw_scores'] = raw_scores num_atoms = np.array([len(m.GetAtoms()) for m in mols]) append_data['num_atoms'] = num_atoms ds.append(append_data) if molecules: # also calculate mean and std of the scores, to use in the ultimate objective raw_scores = np.array(ds.h5f['raw_scores']) score_std = raw_scores.std(0) score_mean = raw_scores.mean(0) ds.append_to_dataset('score_std', score_std) ds.append_to_dataset('score_mean', score_mean)
def get_mol(self):
    """
    Parse self.smiles with MolFromSmiles (a new parse is done on every call).
    :return: output of MolFromSmiles
    """
    parsed_mol = MolFromSmiles(self.smiles)
    return parsed_mol
def main():
    """
    Build the HDF5 dataset of encoded strings from the source data file declared in the settings.

    The strings (at most the first 10000 lines) are processed in batches of 100: each batch is converted
    to actions and one-hot encodings and appended to the destination file. For molecules, the raw score
    components and the number of atoms are also stored, followed by the mean and std of the raw scores.
    """
    # change this to False to produce the equation dataset
    molecules = True
    # change this to False to get character-based encodings instead of grammar-based
    grammar = 'new'  # use True for the grammar used by Kusner et al

    # can't define model class inside settings as it itself uses settings a lot
    _, my_model = get_vae(molecules, grammar)

    def pre_parser(x):
        # Returns the first parse of the given tokens, or None if they cannot be parsed
        try:
            return next(my_model._parser.parse(x))
        except Exception:
            return None

    settings = get_settings(molecules, grammar)
    dest_file = settings['data_path']
    source_file = settings['source_data']

    # Read in the strings (the context manager closes the file even if reading fails)
    with open(source_file, 'r') as f:
        L = [line.strip() for line in f]

    # convert to one-hot and save, in small increments to save RAM
    ds = IncrementingHDF5Dataset(dest_file)

    step = 100
    dt = h5py.special_dtype(vlen=str)  # PY3 hdf5 datatype for variable-length Unicode strings
    size = min(10000, len(L))
    for i in tqdm(range(0, size, step)):
        these_indices = list(range(i, min(i + step, len(L))))
        these_smiles = L[i:min(i + step, len(L))]
        if grammar == 'new':
            # have to weed out non-parseable strings
            tokens = [my_model._tokenize(s.replace('-c', 'c')) for s in these_smiles]
            parseable = [(s, ind) for s, t, ind in zip(these_smiles, tokens, these_indices)
                         if pre_parser(t) is not None]

            # A batch without any parseable string has nothing to store
            # (unpacking zip(*[]) would raise a ValueError)
            if not parseable:
                continue

            these_smiles, these_indices = list(zip(*parseable))

        these_actions = torch.tensor(my_model.strings_to_actions(these_smiles))
        action_seq_length = my_model.action_seq_length(these_actions)
        onehot = my_model.actions_to_one_hot(these_actions)
        append_data = {'smiles': np.array(these_smiles, dtype=dt),
                       'indices': np.array(these_indices),
                       'actions': these_actions,
                       'valid': np.ones((len(these_smiles))),
                       'seq_len': action_seq_length,
                       'data': onehot}
        if molecules:
            from rdkit.Chem.rdmolfiles import MolFromSmiles
            mols = [MolFromSmiles(s) for s in these_smiles]
            raw_scores = np.array([get_score_components_from_mol(m) for m in mols])
            append_data['raw_scores'] = raw_scores
            num_atoms = np.array([len(m.GetAtoms()) for m in mols])
            append_data['num_atoms'] = num_atoms

        ds.append(append_data)

    if molecules:
        # also calculate mean and std of the scores, to use in the ultimate objective
        raw_scores = np.array(ds.h5f['raw_scores'])
        score_std = raw_scores.std(0)
        score_mean = raw_scores.mean(0)
        ds.append_to_dataset('score_std', score_std)
        ds.append_to_dataset('score_mean', score_mean)

    print('success!')
def _is_radical(self, mol):
    """
    Tell whether the given molecule has radical electrons.
    :param mol: object exposing to_aromatic_smiles()
    :return: True if NumRadicalElectrons is non null for the molecule rebuilt from the aromatic SMILES
    """
    rdkit_mol = MolFromSmiles(mol.to_aromatic_smiles())
    radical_electrons = NumRadicalElectrons(rdkit_mol)
    return radical_electrons != 0