def __call__(self, smiles: str):
    """Return True when *smiles* passes every property range and alert rule.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and each descriptor is
    compared against the inclusive bounds stored at index 0 and 1 of
    ``self.rule_dict[key]`` for the keys ``"MW"``, ``"LogP"``, ``"HBD"``,
    ``"HBA"`` and ``"TPSA"``.  The molecule is then matched against every
    ``(pattern, max_val, description)`` row of ``self.rule_list``; a pattern
    that matches more than ``max_val`` times rejects the molecule.

    :param smiles: SMILES string of the molecule to test
    :return: True if all checks pass; False otherwise, including when the
        SMILES cannot be parsed into a molecule
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable input: reject it instead of handing None to the
        # descriptor functions below.
        return False

    # (rule_dict key, descriptor function), checked in this order; the first
    # out-of-range descriptor short-circuits the remaining ones.
    descriptor_checks = (
        ("MW", MolWt),
        ("LogP", MolLogP),
        ("HBD", NumHDonors),
        ("HBA", NumHAcceptors),
        ("TPSA", TPSA),
    )
    for key, descriptor in descriptor_checks:
        bounds = self.rule_dict[key]
        if not (bounds[0] <= descriptor(mol) <= bounds[1]):
            return False

    # Structural alerts: the description column is not needed for a yes/no answer.
    for patt, max_val, _desc in self.rule_list:
        if len(mol.GetSubstructMatches(patt)) > max_val:
            return False
    return True
def evaluate(self, lst_in):
    """
    Evaluate structure alerts for one molecule record
    :param lst_in: pair of [SMILES, Name]
    :return: list of [SMILES, Name, status] followed by six descriptor values
             (MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA,
             CalcNumRotatableBonds). status is 'INVALID' (with every
             descriptor set to -999) when the SMILES cannot be parsed,
             "<description> > <max_val>" for the first rule in
             self.rule_list that is exceeded, or "OK"
    """
    smiles, name = lst_in
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [smiles, name, 'INVALID'] + [-999] * 6

    descriptor_fns = (MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA,
                      CalcNumRotatableBonds)
    descriptors = [fn(mol) for fn in descriptor_fns]

    status = "OK"
    for patt, max_val, desc in self.rule_list:
        if len(mol.GetSubstructMatches(patt)) > max_val:
            # Only the first offending rule is reported.
            status = desc + " > %d" % (max_val)
            break
    return [smiles, name, status] + descriptors
def reward_target_logp(mol, target, ratio=0.5, max=4):
    """
    Reward that is highest when the logP of a molecule equals a target value
    :param mol: rdkit mol object
    :param target: float, the desired logP
    :param ratio: float, divisor applied to the deviation from the target
    :param max: reward obtained when the logP equals the target
    :return: float (-inf, max]
    """
    deviation = np.abs((MolLogP(mol) - target) / ratio)
    return max - deviation
def worker(line):
    """Turn one input line into a ``"<smiles> <cid> H<nn><sign><logp>\\n"`` record.

    The first two whitespace-separated fields of *line* are taken as the
    SMILES and the compound id.  When the SMILES contains a ``.`` the parsed
    molecule is passed through the module-level ``remover`` first.  The key
    holds the heavy-atom count (capped at 99, two digits), ``M`` for a
    negative logP or ``P`` otherwise, and the absolute value of
    ``scale_logp_value(logp)`` zero-padded to three characters.

    Returns None when the SMILES does not yield a usable molecule.
    """
    fields = line.strip().split()
    smiles, cid = fields[:2]

    mol = MolFromSmiles(smiles)
    if not mol:
        return None

    if '.' in smiles:
        mol = remover.StripMol(mol)

    logp = MolLogP(mol)
    heavy_atoms = min(mol.GetNumHeavyAtoms(), 99)
    if logp < 0.0:
        sign = 'M'
    else:
        sign = 'P'
    scaled = abs(scale_logp_value(logp))
    return f'{smiles} {cid} H{heavy_atoms:02}{sign}{scaled:03}\n'
def get_normalized_values(
        fname='/home/bowen/pycharm_deployment_directory/rl_graph_generation/gym-molecule/gym_molecule/dataset/250k_rndm_zinc_drugs_clean.smi'):
    """Compute mean/std of the SA score, logP and long-cycle penalty of a dataset.

    Each non-blank line of *fname* is read as one SMILES string.  Every
    molecule is round-tripped through ``Chem.MolToSmiles`` before scoring, and
    three values are collected per molecule:

    * ``-calculateScore(mol)`` (negated SA score),
    * ``MolLogP(mol)``,
    * minus the number of atoms by which the largest cycle in the networkx
      cycle basis exceeds six (0 when no cycle is longer than six).

    The zero-based line index is printed after each molecule as progress
    output.

    :param fname: path of the SMILES file; defaults to the location the
        function previously had hard-coded
    :return: tuple ``(SA mean, SA std, logP mean, logP std, cycle mean,
        cycle std)``
    :raises ValueError: if a non-blank line cannot be parsed as SMILES
    """
    sa_scores = []
    logp_values = []
    cycle_scores = []

    with open(fname) as f:
        for i, line in enumerate(f):
            smi = line.strip()
            if not smi:
                # Blank lines (e.g. a trailing newline) carry no molecule.
                continue

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(
                    f'could not parse SMILES {smi!r} on line {i + 1} of {fname}')
            # Score the molecule rebuilt from its RDKit-written SMILES, as the
            # statistics were always computed on that form.
            mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))

            logp_values.append(MolLogP(mol))
            sa_scores.append(-calculateScore(mol))

            cycles = nx.cycle_basis(
                nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
            longest = max((len(cycle) for cycle in cycles), default=0)
            cycle_scores.append(-max(longest - 6, 0))
            print(i)

    return (np.mean(sa_scores), np.std(sa_scores),
            np.mean(logp_values), np.std(logp_values),
            np.mean(cycle_scores), np.std(cycle_scores))
def get_normalized_values(smi_filename):
    """Compute mean/std of the SA score, logP and long-cycle penalty of a dataset.

    Each non-blank line of *smi_filename* is read as one SMILES string.  The
    molecule is rebuilt from the SMILES RDKit writes for it, then scored once
    for all three quantities:

    * ``MolLogP(mol)``,
    * ``-calculateScore(mol)`` (negated SA score),
    * minus the number of atoms by which the largest cycle in the networkx
      cycle basis exceeds six (0 when no cycle is longer than six).

    The zero-based line index is printed after each molecule as progress
    output.

    :param smi_filename: path of a text file with one SMILES per line
    :return: tuple ``(SA mean, SA std, logP mean, logP std, cycle mean,
        cycle std)``
    :raises ValueError: if a non-blank line cannot be parsed as SMILES
    """
    with open(smi_filename) as f:
        smiles = [line.strip() for line in f]

    logP_values = []
    SA_scores = []
    cycle_scores = []
    for i, smi in enumerate(smiles):
        if not smi:
            # Blank lines (e.g. a trailing newline) carry no molecule.
            continue

        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            raise ValueError(
                f'could not parse SMILES {smi!r} on line {i + 1} of '
                f'{smi_filename}')
        # One canonical round trip, then reuse the molecule for every score
        # instead of re-parsing it for each one.
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(parsed))

        logP_values.append(MolLogP(mol))
        SA_scores.append(-calculateScore(mol))

        cycle_list = nx.cycle_basis(
            nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
        largest = max(map(len, cycle_list), default=0)
        excess = largest - 6 if largest > 6 else 0
        cycle_scores.append(-excess)
        print(i)

    return (np.mean(SA_scores), np.std(SA_scores),
            np.mean(logP_values), np.std(logP_values),
            np.mean(cycle_scores), np.std(cycle_scores))
def reward_penalized_log_p(mol):
    """
    Penalized logP reward: logP combined with a synthetic-accessibility term
    and a long-cycle term, as described in (Kusner et al. 2017). Each term is
    standardized with statistics of the 250k_rndm_zinc_drugs_clean.smi
    dataset before the three are summed.

    Returns 0 when MolLogP raises ValueError or the SA scorer raises
    ZeroDivisionError.

    Code taken from implementation of:
    You, Jiaxuan, et al. "Graph Convolutional Policy Network for
    Goal-Directed Molecular Graph Generation." arXiv preprint
    arXiv:1806.02473 (2018).
    https://github.com/bowenliu16/rl_graph_generation
    """
    # (mean, std) pairs, statistics from 250k_rndm_zinc_drugs_clean.smi
    logp_mu, logp_sigma = 2.4570953396190123, 1.434324401111988
    sa_mu, sa_sigma = -3.0525811293166134, 0.8335207024513095
    cycle_mu, cycle_sigma = -0.0485696876403053, 0.2860212110245455

    try:
        log_p = MolLogP(mol)
    except ValueError:
        return 0

    try:
        sa = -sascorer.calculateScore(mol)
    except ZeroDivisionError:
        return 0

    # cycle score: atoms beyond six in the largest basis cycle, negated
    rings = nx.cycle_basis(nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
    largest = max((len(ring) for ring in rings), default=0)
    cycle_score = -max(largest - 6, 0)

    z_log_p = (log_p - logp_mu) / logp_sigma
    z_sa = (sa - sa_mu) / sa_sigma
    z_cycle = (cycle_score - cycle_mu) / cycle_sigma
    return z_log_p + z_sa + z_cycle
def smiles_reaction_matrix(smarts, *sources, **kwargs):
    """Yield one text record per reaction product that passes the cut-offs.

    The reaction is built from *smarts*; every entry of *sources* is loaded
    with ``load_smiles_file`` and the resulting lists are handed to
    ``reaction_matrix``, which supplies ``(reactants, product)`` pairs.

    Keyword arguments:
        sep: field separator of the emitted record (default ``' '``).
        molValue: a product is kept only if ``MolWt <= molValue``
            (converted with ``int``, default 400).
        logValue: a product is kept only if ``MolLogP < logValue``
            (converted with ``float``, default 4.0).

    Each yielded string is the isomeric SMILES, the reactants' ``_Name``
    properties joined with ``.``, the molecular weight and the logP, joined
    by *sep* and terminated with a newline.
    """
    sep = kwargs.get('sep', ' ')
    weight_limit = int(kwargs.get('molValue', 400))
    logp_limit = float(kwargs.get('logValue', 4.0))

    reaction = ReactionFromSmarts(smarts)
    smiles_lists = [load_smiles_file(source) for source in sources]

    for reactants, product in reaction_matrix(reaction, *smiles_lists):
        product_id = '.'.join(r.GetProp("_Name") for r in reactants)
        for mol in product:
            smiles = MolToSmiles(mol, isomericSmiles=True)
            mol.UpdatePropertyCache(strict=False)
            # NOTE(review): the hydrogen-added copy is never used; the call is
            # kept so that any error it raises still surfaces here.
            AddHs(mol, addCoords=True)

            mwt = MolWt(mol)
            if not mwt <= weight_limit:
                continue
            logp = MolLogP(mol)
            if not logp < logp_limit:
                continue
            record = sep.join((smiles, product_id, str(mwt), str(logp)))
            yield record + "\n"
def get_task(name_of_task: str):
    """
    Map a task name (eg one handed in as an argument to a script call) to the
    evaluator for that task.

    Recognised names are 'qed', 'sas', 'pen_logp' and any Guacamol task
    written as 'guac_<name>'; any other name raises NotImplementedError.

    NOTE(review): 'pen_logp' evaluates plain MolLogP here, with no penalty
    terms -- confirm that this is intended.
    """
    guac_prefix = 'guac_'

    if name_of_task == 'qed':
        return PropertyEvaluator(qed)

    if name_of_task == 'sas':
        def sa_score(smiles):
            return [sascorer.calculateScore(Chem.MolFromSmiles(smiles))]
        return PropertyEvaluator(sa_score)

    if name_of_task == 'pen_logp':
        def log_p(smiles):
            return [MolLogP(Chem.MolFromSmiles(smiles))]
        return PropertyEvaluator(log_p)

    if name_of_task[:len(guac_prefix)] == guac_prefix:
        # The part after the prefix selects the Guacamol task enum member.
        guac_name = name_of_task[len(guac_prefix):]
        task = GuacTask.get_name_to_enum()[guac_name]
        return GuacTask.get_guac_property_eval(task)

    raise NotImplementedError(f"{name_of_task} is not implemented.")
def penalized_logp(mol):
    """
    Penalized logP: logP combined with a synthetic-accessibility term and a
    long-cycle term, as described in (Kusner et al. 2017). Each term is
    standardized with the statistics of the 250k_rndm_zinc_drugs_clean.smi
    dataset and the three standardized terms are summed.

    Args:
        mol: Rdkit mol object

    :rtype:
        :class:`float`
    """
    # normalization constants, statistics from 250k_rndm_zinc_drugs_clean.smi
    logP_mean, logP_std = 2.4570953396190123, 1.434324401111988
    SA_mean, SA_std = -3.0525811293166134, 0.8335207024513095
    cycle_mean, cycle_std = -0.0485696876403053, 0.2860212110245455

    log_p = MolLogP(mol)
    sa = -calculateScore(mol)

    # cycle score: how far the largest basis cycle exceeds six atoms, negated
    adjacency = Chem.rdmolops.GetAdjacencyMatrix(mol)
    ring_sizes = [len(ring) for ring in nx.cycle_basis(nx.Graph(adjacency))]
    largest = max(ring_sizes) if ring_sizes else 0
    overshoot = largest - 6 if largest > 6 else 0
    cycle_score = -overshoot

    return (
        (log_p - logP_mean) / logP_std
        + (sa - SA_mean) / SA_std
        + (cycle_score - cycle_mean) / cycle_std
    )
def cal_prop(q, return_dict_prop):
    """Consume ``(idx, smi)`` items from *q* and store their properties.

    Items are fetched with ``q.get()`` until one equal to ``'DONE'`` arrives.
    For every other item the SMILES is parsed and
    ``[logP, SAS, QED, MW, TPSA]`` is written to ``return_dict_prop[idx]``.

    :param q: object whose ``get()`` returns the next item
    :param return_dict_prop: mapping that receives the property lists
    """
    # iter(callable, sentinel) stops as soon as q.get() returns 'DONE'.
    for idx, smi in iter(q.get, 'DONE'):
        mol = Chem.MolFromSmiles(smi)
        return_dict_prop[idx] = [
            MolLogP(mol),
            sascorer.calculateScore(mol),
            qed(mol),
            MolWt(mol),
            TPSA(mol),
        ]
def penalized_logp(molecule):
    """Computes the penalized logP of a molecule.

    Refactored from
    https://github.com/wengong-jin/icml18-jtnn/blob/master/bo/run_bo.py
    See Junction Tree Variational Autoencoder for Molecular Graph Generation
    https://arxiv.org/pdf/1802.04364.pdf
    Section 3.2

    The value returned is
        y(m) = logP(m) - SA(m) - cycle(m)
    where logP(m) is the logP of the molecule, SA(m) is its synthetic
    accessibility score and cycle(m) is the size of the largest ring minus
    six, floored at zero.

    Args:
        molecule: Chem.Mol. A molecule.

    Returns:
        Float. The penalized logP value.
    """
    log_p = MolLogP(molecule)
    sas_score = sascorer.calculateScore(molecule)

    ring_excess = get_largest_ring_size(molecule) - 6
    # Rings of six atoms or fewer carry no penalty.
    if ring_excess > 0:
        cycle_score = ring_excess
    else:
        cycle_score = 0

    return log_p - sas_score - cycle_score
canvas.addCanvasText('%s\r\nMolWt: %g\tTPSA: %g' % (s, MolWt(m), TPSA(m)), pos, font) with open('xx' + s + '.png', 'w') as f: canvas.flush() img.save(f) if __name__ == '__main__': drawmol('CN1CCC[C@H]1c2cccnc2') drawmol('CC(=O)OC1=CC=CC=C1C(=O)O') drawmol('O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5') sys.exit(0) # sample code to use new drawing API (older rdkit do not have DrawString) from rdkit.Chem.AllChem import EmbedMolecule assert EmbedMolecule(m) >= 0 x = Draw.rdMolDraw2D.MolDraw2DSVG(200, 250) x.DrawMolecule(m) x.DrawString('Test String', 20, 200) x.FinishDrawing() print(x.GetDrawingText()) # sample code to generate a legend legstr = '' if molname: legstr += molname + '\n' legstr += '%s\nWt=%g LogP=%g TPSA=%g\nHBA=%d HBD=%d RotBond=%d\n' % \ (smiles, MolWt(mol), MolLogP(mol), TPSA(mol), NumHAcceptors(mol), NumHDonors(mol), NumRotatableBonds(mol))
def _calculate_phys_chem_property(self, mol):
    """Return the value ``MolLogP`` computes for *mol*."""
    log_p = MolLogP(mol)
    return log_p
def calc_logp(smiles_string):
    """Parse a SMILES string (ex. C1CCCCC1) and return its LogP.

    NOTE(review): the parse result is passed to MolLogP unchecked; presumably
    an unparsable SMILES makes MolLogP fail -- confirm callers only pass
    valid SMILES.
    """
    return MolLogP(Chem.MolFromSmiles(smiles_string))
print('Usage: python rdkit_hlogp_batch.py <smiles> <batch_size>') exit() BATCH_SIZE = int(sys.argv[2]) hlogp_list = list() with open(sys.argv[1]) as smiles_file: file_lines = smiles_file.readlines() for line in file_lines: if line.strip(): smiles, cid = str(line).strip().split()[:2] mol = MolFromSmiles(smiles) remover = SaltRemover() res, deleted = remover.StripMolWithDeleted(mol) if res is not None: res.SetProp('_Name', cid) logp = MolLogP(res) num_heavy_atoms = res.GetNumHeavyAtoms() if num_heavy_atoms > 99: num_heavy_atoms = 99 scaled_logp = scale_logp_value(logp) if logp < 0.0: sign = 'M' #remove the minus sign so it's not printed scaled_logp = scaled_logp * -1 else: sign = 'P' key_string = 'H{:02}{}{:03}'.format(num_heavy_atoms, sign, scaled_logp) #store in list up to batch size, then write out to new file final_string = '{0} {1} {2}\n'.format(smiles, cid, key_string) hlogp_list.append(final_string)
def get_logp_score(states):
    """Return a list with the ``MolLogP`` value of every state.

    A single state may be passed on its own; anything that is not a ``list``
    is treated as one state and wrapped in a one-element list first.
    """
    batch = states if isinstance(states, list) else [states]
    scores = []
    for state in batch:
        scores.append(MolLogP(state))
    return scores
def calc_logp(smiles_string):
    """Return the value ``MolLogP`` computes for the molecule parsed from *smiles_string*."""
    parsed = Chem.MolFromSmiles(smiles_string)
    log_p = MolLogP(parsed)
    return log_p
check=False else: print(char1, char2, "error") error = True break X_d[istring+1]=j Y_d[istring]=j istring+=1 if error: continue for i in range(istring,seq_length): X_d[i+1]=char_dict['Y'] Y_d[i]=char_dict['Y'] m = Chem.MolFromSmiles(smiles) logP = MolLogP(m) SAS = sascorer.calculateScore(m) tpsa0 = TPSA(m) Xdata+=[X_d] Ydata+=[Y_d] Ldata+=[istring+1] cdd=[arr[1], logP/10.0, SAS/10.0, tpsa0/150.0] Pdata+=[cdd] #affinity classification Xdata = np.asarray(Xdata,dtype="int32") Ydata = np.asarray(Ydata,dtype="int32") Ldata = np.asarray(Ldata,dtype="int32") Pdata = np.asarray(Pdata,dtype="float32") print(Xdata.shape,Ydata.shape,Ldata.shape,Pdata.shape) data_dir2="./data/EGFR_property/"