def testNCI200(self):
    """Compare QED against the expected value of every NCI200 test record.

    Every record yielded by ``readTestData(dataNCI200)`` is scored twice:
    once as read, and once after ``Chem.AddHs`` has added explicit
    hydrogens. Both scores are asserted to be almost equal to the record's
    expected value.
    """
    message_template = 'QED not equal to expected in line {}'
    for record in readTestData(dataNCI200):
        failure_message = message_template.format(record.lineNo)
        self.assertAlmostEqual(QED.qed(record.mol), record.expected,
                               msg=failure_message)

        # Check that adding hydrogens will not change the result.
        # NOTE(review): the comment that used to sit here also said this is
        # "currently not the case" because hydrogens change the number of
        # rotatable bonds and the number of alerts, yet the assertion below
        # requires equality. Confirm which statement is up to date.
        with_hydrogens = Chem.AddHs(record.mol)
        self.assertAlmostEqual(QED.qed(with_hydrogens), record.expected,
                               msg=failure_message)
def test_examples(self):
    """Spot-check QED for four reference drugs to three decimal places.

    The trailing comment on each row keeps the name and value quoted in the
    original test; the asserted number is the third tuple element.
    """
    reference_cases = (
        ('c1cc2OCOc2cc1OCC1CNCCC1c1ccc(F)cc1', 0.934),  # Paroxetine 0.935
        ('C1=NOC(C)=C1C(=O)Nc1ccc(cc1)C(F)(F)F', 0.911),  # Leflunomide 0.929
        ('CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21', 0.818),  # Clomipramine 0.779
        ('CCCCCNC(=N)NN=CC1=CNc2ccc(CO)cc21', 0.235),  # Tegaserod 0.213
    )
    for smiles, expected in reference_cases:
        molecule = Chem.MolFromSmiles(smiles)
        self.assertAlmostEqual(QED.qed(molecule), expected, places=3)
def testNCI200(self):
    """Verify QED for each NCI200 record, with and without explicit hydrogens.

    For every record from ``readTestData(dataNCI200)`` the QED of the
    molecule, and of the molecule after ``Chem.AddHs``, must be almost equal
    to the record's expected value.
    """

    def check(mol, record):
        # One assertion shared by both variants of the molecule.
        self.assertAlmostEqual(
            QED.qed(mol), record.expected,
            msg='QED not equal to expected in line {}'.format(record.lineNo))

    for record in readTestData(dataNCI200):
        check(record.mol, record)
        # Check that adding hydrogens will not change the result.
        # NOTE(review): the previous comment claimed this is "currently not
        # the case" (hydrogens change the number of rotatable bonds and the
        # number of alerts); the assertion nevertheless demands equality.
        check(Chem.AddHs(record.mol), record)
def one_slurm_qed(list_smiles, unique_id, name):
    """Score SMILES strings with QED and dump the results to a CSV file.

    The file is written to
    ``<script_dir>/results/<name>/docking_small_results/<unique_id>.csv``
    with a ``smile,score`` header followed by one row per input SMILES.
    SMILES that RDKit cannot parse get a score of 0.

    :param list_smiles: iterable of SMILES strings to score
    :param unique_id: identifier used as the stem of the CSV file name
    :param name: name of the sub-directory of ``results`` to write into
    :return: None
    """
    dirname = os.path.join(script_dir, 'results', name, 'docking_small_results')
    # Create the target directory up front instead of failing on open().
    os.makedirs(dirname, exist_ok=True)
    dump_path = os.path.join(dirname, f"{unique_id}.csv")

    # Open the file once rather than re-opening it in append mode per row.
    with open(dump_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['smile', 'score'])
        for smile in list_smiles:
            m = Chem.MolFromSmiles(smile)
            score_smile = QED.qed(m) if m is not None else 0
            writer.writerow([smile, score_smile])
            # Flush per row so finished rows survive if the job is killed,
            # as they did when the file was closed after every row.
            csvfile.flush()
def calc(smi, name):
    """Compute a tuple of descriptors for one molecule.

    Args:
        smi: SMILES string of the molecule.
        name: identifier returned as the first tuple element and used in
            error messages.

    Returns:
        ``(name, hba, hbd, hba + hbd, nrings, rtb, psa, logp, mr, mw, csp3,
        fmf, qed, hac, nrings_fused)`` with the floating-point descriptors
        rounded, or ``None`` if the SMILES cannot be parsed or any
        descriptor calculation raises. In both failure cases a message is
        written to stderr.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # Terminate the line, as the other error message already does.
        sys.stderr.write('smiles %s cannot be parsed (%s)\n' % (smi, name))
        return None

    try:
        hba = rdMolDescriptors.CalcNumHBA(m)
        hbd = rdMolDescriptors.CalcNumHBD(m)
        nrings = rdMolDescriptors.CalcNumRings(m)
        rtb = rdMolDescriptors.CalcNumRotatableBonds(m)
        psa = rdMolDescriptors.CalcTPSA(m)
        logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)
        mw = rdMolDescriptors._CalcMolWt(m)
        csp3 = rdMolDescriptors.CalcFractionCSP3(m)
        hac = m.GetNumHeavyAtoms()
        # Fraction of heavy atoms that belong to the scaffold; guarded
        # against molecules without heavy atoms.
        fmf = GetScaffoldForMol(m).GetNumHeavyAtoms() / hac if hac else 0
        qed = QED.qed(m)
        nrings_fused = fused_ring_count(m)
        return (name, hba, hbd, hba + hbd, nrings, rtb, round(psa, 2),
                round(logp, 2), round(mr, 2), round(mw, 2), round(csp3, 3),
                round(fmf, 3), round(qed, 3), hac, nrings_fused)
    except Exception:
        # Best effort: skip molecules whose descriptors cannot be computed,
        # but let KeyboardInterrupt/SystemExit propagate.
        sys.stderr.write(
            f'molecule {name} was omitted due to an error in calculation of some descriptors\n')
        return None
def score_molecule(smiles):
    """Count satisfied Lipinski-style criteria for a SMILES and compute its QED.

    One point is awarded for each property that is strictly below its limit
    on ``LipinskiRuleOfFiveDecorator``: logP, molecular weight, H-bond
    donors, H-bond acceptors and rotatable bonds.

    Args:
        smiles: SMILES string of the molecule to score.

    Returns:
        ``(lipinski_score, qed)``. If anything raises, the exception is
        logged and ``lipinski_score`` is 0; ``qed`` keeps the sentinel
        ``MAX_QED + 1`` unless the QED value was computed.
    """
    limits = LipinskiRuleOfFiveDecorator
    # Sentinel just above MAX_QED, returned when QED could not be computed.
    qed = limits.MAX_QED + 1
    # The acceptor count used to be compared against the donor limit. Use a
    # dedicated acceptor limit when the class defines one; otherwise keep
    # the previous behaviour.
    # NOTE(review): confirm LipinskiRuleOfFiveDecorator.MAX_H_ACCEPTORS exists.
    max_h_acceptors = getattr(limits, 'MAX_H_ACCEPTORS', limits.MAX_H_DONORS)
    try:
        m = Chem.MolFromSmiles(smiles)
        lipinski_score = sum((
            Descriptors.MolLogP(m) < limits.MAX_LOGP,
            Descriptors.MolWt(m) < limits.MAX_MOL_WT,
            Lipinski.NumHDonors(m) < limits.MAX_H_DONORS,
            Lipinski.NumHAcceptors(m) < max_h_acceptors,
            Lipinski.NumRotatableBonds(m) < limits.MAX_ROTATABLE_BONDS,
        ))
        qed = QED.qed(m)
    except Exception as ex:
        lipinski_score = 0
        logger.exception(ex)
    return lipinski_score, qed
def sample_qed(data_smiles, max_num):
    """Draw ``max_num`` SMILES at random and compute the QED of each.

    Every sampled SMILES is printed together with its position and score.

    Args:
        data_smiles: population of SMILES strings to sample from.
        max_num: number of SMILES to draw (passed to ``random.sample``).

    Returns:
        ``(sampled_smiles, qed_values)`` as two lists in matching order.
    """
    chosen_smiles = random.sample(data_smiles, max_num)
    scores = []
    for position, smiles in enumerate(chosen_smiles):
        score = QED.qed(Chem.MolFromSmiles(smiles))
        scores.append(score)
        print(position, smiles, score)
    return (chosen_smiles, scores)
def policy_evaluate(self):
    """Query an MCTS player for moves on a fresh molecule environment.

    Builds an ``MCTSPlayer`` around the current policy-value function (30
    playouts), creates and initialises a ``Molecule`` environment starting
    from ``self.mol``, stores the QED of ``self.mol`` on it as ``init_qed``,
    and asks the player for an action.

    Note: this is only for monitoring the progress of training.

    Returns:
        ``(moves, _S_P, _Qs)`` as returned by ``player.get_action``; the
        second element of that call's result is discarded.
    """
    mcts_player = MCTSPlayer(
        self.policy_value_net.policy_value,
        c_puct=self.c_puct,
        n_playout=30,
    )

    env_options = dict(
        init_mol=self.mol,
        allow_removal=True,
        allow_no_modification=False,
        allow_bonds_between_rings=False,
        allowed_ring_sizes=[5, 6],
        max_steps=10,
        target_fn=None,
        record_path=False,
    )
    environment = Molecule(["C", "O", "N"], **env_options)
    environment.initialize()
    environment.init_qed = QED.qed(Chem.MolFromSmiles(self.mol))

    moves, _unused_fp, _S_P, _Qs = mcts_player.get_action(
        environment, temp=self.temp, return_prob=1, rand=False)
    return moves, _S_P, _Qs
def get_score(objective, mol, smiles2score=None):
    """Score a molecule for the given objective.

    Supported objectives: ``'qed'``, ``'docking'``, ``'sa'``, ``'mw'``,
    ``'logp'`` and ``'penalized_logp'``.

    Args:
        objective: name of the objective to evaluate.
        mol: molecule object handed to the scoring functions.
        smiles2score: optional dict used as a cache for ``'docking'``
            scores, keyed by the SMILES of ``mol``. It is updated in place.
            When omitted, the docking score is computed without caching.

    Returns:
        The score, or ``0.`` if the scorer raises ``ValueError``.

    Raises:
        NotImplementedError: for objectives containing ``'rand'`` and for
            any unknown objective.
    """
    try:
        if objective == 'qed':
            return QED.qed(mol)
        elif objective == 'docking':
            # The default None used to crash on the membership test below.
            cache = smiles2score if smiles2score is not None else {}
            smiles = Chem.MolToSmiles(mol)
            if smiles in cache:
                return cache[smiles]
            value = -oracle2(smiles)
            cache[smiles] = value
            return value
        elif objective == 'sa':
            x = sa_scorer.calculateScore(mol)
            return (10. - x) / 9.  # normalized to [0, 1]
        elif objective == 'mw':  # molecular weight
            return mw(mol)
        elif objective == 'logp':  # real number
            print('logp call')
            return Descriptors.MolLogP(mol)
        elif objective == 'penalized_logp':
            print('plogp call')
            return penalized_logp(mol)
        elif 'rand' in objective:
            raise NotImplementedError(
                'random objectives are not supported: {!r}'.format(objective))
        else:
            raise NotImplementedError(
                'unknown objective: {!r}'.format(objective))
    except ValueError:
        return 0.
def _reward(self):
    """Calculates the reward of the current state.

    The reward is the sum of two ``soft_cst`` terms, one for the SA score
    against ``FLAGS.target_sas`` +/- 0.2 and one for the QED value against
    ``FLAGS.target_qed`` +/- 0.1, multiplied by
    ``FLAGS.gamma ** (self.max_steps - self._counter)``.

    Returns:
        Float. The discounted reward, or 0.0 when there is no current state
        or it cannot be parsed as a molecule.
    """
    # The failure paths used to return the tuple (0.0, 0.0) although the
    # success path returns a single float; they now return a scalar too.
    if self._state is None:
        return 0.0
    mol = Chem.MolFromSmiles(self._state)
    if mol is None:
        return 0.0

    qed_value = QED.qed(mol)
    sas = SA_Score.sascorer.calculateScore(mol)
    sas_term = soft_cst(sas, FLAGS.target_sas - 0.2, FLAGS.target_sas + 0.2)
    qed_term = soft_cst(qed_value, FLAGS.target_qed - 0.1,
                        FLAGS.target_qed + 0.1)
    return (sas_term + qed_term) * FLAGS.gamma**(self.max_steps - self._counter)
def calc(smi, name):
    """Compute an extended tuple of descriptors for one molecule.

    Args:
        smi: SMILES string of the molecule.
        name: identifier returned as the first tuple element and used in
            error messages.

    Returns:
        ``(name, hba, hbd, hba + hbd, nrings, rtb, psa, logp, mr, mw, csp3,
        fmf, qed, hac, nrings_fused, n_unique_hba_hbd_atoms, max_ring_size,
        n_chiral_centers, fcsp3_bm)`` with the floating-point descriptors
        rounded, or ``None`` if the SMILES cannot be parsed or any
        descriptor calculation raises. In both failure cases a message is
        written to stderr.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # Terminate the line, as the other error message already does.
        sys.stderr.write('smiles %s cannot be parsed (%s)\n' % (smi, name))
        return None

    try:
        hba = rdMolDescriptors.CalcNumHBA(m)
        hbd = rdMolDescriptors.CalcNumHBD(m)
        nrings = rdMolDescriptors.CalcNumRings(m)
        rtb = rdMolDescriptors.CalcNumRotatableBonds(m)
        psa = rdMolDescriptors.CalcTPSA(m)
        logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)
        mw = rdMolDescriptors._CalcMolWt(m)
        csp3 = rdMolDescriptors.CalcFractionCSP3(m)
        hac = m.GetNumHeavyAtoms()
        # The scaffold feeds both fmf and fcsp3_bm, so compute it once.
        scaffold = GetScaffoldForMol(m)
        # Fraction of heavy atoms that belong to the scaffold; guarded
        # against molecules without heavy atoms.
        fmf = scaffold.GetNumHeavyAtoms() / hac if hac else 0
        qed = QED.qed(m)
        nrings_fused = fused_ring_count(m)
        n_unique_hba_hbd_atoms = count_hbd_hba_atoms(m)
        # default=() yields a size of 0 for molecules without rings.
        max_ring_size = len(max(m.GetRingInfo().AtomRings(), key=len, default=()))
        n_chiral_centers = len(FindMolChiralCenters(m, includeUnassigned=True))
        fcsp3_bm = rdMolDescriptors.CalcFractionCSP3(scaffold)
        return (name, hba, hbd, hba + hbd, nrings, rtb, round(psa, 2),
                round(logp, 2), round(mr, 2), round(mw, 2), round(csp3, 3),
                round(fmf, 3), round(qed, 3), hac, nrings_fused,
                n_unique_hba_hbd_atoms, max_ring_size, n_chiral_centers,
                round(fcsp3_bm, 3))
    except Exception:
        # Best effort: skip molecules whose descriptors cannot be computed,
        # but let KeyboardInterrupt/SystemExit propagate.
        sys.stderr.write(f'molecule {name} was omitted due to an error in calculation of some descriptors\n')
        return None
def run(self):
    """Run the training pipeline.

    For each of ``self.game_batch_num`` batches: collect self-play data,
    update the policy once the data buffer holds at least
    ``self.batch_size`` items, and every ``self.check_freq`` batches
    evaluate the policy, print the result and append the returned moves and
    their QED values to ``self.output_smi`` / ``self.output_qed``.
    A ``KeyboardInterrupt`` stops the loop and prints a quit message.
    """
    try:
        for batch_index in range(self.game_batch_num):
            batch_number = batch_index + 1
            self.collect_selfplay_data(self.play_batch_size)
            print("batch i: {}, episode_len: {}".format(
                batch_number, self.episode_len))

            if len(self.data_buffer) >= self.batch_size:
                loss, entropy = self.policy_update()
                print("loss is {} entropy is {}".format(loss, entropy))

            # Only check the performance of the current model every
            # check_freq batches.
            if batch_number % self.check_freq != 0:
                continue

            print("current self-play batch: {}".format(batch_number))
            move_list, _S_P, _Qs = self.policy_evaluate()
            print(move_list)
            print(_Qs)
            print(_S_P)
            self.output_smi.extend(move_list)

            o_qed = [QED.qed(Chem.MolFromSmiles(smi)) for smi in move_list]
            print(o_qed)
            print("#" * 30)
            self.output_qed.extend(o_qed)
    except KeyboardInterrupt:
        print('\n\rquit')
def _reward(self):
    """Calculates the reward of the current state.

    The reward is a tuple of the similarity and the QED value, each
    multiplied by ``FLAGS.gamma ** (self.max_steps - self._counter)``.

    Returns:
        A tuple ``(similarity, qed)``; ``(0.0, 0.0)`` when there is no
        current state or it cannot be parsed as a molecule.
    """
    if self._state is None:
        return 0.0, 0.0
    mol = Chem.MolFromSmiles(self._state)
    if mol is None:
        return 0.0, 0.0

    # If the current molecule does not contain the scaffold of the target,
    # similarity is zero.
    has_scaffold = molecules.contains_scaffold(mol, self._target_mol_scaffold)
    similarity_score = self.get_similarity(self._state) if has_scaffold else 0.0

    qed_value = QED.qed(mol)
    discount = FLAGS.gamma**(self.max_steps - self._counter)
    return similarity_score * discount, qed_value * discount
def check_qed(dataset):
    """Compare QED of generated SMILES with reference values from ``real.txt``.

    Reads the pickled SMILES from ``generated_smiles_<dataset>``, removes
    duplicates, and scores at most the first 5001 of them. The first line
    of ``real.txt`` is skipped; each following line supplies the reference
    QED for the next molecule. Every computed QED is also written to
    ``pred.txt``, one value per line.

    Args:
        dataset: suffix of the ``generated_smiles_<dataset>`` pickle file.

    Returns:
        ``(mean_qed, qed_score_per_molecule, diff, real, pred)``.
        ``mean_qed`` is 0.0 when no molecule could be scored. ``real`` also
        contains the reference value of molecules whose QED failed, so it
        can be longer than ``pred``.
    """
    with open('generated_smiles_%s' % dataset, 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code; only use this on
        # trusted files.
        # NOTE(review): a set has no guaranteed order, so pairing its items
        # with consecutive lines of real.txt looks fragile -- confirm intent.
        all_smiles = set(pickle.load(f))

    qed_sum = 0
    total = 0
    qed_score_per_molecule = []
    diff = []
    real = []
    pred = []

    # Context managers close both files even when an error escapes the loop.
    with open('real.txt', 'rt') as real_file, open('pred.txt', 'w') as pred_file:
        real_file.readline()  # the first line is skipped
        for idx, smiles in enumerate(all_smiles):
            print(idx)
            if idx > 5000:
                break
            real_qed = float(real_file.readline())
            real.append(real_qed)
            new_mol = Chem.MolFromSmiles(smiles)
            try:
                val = QED.qed(new_mol)
            except Exception:
                # Molecules that cannot be scored are skipped.
                continue
            pred.append(val)
            pred_file.write(str(val) + "\n")
            qed_sum += val
            diff.append(abs(real_qed - val))
            qed_score_per_molecule.append(val)
            total += 1

    # Avoid ZeroDivisionError when nothing could be scored.
    mean_qed = qed_sum / total if total else 0.0
    return mean_qed, qed_score_per_molecule, diff, real, pred
def _playout(self, state, n):
    """Run a single playout from the root to the leaf, getting a value at
    the leaf and propagating it back through its parents.

    State is modified in-place, so a copy must be provided.

    Args:
        state: environment object. This method reads its ``_counter``,
            ``max_steps``, ``_valid_actions`` and ``_valid_actions_fp``
            attributes and advances it with ``state.step``.
        n: forwarded unchanged to ``node.select``.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the original file.
    node = self._root
    while state._counter < state.max_steps:
        if node.is_leaf():
            # Pair every valid action with the QED of the molecule it
            # parses to; the pairs are what node.expand receives.
            # The number of pairs follows _valid_actions_fp, while the
            # actions are read from _valid_actions.
            action_probs = [(
                state._valid_actions[i],
                QED.qed(Chem.MolFromSmiles(state._valid_actions[i])),
            ) for i in range(len(state._valid_actions_fp))]
            # Check for end of game.
            # print(state._counter, state.max_steps)
            node.expand(action_probs)
        # Greedily select next move.
        action, node = node.select(self._c_puct, n)
        state.step(action)
        # self.update_with_move(action, node._fp)
        if state._counter == state.max_steps:
            # qv=QED.qed(Chem.MolFromSmiles(action))
            # At the last step the node's value is set to its stored _P.
            node._Q = node._P
            print("###")
    # Propagate the value of the node reached last back up the tree.
    node.update_recursive(node._Q)
def __getitem__(self, idx):
    """Build the tensors for sample ``idx``.

    The SMILES item is masked with ``self.random_masking``, wrapped in
    start/end (input) and pad (label) indices, cut to ``self.seq_len`` and
    zero-padded. The QED value and the zero-padded adjacency matrix are
    computed from ``self.adj_dataset[idx]``.

    Returns:
        Dict of ``torch.tensor`` values with keys ``smiles_bert_input``,
        ``smiles_bert_label``, ``smiles_bert_adj_mask``,
        ``smiles_bert_adjmat`` and ``smiles_bert_value``.
    """
    seq_len = self.seq_len
    vocab = self.vocab
    masked_tokens, masked_labels, adj_mask = self.random_masking(
        self.smiles_dataset[idx])

    tokens = [vocab.start_index] + masked_tokens + [vocab.end_index]
    labels = [vocab.pad_index] + masked_labels + [vocab.pad_index]
    adj_mask = [0] + adj_mask + [0]

    # give info to start token
    if self.mat_pos == 'start':
        adj_mask = [1] + [0] * (len(adj_mask) - 1)

    smiles_bert_input = tokens[:seq_len]
    smiles_bert_label = labels[:seq_len]
    smiles_bert_adj_mask = adj_mask[:seq_len]

    # The pad length is derived from the input sequence and applied to all
    # three sequences.
    padding = [0] * (seq_len - len(smiles_bert_input))
    for sequence in (smiles_bert_input, smiles_bert_label, smiles_bert_adj_mask):
        sequence.extend(padding)

    mol = Chem.MolFromSmiles(self.adj_dataset[idx])
    smiles_bert_value = QED.qed(mol)
    smiles_bert_adjmat = self.zero_padding(GetAdjacencyMatrix(mol),
                                           (seq_len, seq_len))

    output = {
        "smiles_bert_input": smiles_bert_input,
        "smiles_bert_label": smiles_bert_label,
        "smiles_bert_adj_mask": smiles_bert_adj_mask,
        "smiles_bert_adjmat": smiles_bert_adjmat,
        "smiles_bert_value": smiles_bert_value,
    }
    tensors = {}
    for key, value in output.items():
        tensors[key] = torch.tensor(value)
    return tensors
def _reward(self):
    """Calculates the reward of the current state.

    Two non-positive terms are computed: the negated absolute distance of
    the SA score from ``FLAGS.target_sas`` and of the QED value from
    ``FLAGS.target_qed``. With ``FLAGS.use_multiply`` they are multiplied
    (the product is negated when both are negative), otherwise they are
    added. The result is multiplied by
    ``FLAGS.gamma ** (self.max_steps - self._counter)``.

    Returns:
        Float. 0.0 when there is no current state or it cannot be parsed as
        a molecule.
    """
    if self._state is None:
        return 0.0
    mol = Chem.MolFromSmiles(self._state)
    if mol is None:
        return 0.0

    qed_value = QED.qed(mol)
    sas = SA_Score.sascorer.calculateScore(mol)
    sas_term = -abs(sas - FLAGS.target_sas)
    qed_term = -abs(qed_value - FLAGS.target_qed)

    if not FLAGS.use_multiply:
        reward = (sas_term + qed_term)
    elif sas_term < 0 and qed_term < 0:
        reward = -sas_term * qed_term
    else:
        reward = sas_term * qed_term

    return reward * FLAGS.gamma**(self.max_steps - self._counter)
def _rdkit_eval(entry: dict) -> dict:
    """Add RDKit-derived properties to ``entry`` and return it.

    The molecule is parsed from ``entry['smiles']``; the keys ``'logP'``,
    ``'QED'`` and ``'SA_score'`` are then set on the same dictionary, in
    that order.
    """
    molecule = Chem.MolFromSmiles(entry['smiles'])
    # Assign one key at a time so that an error in a later property leaves
    # the earlier ones in place.
    property_functions = (
        ('logP', Crippen.MolLogP),
        ('QED', QED.qed),
        ('SA_score', calculateScore),
    )
    for key, compute in property_functions:
        entry[key] = compute(molecule)
    return entry
def quantitative_estimation_druglikeness_scores(mols, norm=False):
    """Return the QED of each molecule as a NumPy array.

    ``None`` molecules, and molecules for which
    ``MolecularMetrics._avoid_sanitization_error`` yields ``None``, get a
    score of 0.

    Args:
        mols: iterable of molecules; entries may be ``None``.
        norm: accepted for interface compatibility; not used here.
    """
    scores = []
    for mol in mols:
        raw_score = None
        if mol is not None:
            raw_score = MolecularMetrics._avoid_sanitization_error(
                lambda: QED.qed(mol))
        scores.append(0 if raw_score is None else raw_score)
    return np.array(scores)
def qed(s):
    """Return the QED score of a SMILES string, or 0.0 on failure.

    Args:
        s: SMILES string, or ``None``.

    Returns:
        The value of ``QED.qed`` for the parsed molecule; 0.0 if ``s`` is
        ``None``, cannot be parsed, or scoring raises.
    """
    if s is None:
        return 0.0
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        return 0.0
    try:
        return QED.qed(mol)
    except Exception:
        # Best effort: treat molecules that cannot be scored as 0.0, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        return 0.0
def score(self, smiles):
    """Return ``5 * QED - SA score`` for the molecule parsed from ``smiles``."""
    molecule = Chem.MolFromSmiles(smiles)
    drug_likeness = QED.qed(molecule)
    synthetic_accessibility = sascorer.calculateScore(molecule)
    weighted_qed = 5 * drug_likeness
    return weighted_qed - synthetic_accessibility
def transform_text_to_qed(text_line):
    """Return the largest QED among the '.'-separated molecules of a line.

    Each part of ``text_line`` is turned into a molecule with
    ``rdkit_general_ops.get_molecule(..., kekulize=False)`` and scored.
    """
    # Build every molecule first, then score them.
    molecules = []
    for mol_str in text_line.split('.'):
        molecules.append(rdkit_general_ops.get_molecule(mol_str, kekulize=False))

    qed_scores = []
    for molecule in molecules:
        qed_scores.append(QED.qed(molecule))

    # May have many products so take max (given this is what we are
    # optimising for in the optimisation part). Expect this to be less of an
    # issue in practice as USPTO mostly details single product reactions.
    # It may be interesting to look at using the Molecular Transformer
    # prediction on these reactions rather than this ground truth, and at
    # other ways of combining multiple products, e.g. the mean.
    return np.max(qed_scores)
def properties_violin(filepaths, labels, pred_type):
    """Collect per-molecule properties from CSV files into a long DataFrame.

    Every row of every file contributes up to three records of the form
    ``[label, property name, value]``:

    * ``'IC50 for KOR'`` from column 1, only when ``pred_type == 'pIC50'``;
    * ``'SA score'``, read from column 2 for every file but the first and
      computed with ``SAscore(smiles2mol(...))`` for the first file;
    * ``'QED'``, computed from the SMILES in column 0.

    Rows whose SMILES cannot be processed are reported on stdout and
    contribute no further records.

    Args:
        filepaths: paths of the CSV files to read.
        labels: one label per file, used in the ``'Sets'`` column.
        pred_type: the string ``'pIC50'`` enables the ``'IC50 for KOR'``
            records.

    Returns:
        ``pandas.DataFrame`` with columns ``'Sets'``, ``'Property'`` and
        ``'Value'``.
    """
    # NOTE(review): the nesting of the two `if` statements below was
    # reconstructed from a whitespace-collapsed source; confirm it.
    properties = []
    for i, fname in enumerate(filepaths):
        # newline='' is what the csv module recommends for its input files.
        with open(fname, 'r', newline='') as f:
            for row in csv.reader(f):
                if pred_type == 'pIC50':
                    properties.append(
                        [labels[i], 'IC50 for KOR', float(row[1])])
                if i != 0:
                    properties.append([labels[i], 'SA score', float(row[2])])
                    try:
                        q = QED.qed(Chem.MolFromSmiles(row[0]))
                        properties.append([labels[i], 'QED', q])
                    except Exception:
                        # Skip the row, but let KeyboardInterrupt and
                        # SystemExit through.
                        print("Non-Canonical SMILES: " + row[0])
                else:
                    try:
                        prediction_sas = SAscore(smiles2mol(row[0]))
                        properties.append(
                            [labels[i], 'SA score', float(prediction_sas[0])])
                        q = QED.qed(Chem.MolFromSmiles(row[0]))
                        properties.append([labels[i], 'QED', q])
                    except Exception:
                        print("Non-Canonical SMILES: " + row[0])
    return pd.DataFrame(properties, columns=['Sets', 'Property', 'Value'])
def testRegression(self):
    """Long-running check of QED against the regression data set.

    Skipped unless the module-level ``doLong`` flag is set.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    message_template = 'QED not equal to expected in line {}'
    for record in readTestData(dataRegression):
        computed = QED.qed(record.mol)
        self.assertAlmostEqual(computed, record.expected,
                               msg=message_template.format(record.lineNo))
def QED_oracle(smiles):
    """Return a tensor with the QED of each SMILES in ``smiles``.

    Takes a list of SMILES and returns a 1-D tensor of the same length.
    Entries whose SMILES cannot be parsed keep the initial value 0.
    """
    scores = torch.zeros(len(smiles))
    for position, smi in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smi)
        if molecule is None:
            continue
        scores[position] = QED.qed(molecule)
    return scores
def _reward(self):
    """Return the discounted QED of the current state.

    The QED value (0 if ``QED.qed`` raises ``ValueError``) is multiplied by
    ``FLAGS.gamma ** (self.max_steps - self._counter)``. States that cannot
    be parsed as a molecule yield 0.0.
    """
    parsed = Chem.MolFromSmiles(self._state)
    if parsed is None:
        return 0.0
    try:
        score = QED.qed(parsed)
    except ValueError:
        score = 0
    remaining_steps = self.max_steps - self._counter
    return score * FLAGS.gamma**remaining_steps
def updateTestData():
    """Update the test data. This should only be done if the method changes!

    For each of the two data files, the records are read, their expected
    value is recomputed with ``QED.qed`` and the file is rewritten as a
    comment header followed by one ``<smiles>,<expected>`` line per record.
    """
    for filename in (dataNCI200, dataRegression):
        records = list(readTestData(filename))
        # Compute every line before the file is reopened for writing, so a
        # failure in QED.qed cannot leave a truncated data file behind.
        lines = ['{0.smiles},{1}'.format(d, QED.qed(d.mol)) for d in records]
        with open(filename, 'w') as f:
            print('# Test data for QED descriptor', file=f)
            for line in lines:
                print(line, file=f)
def _reward(self):
    """Reward of a state.

    Intermediate states (``self._counter < self.max_steps``) are rewarded
    with the sum of the normalized SA score and the QED score, discounted
    by ``self.discount_factor``. Terminal states run an external conversion
    and docking step and are rewarded with the negated docking score.

    Side effects in the terminal case: writes ``ligand.smi``, runs
    ``obabel`` and ``qvina02`` through the shell, and appends a line to
    ``./optimized_result_total.txt``.

    Returns:
      intermediate reward: SA score, QED score
      final reward: Docking score (a negative value of the binding energy)
      0.0 if the state cannot be parsed as a molecule or the docking log
      cannot be read.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the original file.
    molecule = Chem.MolFromSmiles(self._state)
    if molecule is None:
        return 0.0
    # calculate SA and QED score
    sa = calculateScore(molecule)
    sa_norm = round((10 - sa) / 9, 2)  # normalize the SA score
    qed = round(QED.qed(molecule), 2)
    print("SA score and QED: {}, {} : {}".format(sa_norm, qed, self._state))
    if self._counter < self.max_steps:  # intermediate state
        # NOTE(review): the branch condition uses self._counter while the
        # discount exponent uses self.num_steps_taken -- confirm both track
        # the same step count.
        return round(
            (sa_norm + qed) *
            self.discount_factor**(self.max_steps - self.num_steps_taken), 2)
    if self._counter >= self.max_steps:  # terminal state
        # create SMILES file
        with open('ligand.smi', 'w') as f:
            f.write(self._state)
        # convert SMILES > PDBQT
        # --gen3d: the option for generating 3D coordinate
        # -h: protonation
        cvt_cmd = "obabel ligand.smi -O ligand.pdbqt --gen3D -h > cvt_log.txt"
        # NOTE(review): the exit status of os.system is ignored here and
        # below, so a failed conversion or docking run is only noticed when
        # the log cannot be parsed.
        os.system(cvt_cmd)
        # docking
        docking_cmd = "qvina02 --config config.txt --num_modes=1 > log_docking.txt"
        os.system(docking_cmd)
        # parsing docking score from log file
        try:
            data = pd.read_csv('log_docking.txt', sep="\t", header=None)
        # NOTE(review): this bare except also swallows KeyboardInterrupt and
        # SystemExit; consider narrowing it.
        except:
            return 0.0
        # The score is the second whitespace-separated field of the
        # second-to-last line of the log.
        docking_score = round(float(data.values[-2][0].split()[1]), 2)
        print("binding energy value: " + str(round(docking_score, 2)) + '\t' +
              self._state)
        # record a optimized result with the SMILES, docking score, SA score,
        # and QED score.
        with open('./optimized_result_total.txt', 'a') as f2:
            f2.write(self._state + '\t' + str(docking_score) + '\t' +
                     str(sa_norm) + '\t' + str(qed) + '\n')
        # we use the negative of the docking score because the lower docking
        # score the better.
        return round(-docking_score, 2)
def get_mol_props(smiles):
    """Get the molecular properties of a single molecule.

    Returns:
        ``(molecular weight, logP, QED)`` for the molecule parsed from
        ``smiles``.

    Raises:
        AssertionError: if the SMILES cannot be parsed or a property is
            ``None``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    assert molecule is not None
    props = (
        Descriptors.MolWt(molecule),
        Descriptors.MolLogP(molecule),
        QED.qed(molecule),
    )
    assert all(value is not None for value in props)
    return props
def __call__(self, smiles_list):
    """Return the QED of every SMILES as a float32 array.

    SMILES that cannot be parsed are scored 0.
    """
    # map() is lazy, so each SMILES is still parsed right before scoring.
    parsed = map(Chem.MolFromSmiles, smiles_list)
    scores = [QED.qed(mol) if mol is not None else 0 for mol in parsed]
    return np.float32(scores)
def get_property(smi):
    """Return ``[exact molecular weight, logP, QED]`` for a SMILES string.

    Returns:
        The three-element list, or the string ``'invalid'`` if the SMILES
        cannot be parsed or any property calculation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # Local renamed so it no longer shadows the builtin `property`.
        props = [Descriptors.ExactMolWt(mol), Descriptors.MolLogP(mol),
                 QED.qed(mol)]
    except Exception:
        # Best effort: report unusable input as 'invalid', but do not
        # swallow KeyboardInterrupt/SystemExit as a bare except would.
        props = 'invalid'
    return props
def _reward(self):
    """Reward of a state.

    Returns:
      Float. QED of the current state multiplied by
      ``self.discount_factor ** (self.max_steps - self.num_steps_taken)``,
      or 0.0 if the state cannot be parsed as a molecule.
    """
    parsed = Chem.MolFromSmiles(self._state)
    if parsed is None:
        return 0.0
    drug_likeness = QED.qed(parsed)
    steps_left = self.max_steps - self.num_steps_taken
    return drug_likeness * self.discount_factor ** steps_left
def get_properties(smiles, target_molecule='C1CCC2CCCCC2C1'):
    """Return the Tanimoto similarity to a target molecule and the QED.

    Both molecules are fingerprinted with
    ``AllChem.GetMorganFingerprintAsBitVect`` (radius 2, 2048 bits).

    Args:
        smiles: SMILES string of the molecule to evaluate.
        target_molecule: SMILES string of the reference molecule.

    Returns:
        ``(similarity, qed)``, or ``(0.0, 0.0)`` if ``smiles`` cannot be
        parsed.
    """

    def morgan_bits(molecule):
        return AllChem.GetMorganFingerprintAsBitVect(
            molecule, radius=2, nBits=2048)

    reference_fp = morgan_bits(Chem.MolFromSmiles(target_molecule))
    candidate = Chem.MolFromSmiles(smiles)
    if candidate is None:
        return 0.0, 0.0
    similarity = DataStructs.TanimotoSimilarity(reference_fp,
                                                morgan_bits(candidate))
    return similarity, QED.qed(candidate)
def testRegression(self):
    """Check QED against every record of the regression data set.

    This is a long test; it raises ``unittest.SkipTest`` unless the
    module-level ``doLong`` flag is true.
    """
    if not doLong:
        raise unittest.SkipTest('long test')

    def failure_message(record):
        return 'QED not equal to expected in line {}'.format(record.lineNo)

    for record in readTestData(dataRegression):
        self.assertAlmostEqual(QED.qed(record.mol), record.expected,
                               msg=failure_message(record))