def get_neighbors(query_fp, fp_list, cutoff=0.8, max_nbrs=5):
    """Return the entries of ``fp_list`` most similar to ``query_fp``.

    Parameters
    ----------
    query_fp
        Fingerprint passed as the query to ``BulkTanimotoSimilarity``.
    fp_list : list of list
        Records whose element at index 2 is the fingerprint to compare.
        Each record must be a ``list`` because the similarity is
        concatenated onto it.
    cutoff : float
        Minimum similarity (inclusive) for a record to be kept.
    max_nbrs : int
        Maximum number of records returned.

    Returns
    -------
    list of list
        Copies of the matching records with the similarity appended,
        ordered by the element at index 3 in descending order (that element
        is the appended similarity when records hold exactly three items).
    """
    similarities = BulkTanimotoSimilarity(
        query_fp, [record[2] for record in fp_list]
    )

    hits = []
    for record, score in zip(fp_list, similarities):
        if score >= cutoff:
            # List concatenation builds a new record; the input is untouched.
            hits.append(record + [score])

    ranked = sorted(hits, key=itemgetter(3), reverse=True)
    return ranked[:max_nbrs]
def Get_compare(smiles_dict):
    """
    Group consecutive monomers whose fingerprints are identical.

    Only neighbouring entries (in dictionary order) are compared, using the
    Tanimoto similarity of their RDKit fingerprints. A run of neighbours
    with similarity exactly 1 ends up in the same group.

    Parameters
    ----------
    smiles_dict : dict
        Maps monomer names to the structures to compare. The values are
        passed directly to ``Chem.RDKFingerprint``, so they are presumably
        RDKit molecules -- confirm against the caller.

    Returns
    -------
    reps : list
        List of groups; each group is a list of monomer names that are
        pairwise-adjacent duplicates of one another.
    """
    reps = []
    mols = list(smiles_dict)

    for current, following in zip(mols, mols[1:]):
        try:
            similarity = BulkTanimotoSimilarity(
                Chem.RDKFingerprint(smiles_dict[current]),
                [Chem.RDKFingerprint(smiles_dict[following])],
            )[0]
        except Exception:
            # Best effort: a structure that cannot be fingerprinted is
            # treated as dissimilar. Unlike a bare ``except`` this does not
            # swallow KeyboardInterrupt / SystemExit.
            similarity = 0

        if similarity != 1:
            continue

        if any(current in rep for rep in reps):
            # ``current`` can only have been added as the ``following`` of
            # the previous pair, so it lives in the most recent group.
            reps[-1].append(following)
        else:
            reps.append([current, following])

    return reps
def _find_similarity(self, smiles):
    """Return the keys of all clusters that contain a close match.

    A cluster matches when the highest Tanimoto similarity between
    ``smiles.fingerprint`` and the fingerprints of its members is strictly
    greater than ``self.threshold``. Members whose ``fingerprint`` is
    ``None`` are ignored.

    Parameters
    ----------
    smiles
        Object exposing a ``fingerprint`` attribute used as the query.

    Returns
    -------
    list or None
        Keys of ``self.clusters`` that matched, in iteration order, or
        ``None`` when there are no clusters or nothing matched.
    """
    if not self.clusters:
        return None

    matches = []
    for key, members in self.clusters.items():
        member_fps = [
            member.fingerprint
            for member in members
            if member.fingerprint is not None
        ]
        if not member_fps:
            # Nothing to compare against; calling max() on the empty
            # similarity list would raise ValueError.
            continue

        similarities = BulkTanimotoSimilarity(smiles.fingerprint, member_fps)
        if max(similarities) > self.threshold:
            matches.append(key)

    return matches or None
def find_not_aa_monomers(js):
    """
    Find the non-amino-acid monomers of the analysed molecule.

    Every monomer in the graph is compared with a small set of reference
    structures; a monomer whose RDKit fingerprint has Tanimoto similarity
    exactly 1 to a reference is recorded and renamed after that reference.

    Parameters
    ----------
    js : dict
        Opened rBAN peptideGraph.json.

    Returns
    -------
    not_aa_monomer : list
        Indices of the non-amino-acid monomers.
    js : dict
        The same object that was passed in; the names of the matched
        monomers are modified in place.
    """
    compare_dict = {
        'iva': 'CC(C)CC(=O)O',
        'hiv': 'CC(C)C(C(=O)O)O',
        # 'dhb': 'C1=CC(=C(C(=C1)O)O)C(=O)O', #dOH-Bz
        'pip': 'C1CCNC(C1)C(=O)O'  # Hpr
    }
    # The reference fingerprints do not depend on the monomer being tested,
    # so build them once instead of once per monomer.
    reference_fps = {
        name: Chem.RDKFingerprint(Chem.MolFromSmiles(smi))
        for name, smi in compare_dict.items()
    }

    not_aa_monomer = []
    for node in js['monomericGraph']['monomericGraph']['monomers']:
        target = Chem.MolFromSmiles(node['monomer']['monomer']['smiles'])
        if target is None:
            # rBAN output can contain structures that cannot be parsed.
            continue
        target_fp = Chem.RDKFingerprint(target)

        for mon, reference_fp in reference_fps.items():
            if BulkTanimotoSimilarity(target_fp, [reference_fp])[0] != 1:
                continue
            # iva == hiv: hiv is the most common case, but only an iva hmm
            # (homologous structure) is available, so report it as iva.
            assigned = 'iva' if mon == 'hiv' else mon
            not_aa_monomer.append(node['monomer']['index'])
            node['monomer']['monomer']['monomer'] = assigned

    return not_aa_monomer, js
def most_simi(train_test):
    """Compute nearest-neighbour Tanimoto similarities between data subsets.

    Parameters
    ----------
    train_test : tuple
        Pair ``(train_names, test_names)``; each element is handed to
        ``load_smiles``, which returns fingerprints, properties and labels.

    Returns
    -------
    dict
        Maps a description such as ``'test_actives vs train_decoys'`` to a
        list holding, for every fingerprint of the first subset, its
        highest similarity to any fingerprint of the second subset.
        Actives are the entries labelled 1 and decoys those labelled 0.
    """
    train_names, test_names = train_test
    train_fps, _train_props, train_labels = load_smiles(train_names)
    test_fps, _test_props, test_labels = load_smiles(test_names)

    def select(fps, labels, wanted):
        # Keep the fingerprints whose label equals ``wanted``.
        return [fp for fp, label in zip(fps, labels) if label == wanted]

    train_actives = select(train_fps, train_labels, 1)
    train_decoys = select(train_fps, train_labels, 0)
    test_actives = select(test_fps, test_labels, 1)
    test_decoys = select(test_fps, test_labels, 0)

    comparisons = {
        'test_actives vs test_decoys': (test_actives, test_decoys),
        'test_actives vs train_actives': (test_actives, train_actives),
        'test_decoys vs train_actives': (test_decoys, train_actives),
        'test_actives vs train_decoys': (test_actives, train_decoys),
        'test_decoys vs train_decoys': (test_decoys, train_decoys),
    }

    return {
        label: [
            max(BulkTanimotoSimilarity(query, references))
            for query in queries
        ]
        for label, (queries, references) in comparisons.items()
    }
def _calculate(self, *, predictions: Sequence[str], labels: Sequence[str], **kwargs) -> Tuple[Real, int]:
    """Average the best Tanimoto similarity of each prediction to the labels.

    Labels are converted with invalid entries raising; the same pipeline is
    then switched to skipping invalid entries before the predictions are
    converted. Fingerprints are Morgan bit vectors built with
    ``self._radius`` and ``self._n_bits``.

    Returns
    -------
    tuple
        The mean over predictions of the highest similarity to any label
        fingerprint, and the constant ``1``.
    """
    parser = MoleculeTransformer(invalid='raise')
    fingerprinter = MorganFingerprint(
        radius=self._radius,
        n_bits=self._n_bits,
        return_type='bitvect_list',
    )
    pipeline = make_pipeline(parser, fingerprinter)

    reference_fps = pipeline.fit_transform(labels)

    # From here on invalid inputs are skipped rather than raising.
    pipeline[0].invalid = 'skip'
    predicted_fps = pipeline.fit_transform(predictions)

    best_scores: List[float] = []
    for predicted_fp in predicted_fps:
        scores = BulkTanimotoSimilarity(predicted_fp, reference_fps)
        best_scores.append(max(scores))

    return np.mean(best_scores), 1
def fit_transform(self, molecules: Iterable[Mol], y_ignored=None) -> np.array:
    """Build the pairwise Tanimoto similarity matrix of ``molecules``.

    The fitted fingerprint transformer is stored on ``self.ecfp_`` and
    ``self.n_features_in_`` is set to 1.

    Parameters
    ----------
    molecules : iterable of rdkit.Chem.Mol
        RDKit molecules.
    y_ignored : None
        Accepted for API compatibility and never used.

    Returns
    -------
    numpy.ndarray, shape = (len(molecules), len(molecules))
        Symmetric matrix of dtype ``self.dtype`` with ones on the diagonal.
    """
    fingerprinter = MorganFingerprint(
        radius=self.radius, n_bits=self.n_bits, return_type='bitvect_list'
    )
    fps = fingerprinter.fit_transform(molecules)

    # noinspection PyAttributeOutsideInit
    self.ecfp_ = fingerprinter
    # noinspection PyAttributeOutsideInit
    self.n_features_in_ = 1

    size = len(fps)
    # Starting from ones leaves the diagonal (self-similarity) in place.
    matrix = np.ones((size, size), dtype=self.dtype)

    for row, fingerprint in enumerate(fps[1:], start=1):
        # Compare only against earlier fingerprints and mirror the result
        # into both triangles of the matrix.
        earlier = BulkTanimotoSimilarity(fingerprint, fps[:row])
        matrix[row, :row] = earlier
        matrix[:row, row] = earlier

    return matrix
BulkTverskySimilarity, ) from rdkit.ML.Cluster import Butina DEBUG = True DESCRIPTORS = { 'path': RDKFingerprint, 'ecfp4': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2), 'zinc': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512), 'apair': lambda mol: GetAtomPairFingerprint(mol) } COEFFICIENTS = { 'tanimoto': lambda x, ys, *args: BulkTanimotoSimilarity(x, ys), 'dice': lambda x, ys, *args: BulkDiceSimilarity(x, ys), 'tversky': lambda x, ys, a, b, *args: BulkTverskySimilarity(x, ys, a, b), } CLUSTERING_APPROACHES = [ 'butina', 'cassidy', ] def mol_parse(it, parser=MolFromSmiles): for num, line in enumerate(it, start=1): cid = str(num) try: tokens = str(line).split()