def pipe_sim_filter(stream, query, cutoff=0.8, summary=None, comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose similarity to `query` is at least `cutoff`.

    `query` is a SMILES string.  Records without a "mol" field are skipped.
    When a record carries an "FP_b64" field (e.g. pre-calculated), that
    base64-encoded, pickled fingerprint is used; otherwise the fingerprint
    is generated on-the-fly from the record's "mol" field (much slower).

    If `summary` is given, `summary[comp_id]` is updated with the running
    number of passing records each time a record is yielded.  When the query
    SMILES cannot be parsed an error is printed and nothing is yielded.
    """
    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(comp_id))
        return None

    query_fp = FingerprintMols.FingerprintMol(query_mol)
    n_passed = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            rec_fp = FingerprintMols.FingerprintMol(rec["mol"])
        else:
            # NOTE(review): unpickling is only safe if the stream comes from
            # a trusted source.
            rec_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        similarity = DataStructs.FingerprintSimilarity(query_fp, rec_fp)
        if similarity >= cutoff:
            n_passed += 1
            if summary is not None:
                summary[comp_id] = n_passed
            yield rec
def ScreenInDb(details, mol):
    """Screen a database for molecules similar to `mol`.

    The probe fingerprint is built from `mol` using `details.__dict__` as
    keyword arguments.  For the Tanimoto, Dice and Cosine metrics the
    similarity is evaluated by the database through the `rd_tanimoto`,
    `rd_dice` and `rd_cosine` SQL functions; for any other metric the
    fingerprints are fetched and screened with `ScreenFingerprints`.

    Returns the rows fetched from the cursor (or the `ScreenFingerprints`
    result).  An empty list is returned when the probe cannot be
    fingerprinted or when no database connection could be established.
    """
    try:
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    if details.metric == DataStructs.TanimotoSimilarity:
        func = 'rd_tanimoto'
    elif details.metric == DataStructs.DiceSimilarity:
        func = 'rd_dice'
    elif details.metric == DataStructs.CosineSimilarity:
        func = 'rd_cosine'
    else:
        # No in-database implementation for this metric: screen client side.
        return ScreenFingerprints(details, data=GetFingerprints(details), mol=mol)

    conn = _ConnectToDatabase(details)
    if conn is None:
        # _ConnectToDatabase returns None on failure; bail out here rather
        # than failing with an AttributeError on conn.GetCursor() below.
        FingerprintMols.error('Error: no database connection available for screening.\n')
        return []

    pkl = probeFp.ToBitString()
    extraFields = f"{func}({DbModule.placeHolder},{details.fpColName}) as tani"
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = f"select * from ({cmd}) tmp where tani>{details.screenThresh}"
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += f" limit {details.topN}"

    curs = conn.GetCursor()
    curs.execute(cmd, (pkl, ))
    return curs.fetchall()
def calc_similarity(compound_one, compound_two):
    """Return the fingerprint similarity of two compounds, memoised in `joint_sim`.

    Compound properties are read from `cg_props` (keyed by the lower-cased
    compound name; the "type" and "smiles" entries are used).  Compounds of
    different "type" get a similarity of 0.0 without being fingerprinted.
    The result is stored symmetrically in `joint_sim` so either argument
    order hits the cache on later calls.
    """
    if compound_one in joint_sim and compound_two in joint_sim[compound_one]:
        return joint_sim[compound_one][compound_two]

    joint_sim.setdefault(compound_one, dict())
    joint_sim.setdefault(compound_two, dict())

    props_one = cg_props[compound_one.lower()]
    props_two = cg_props[compound_two.lower()]

    # The original compared compound_one's type with itself, so this branch
    # could never trigger; compare the two compounds as intended.
    if props_one["type"] != props_two["type"]:
        similarity = 0.0
    else:
        from rdkit import DataStructs
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit import Chem

        mol_one = Chem.MolFromSmiles(str(props_one["smiles"]))
        mol_two = Chem.MolFromSmiles(str(props_two["smiles"]))
        fp_1 = FingerprintMols.FingerprintMol(mol_one)
        fp_2 = FingerprintMols.FingerprintMol(mol_two)
        similarity = DataStructs.FingerprintSimilarity(fp_1, fp_2)

    joint_sim[compound_one][compound_two] = similarity
    joint_sim[compound_two][compound_one] = similarity
    return similarity
def __init__(self, moli, molj):
    """
    Initialization function

    Parameters
    ----------

    moli : RDKit molecule object
        the first molecule used to perform the fingerprint calculation
    molj : RDKit molecule object
        the second molecule used to perform the fingerprint calculation

    Notes
    -----
    ``options`` is not a parameter of this method; the name is resolved
    from an enclosing scope.  Unless ``options.verbose`` equals
    'pedantic', the RDKit logger is restricted to CRITICAL messages.
    The fingerprints of both molecules and their similarity are stored
    on the instance as ``fps_moli``, ``fps_molj`` and ``fps_tan``.
    """
    # NOTE(review): confirm where `options` is expected to be defined.

    # Set logging level and format
    logging.basicConfig(format='%(levelname)s:\t%(message)s', level=logging.INFO)

    # Local pointers to the passed molecules
    self.moli, self.molj = moli, molj

    if options.verbose != 'pedantic':
        RDLogger.logger().setLevel(RDLogger.CRITICAL)

    self.fps_moli = FingerprintMols.FingerprintMol(self.moli)
    self.fps_molj = FingerprintMols.FingerprintMol(self.molj)
    self.fps_tan = DataStructs.FingerprintSimilarity(self.fps_moli, self.fps_molj)
def test__init__(self):
    """Check fingerprints built with default and with explicit keyword settings."""
    from rdkit.Chem.Fingerprints import FingerprintMols

    mols = [Chem.MolFromSmiles(smi) for smi in ('CCOC', 'CCO', 'COC')]

    # Default fingerprinter settings.
    default_fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    self.assertAlmostEqual(FingerprintSimilarity(default_fps[0], default_fps[1]), 0.6,
                           places=2)

    # A different fingerprint size per molecule: 16, 32, 48.
    fpArgs = FingerprinterDetails().__dict__
    sized_fps = []
    for position, mol in enumerate(mols):
        fpArgs['fpSize'] = 16 * (position + 1)
        sized_fps.append(FingerprintMols.FingerprintMol(mol, **fpArgs))
    self.assertAlmostEqual(FingerprintSimilarity(sized_fps[0], sized_fps[1]), 0.555,
                           places=2)
    self.assertAlmostEqual(FingerprintSimilarity(sized_fps[1], sized_fps[0]), 0.555,
                           places=2)

    # Target-density folding.
    fpArgs['fpSize'] = 1024
    fpArgs['tgtDensity'] = 0.8
    folded = FingerprintMols.FingerprintMol(mols[0], **fpArgs)
    self.assertEqual(len(folded), 64)
    folded = DataStructs.FoldToTargetDensity(folded, density=0.1, minLength=2)
    self.assertEqual(len(folded), 4)
def _mols_similarity_base_r0(ms_smiles_mid, ms_smiles_base):
    """Print the fingerprint similarity of every base molecule to every target.

    Input: dictionary type required such as {nick name: smiles code, ...}

    `ms_smiles_mid` holds the target molecules and `ms_smiles_base` the base
    molecules.  For each (base, target) pair the two nick names and their
    `DataStructs.FingerprintSimilarity` value are printed.  Nothing is
    returned.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    def named_fingerprints(smiles_by_name):
        # Keep each nick name next to its fingerprint so no positional
        # indexing into dict.keys() is needed (that fails on Python 3).
        mols = [(name, Chem.MolFromSmiles(smi)) for name, smi in smiles_by_name.items()]
        return [(name, FingerprintMols.FingerprintMol(mol)) for name, mol in mols]

    # processing for mid
    # (the original concatenated a str with dict.keys(), which raises TypeError)
    print("Target: {}".format(list(ms_smiles_mid)))
    fps_mid = named_fingerprints(ms_smiles_mid)

    # processing for base
    print("Base: {}".format(list(ms_smiles_base)))
    fps_base = named_fingerprints(ms_smiles_base)

    for base_name, f_b in fps_base:
        for mid_name, f_d in fps_mid:
            print("Base:{0}, Target:{1}".format(base_name, mid_name))
            print(DataStructs.FingerprintSimilarity(f_b, f_d))
def molecular_similarity(best, parent_candidates, all=False):
    """
    returns a similarity score (0-1) of best with the
    closest molecular relative in parent_candidates

    Parameters
    ----------
    best : object
        Chromosome object, the current
        mutated candidate (its ``Mol`` attribute is fingerprinted)
    parent_candidates : array
        parent pool of molecules to compare with best.
        These are represented by SMILES
    all : boolean, optional, default = False
        default behavior is false and the tanimoto
        similarity score is returned. If True
        tanimoto, dice, cosine, sokal, kulczynski,
        and mcconnaughey similarities are returned

    Returns
    ----------
    similarity_score : float
    similarity_index : int
        if all=False the best tanimoto similarity score
        as well as the index of the closest molecular
        relative are returned
        if all=True a list of best scores and a list of indices
        of the closest molecular relative are returned
        (one entry per metric, in the order listed above)

    Raises
    ------
    ValueError
        if parent_candidates is empty (there is no maximum to take)
    """
    # Fingerprint every molecule exactly once; the original rebuilt both
    # fingerprints for every candidate and again for every metric.
    best_fp = FingerprintMols.FingerprintMol(best.Mol)
    parent_fps = [
        FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smi))
        for smi in parent_candidates
    ]

    def closest(metric=None):
        """Return (best score, index of first best-scoring parent)."""
        if metric is None:
            scores = [DataStructs.FingerprintSimilarity(best_fp, fp) for fp in parent_fps]
        else:
            scores = [
                DataStructs.FingerprintSimilarity(best_fp, fp, metric=metric)
                for fp in parent_fps
            ]
        top = max(scores)
        return top, scores.index(top)

    if not all:
        return closest()

    metrics = [
        DataStructs.TanimotoSimilarity, DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity, DataStructs.SokalSimilarity,
        DataStructs.KulczynskiSimilarity, DataStructs.McConnaugheySimilarity
    ]
    scores = []
    indices = []
    for metric in metrics:
        top, index = closest(metric)
        scores.append(top)
        indices.append(index)
    return scores, indices
def tanimotoComparison(pred_prod_list, true_prod_list):
    """Return the fingerprint similarity between two SMILES strings.

    Both arguments are parsed with `Chem.MolFromSmiles`, fingerprinted with
    `FingerprintMols.FingerprintMol` and compared with
    `DataStructs.FingerprintSimilarity` using its default metric.
    """
    mols = [Chem.MolFromSmiles(smi) for smi in (pred_prod_list, true_prod_list)]
    pred_fp, answer_fp = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    return DataStructs.FingerprintSimilarity(pred_fp, answer_fp)
def build_reactions(perturbations_all_paths, mcs_neighbours):
    """Print the members most similar to the larger molecule of the first pair.

    `perturbations_all_paths` is a sequence of sequences of PDB file paths.
    Every path is loaded as an RDKit molecule and fingerprinted.  The query
    is whichever member of the first pair has the larger exact molecular
    weight (the first one on a tie).  Members are ranked by fingerprint
    similarity to the query, entries whose similarity value repeats an
    earlier one are dropped, and the top five molecules are printed.

    `mcs_neighbours` is not used by the code in this function.
    """

    def load_mol(pdb_path):
        # Context manager so the file handle is closed deterministically.
        with open(pdb_path, 'r') as handle:
            return rdmolfiles.MolFromPDBBlock(handle.read())

    perturbations_unnested = list(
        itertools.chain.from_iterable(perturbations_all_paths))

    # Parse each PDB once and reuse the molecule for its fingerprint.
    all_members = [load_mol(member) for member in perturbations_unnested]
    all_members_FPs = [FingerprintMols.FingerprintMol(mol) for mol in all_members]

    first_pair = perturbations_all_paths[0]
    member1 = load_mol(first_pair[0])
    member2 = load_mol(first_pair[1])
    size_member1 = rdMolDescriptors.CalcExactMolWt(member1)
    size_member2 = rdMolDescriptors.CalcExactMolWt(member2)
    query_mol = member1 if size_member1 >= size_member2 else member2
    query_member = FingerprintMols.FingerprintMol(query_mol)

    similarities = [
        AllChem.DataStructs.FingerprintSimilarity(query_member, target_fp)
        for target_fp in all_members_FPs
    ]
    similarities_to_query = dict(zip(all_members, similarities))
    ranked = sorted(similarities_to_query.items(),
                    key=lambda kv: kv[1],
                    reverse=True)

    # Keep only the first molecule for each distinct similarity value.
    seen_values = set()
    similar_hits = []
    for mol, value in ranked:
        if value not in seen_values:
            seen_values.add(value)
            similar_hits.append(mol)

    neighbours = similar_hits[:5]
    print(neighbours)
def cal_pairwise_tanimoto(pair):
    """Return one CSV line "i,j,similarity\\n" for a pair of indices.

    `pair` unpacks into two indices into the `smiles` sequence defined
    outside this function.  The two SMILES are fingerprinted and compared
    with `DataStructs.TanimotoSimilarity`.
    """
    first_idx, second_idx = pair
    first_mol = Chem.MolFromSmiles(smiles[first_idx])
    second_mol = Chem.MolFromSmiles(smiles[second_idx])
    score = DataStructs.TanimotoSimilarity(
        FingerprintMols.FingerprintMol(first_mol),
        FingerprintMols.FingerprintMol(second_mol),
    )
    return ",".join(str(field) for field in (first_idx, second_idx, score)) + "\n"
def GetFingerprints(details):
    """ returns an iterable sequence of fingerprints

    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

    The source is chosen from `details`: a database table when both
    `dbName` and `tableName` are set, otherwise a file of pickled
    (id, fingerprint) pairs when `inFileName` is set, otherwise None is
    returned.  When the input file cannot be opened the error is reported
    and an empty list is returned.
    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error(
                'Error: Problems establishing connection to database: %s|%s\n' %
                (details.dbName, details.tableName))
            traceback.print_exc()
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()
        if _dataSeq:
            suppl = _dataSeq(curs,
                             cmd,
                             depickle=not details.noPickle,
                             klass=DataStructs.ExplicitBitVect)
            _dataSeq._conn = conn
        else:
            # NOTE(review): `data` is not defined anywhere in this function;
            # this branch raises NameError as written - confirm intent.
            suppl = DbFpSupplier.ForwardDbFpSupplier(
                data, fpColName=details.fpColName)
    elif details.inFileName:
        # The original assigned `supple = []` (typo), so every successful
        # load then failed on the undefined name `suppl`.
        suppl = []
        try:
            inF = open(details.inFileName, 'r')
        except IOError:
            import traceback
            FingerprintMols.error('Error: Problems reading from file %s\n' %
                                  (details.inFileName))
            traceback.print_exc()
        else:
            try:
                while True:
                    try:
                        entry_id, fp = cPickle.load(inF)
                    except Exception:
                        # End of file (or an unreadable record) ends the scan.
                        break
                    fp._fieldsFromDb = [entry_id]
                    suppl.append(fp)
            finally:
                inF.close()
    else:
        suppl = None

    return suppl
def TakeInput(filepath, hmdb_filepath, OR_name, sim_threshold=0.85):
    """Match ligands against HMDB metabolites by fingerprint similarity.

    Reads the ligand table at `filepath` (columns 'Smiles' and 'Ligand') and
    the HMDB table at `hmdb_filepath` (columns 'SMILES' and 'NAME'), both as
    ISO-8859-1 CSV.  Every distinct (Smiles, Ligand) row is compared with
    every HMDB entry; pairs whose similarity is at least `sim_threshold`
    are written to "Final_test_set_<OR_name>.csv".  The matched ligands,
    de-duplicated by name, are written to
    "Shortlisted_Metabolites<OR_name>.csv".

    HMDB entries that cannot be fingerprinted are skipped with a "WARNING"
    line.  Nothing is returned.
    """
    # NOTE(review): the original assigned this call's result to a variable
    # that was overwritten on the next statement; the call is kept in case
    # it has side effects - confirm whether its result should be used.
    extractPositiveOnes(filepath)

    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = data_hmdb[['SMILES', 'NAME']].reset_index(drop=True)
    Cancer_clean_data = (positive_Cancer[['Smiles', 'Ligand']]
                         .drop_duplicates()
                         .reset_index(drop=True))

    # Fingerprint each HMDB entry once instead of once per ligand.
    hmdb_entries = []
    for hmdb_name, hmdb_smiles in zip(hmdb_data['NAME'], hmdb_data['SMILES']):
        try:
            hmdb_fp = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(hmdb_smiles))
        except Exception:
            print("WARNING")
            continue
        hmdb_entries.append((hmdb_name, hmdb_smiles, hmdb_fp))

    columns = [
        "Cancer_Molecule", "Cancer_SMILES", "HMDB_Molecule", "HMDB_SMILES",
        "TANIMOTO_Similarity_Value"
    ]
    rows = []
    ligand_rows = zip(Cancer_clean_data['Ligand'], Cancer_clean_data['Smiles'])
    for i, (ligand, ligand_smiles) in enumerate(ligand_rows):
        fps1 = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(ligand_smiles))
        for hmdb_name, hmdb_smiles, fps2 in hmdb_entries:
            sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
            if sim_val >= sim_threshold:  # threshold for similarity value
                rows.append([ligand, ligand_smiles, hmdb_name, hmdb_smiles, sim_val])
        print("Comparison Done for Ligand :" + str(i))

    df1 = pd.DataFrame(rows, columns=columns)
    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # The original looked up columns that df1 does not have
    # ("Cancer_clean_data_*") and zipped with an empty list, which can only
    # produce an empty table.
    # NOTE(review): the activation status is left blank here - confirm what
    # value is expected in that column.
    Shortlisted_Metabolites = pd.DataFrame({
        'Smiles': df1["Cancer_SMILES"],
        'Ligand': df1["Cancer_Molecule"],
        'Activation Status': None,
    })
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')

    # `OR_Name` in the original was a NameError; the parameter is `OR_name`.
    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name + ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name +
          ".csv has been successfully saved!")
def tanimoto_score(mol1, mol2):
    """Return the reciprocal of the fingerprint similarity of mol1 and mol2.

    The similarity comes from `DataStructs.FingerprintSimilarity` on the
    `FingerprintMols.FingerprintMol` fingerprints.  A tiny constant (1e-15)
    is added before inverting so a similarity of zero does not divide by
    zero; higher return values therefore mean *less* similar molecules.
    """
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in (mol1, mol2)]
    similarity = DataStructs.FingerprintSimilarity(*fingerprints)
    return 1.0 / (similarity + 1e-15)
def ScreenInDb(details, mol):
    """Screen a database for molecules similar to `mol`.

    The probe fingerprint is built from `mol` with `details.__dict__` as
    keyword arguments.  For the Tanimoto, Dice and Cosine metrics the
    similarity is computed by the database (`rd_tanimoto`, `rd_dice`,
    `rd_cosine`); any other metric falls back to `ScreenFingerprints` on
    the fingerprints returned by `GetFingerprints`.

    Returns the list of results; an empty list when the probe cannot be
    fingerprinted or no database connection is available.
    """
    try:
        # `apply` no longer exists in Python 3; call with ** instead.
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    conn = None
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error(
                'Error: Problems establishing connection to database: %s|%s\n' %
                (details.dbName, details.tableName))
            traceback.print_exc()

    if details.metric == DataStructs.TanimotoSimilarity:
        func = 'rd_tanimoto'
    elif details.metric == DataStructs.DiceSimilarity:
        func = 'rd_dice'
    elif details.metric == DataStructs.CosineSimilarity:
        func = 'rd_cosine'
    else:
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if conn is None:
        # Previously `conn` was simply unbound here and the cursor call
        # failed with an unrelated-looking error.
        FingerprintMols.error('Error: no database connection available for screening.\n')
        return []

    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder,
                                         details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = "select * from (%s) tmp where tani>%f" % (
            cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN

    curs = conn.GetCursor()
    curs.execute(cmd, (pkl, ))
    return curs.fetchall()
def caculate_similarity_fingerprint(smiles_A, smiles_B):
    """Return the fingerprint similarity of two SMILES, rounded to 4 decimals.

    Both SMILES are parsed and fingerprinted, then compared with
    `DataStructs.FingerprintSimilarity`.  If any step fails (for example a
    SMILES that cannot be turned into a fingerprint) -1 is returned.
    """
    try:
        m1 = Chem.MolFromSmiles(smiles_A)
        m2 = Chem.MolFromSmiles(smiles_B)
        f1 = FingerprintMols.FingerprintMol(m1)
        f2 = FingerprintMols.FingerprintMol(m2)
        return round(DataStructs.FingerprintSimilarity(f1, f2), 4)
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit are no longer silently turned into a -1 score.
        return -1
def combine_via_chemsimilarity(cypfile, cypreact):
    """Merge DrugBank and ChEMBL rows, separating structural duplicates.

    `cypfile` is the DrugBank CSV and `cypreact` the ChEMBL CSV; the SMILES
    is read from the second column of each row.  A DrugBank row whose
    fingerprint similarity to a ChEMBL row equals 1 is reported in
    "duplicates.csv" (a "Duplicates" marker row followed by the ChEMBL and
    DrugBank rows) and removed from the DrugBank set.  "final_.csv" receives
    all ChEMBL rows followed by the remaining DrugBank rows.  Both output
    files are written to the current working directory.  Returns None.
    """
    fp_cache = {}

    def fingerprint(smiles):
        # Each distinct SMILES is parsed and fingerprinted only once.
        if smiles not in fp_cache:
            fp_cache[smiles] = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))
        return fp_cache[smiles]

    with open("final_.csv", "w", newline='') as checked_file, \
            open("duplicates.csv", "w", newline='') as checked_file_2:
        csv_writer_checked = csv.writer(checked_file, quoting=csv.QUOTE_ALL)
        # this file is for keep track of duplicate compounds
        # if drugbank state the compound is inhibitor but chembl state
        # substrate; then it needs to investigate
        # (the original attached this writer to checked_file by mistake)
        csv_writer_checked_2 = csv.writer(checked_file_2, quoting=csv.QUOTE_ALL)

        # currently only support Drugbank data and ChEMBL data
        with open(cypfile, newline='') as drugbank_csv:
            DRUGBANK = list(csv.reader(drugbank_csv, delimiter=','))
        with open(cypreact, newline='') as ChEMBL_csv:
            CHEMBL = list(csv.reader(ChEMBL_csv, delimiter=','))

        for cl in CHEMBL:
            # Build the list of survivors instead of removing from DRUGBANK
            # while iterating over it, which skipped the following row.
            remaining = []
            for db in DRUGBANK:
                similarity = DataStructs.FingerprintSimilarity(
                    fingerprint(cl[1]), fingerprint(db[1]))
                if similarity == 1:
                    csv_writer_checked_2.writerow(["Duplicates"])
                    csv_writer_checked_2.writerow(cl)
                    csv_writer_checked_2.writerow(db)
                else:
                    remaining.append(db)
            DRUGBANK = remaining

        print("remaining compound from drugbank is: " + str(len(DRUGBANK)))

        csv_writer_checked.writerows(CHEMBL)
        csv_writer_checked.writerows(DRUGBANK)

    print("similarity check done ...")
    return None
def GetFingerprints(details):
    """returns an iterable sequence of fingerprints

    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

    The source is chosen from `details`: a database table when both
    `dbName` and `tableName` are set, otherwise a file of pickled
    (id, fingerprint) pairs when `inFileName` is set, otherwise None is
    returned.  When the input file cannot be opened the error is reported
    and an empty list is returned.
    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, "dbUser"):
                conn.user = details.dbUser
            if hasattr(details, "dbPassword"):
                conn.password = details.dbPassword
        except Exception:
            import traceback

            FingerprintMols.error(
                "Error: Problems establishing connection to database: %s|%s\n"
                % (details.dbName, details.tableName)
            )
            traceback.print_exc()
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()
        if _dataSeq:
            suppl = _dataSeq(curs, cmd, depickle=not details.noPickle,
                             klass=DataStructs.ExplicitBitVect)
            _dataSeq._conn = conn
        else:
            # NOTE(review): `data` is not defined anywhere in this function;
            # this branch raises NameError as written - confirm intent.
            suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
    elif details.inFileName:
        suppl = []
        try:
            inF = open(details.inFileName, "r")
        except IOError:
            import traceback

            FingerprintMols.error("Error: Problems reading from file %s\n" % (details.inFileName))
            traceback.print_exc()
        else:
            # Close the input file once all records are read; the original
            # left the handle open.
            try:
                while True:
                    try:
                        ID, fp = cPickle.load(inF)
                    except Exception:
                        # End of file (or an unreadable record) ends the scan.
                        break
                    fp._fieldsFromDb = [ID]
                    suppl.append(fp)
            finally:
                inF.close()
    else:
        suppl = None

    return suppl
def compute_single_tanimoto_metric(first_smile, second_smile):
    """Return the Tanimoto similarity of two SMILES strings.

    0.0 is returned when either SMILES is None or when either one cannot
    be parsed into a molecule.
    """
    smiles_pair = (first_smile, second_smile)
    if any(smile is None for smile in smiles_pair):
        return 0.

    mols = [MolFromSmiles(smile) for smile in smiles_pair]
    if any(mol is None for mol in mols):
        return 0.

    first_fp, second_fp = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    return DataStructs.FingerprintSimilarity(
        first_fp, second_fp, metric=DataStructs.TanimotoSimilarity)
def calc_tanimoto(self, reference):
    """
    Determine the similarity score between this object and `reference`,
    based on the fingerprints of their `rd_mol` attributes.

    Returns the value of `DataStructs.FingerprintSimilarity`, or None when
    fingerprinting or the comparison raises (the exception is logged).
    """
    try:
        fps_ref = FingerprintMols.FingerprintMol(reference.rd_mol)
        fps_self = FingerprintMols.FingerprintMol(self.rd_mol)
        return DataStructs.FingerprintSimilarity(fps_ref, fps_self)
    except Exception:
        # `except Exception` rather than a bare except, so interrupts and
        # interpreter exit are not swallowed and reported as "no score".
        logger.exception('Caught exception attempting to run rdkit FingerprintMols.FingerprintMol or DataStructs.FingerprintSimilarity')
        return None
def ScreenInDb(details, mol):
    """Screen a database for molecules similar to `mol`.

    The probe fingerprint is built from `mol` with `details.__dict__` as
    keyword arguments.  For the Tanimoto, Dice and Cosine metrics the
    similarity is computed by the database ("rd_tanimoto", "rd_dice",
    "rd_cosine"); any other metric falls back to `ScreenFingerprints` on
    the fingerprints returned by `GetFingerprints`.

    Returns the list of results; an empty list when the probe cannot be
    fingerprinted or no database connection is available.
    """
    try:
        # `apply` no longer exists in Python 3; call with ** instead.
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback

        FingerprintMols.error("Error: problems fingerprinting molecule.\n")
        traceback.print_exc()
        return []

    conn = None
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, "dbUser"):
                conn.user = details.dbUser
            if hasattr(details, "dbPassword"):
                conn.password = details.dbPassword
        except Exception:
            import traceback

            FingerprintMols.error(
                "Error: Problems establishing connection to database: %s|%s\n"
                % (details.dbName, details.tableName)
            )
            traceback.print_exc()

    if details.metric == DataStructs.TanimotoSimilarity:
        func = "rd_tanimoto"
    elif details.metric == DataStructs.DiceSimilarity:
        func = "rd_dice"
    elif details.metric == DataStructs.CosineSimilarity:
        func = "rd_cosine"
    else:
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if conn is None:
        # Previously `conn` was simply unbound here and the cursor call
        # failed with an unrelated-looking error.
        FingerprintMols.error("Error: no database connection available for screening.\n")
        return []

    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN

    curs = conn.GetCursor()
    curs.execute(cmd, (pkl,))
    return curs.fetchall()
def is_similar_reagent(rgt1, rgt2, list_of_metal_atoms, list_of_full_metal_names):
    """Return True when two reagent strings are considered the same reagent.

    Checks are applied in this order:

    1. identical strings are similar;
    2. a reagent containing 'Reaxys ID' is never similar to a different one;
    3. reagents naming the same metals are similar.  Metals are looked up by
       full name first and by atom symbol only if no full name matches; the
       comparison is on positions in the respective list;
    4. a reagent containing 'Reaxys' is not similar;
    5. otherwise both strings are treated as SMILES and are similar only if
       both fingerprints have at least one bit set and their fingerprint
       similarity is >= 1.0.  If the fingerprints cannot be computed,
       'cannot calculate fp' is printed and False is returned.
    """
    if rgt1 == rgt2:
        return True
    if 'Reaxys ID' in rgt1 or 'Reaxys ID' in rgt2:
        return False

    def metal_indices(reagent, default):
        # Full names take precedence over atom symbols; `default` is
        # returned when the reagent mentions no known metal.
        for metal_names in (list_of_full_metal_names, list_of_metal_atoms):
            hits = [
                metal_names.index(metal) for metal in metal_names
                if metal in reagent
            ]
            if hits:
                return hits
        return default

    # Distinct defaults (100 / 101) so two metal-free reagents never match
    # on this criterion.
    if metal_indices(rgt1, 100) == metal_indices(rgt2, 101):
        return True

    if 'Reaxys' in rgt1 or 'Reaxys' in rgt2:
        return False

    try:
        mol1 = Chem.MolFromSmiles(rgt1)
        mol2 = Chem.MolFromSmiles(rgt2)
        fp1 = FingerprintMols.FingerprintMol(mol1)
        fp2 = FingerprintMols.FingerprintMol(mol2)
    except Exception:
        # Narrowed from a bare except so interrupts are not swallowed.
        print('cannot calculate fp')
        return False

    if not any(list(fp1)) or not any(list(fp2)):
        return False

    return DataStructs.FingerprintSimilarity(fp1, fp2) >= 1.0
def mtansr(self):
    """
    Structural-similarity rule: fingerprint both stored molecules
    (``self.moli`` and ``self.molj``) and score them with
    ``DataStructs.FingerprintSimilarity``.

    Returns
    -------
    scr_tan : float
        the rule score
    """
    fingerprints = [
        FingerprintMols.FingerprintMol(mol) for mol in (self.moli, self.molj)
    ]
    return DataStructs.FingerprintSimilarity(*fingerprints)
def create_fpmols(smiles: List[str] or str) -> List or str:
    """Fingerprint one SMILES string or a list of SMILES strings.

    A ``str`` yields a single fingerprint; a ``list`` yields a list of
    fingerprints in the same order.  Any other type raises ``ValueError``.

    Note: the ``or`` expressions in the annotations are evaluated when the
    function is defined and reduce to their first operand, so they do not
    describe a union to type checkers.
    """
    if isinstance(smiles, str):
        return FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))

    if isinstance(smiles, list):
        mols = [Chem.MolFromSmiles(entry) for entry in smiles]
        return [FingerprintMols.FingerprintMol(mol) for mol in mols]

    raise ValueError(f'{type(smiles)} is not supported')
def pipe_sim_filter(stream, query, cutoff=80, summary=None, comp_id="pipe_sim_filter"):
    """Yield the records of `stream` at least `cutoff` percent similar to `query`.

    `query` is a SMILES string; the comparison is made on the fingerprint of
    its Murcko scaffold.  The fingerprint type follows the module-level
    `USE_FP` setting ("morgan", "avalon", anything else: the
    `FingerprintMols` fingerprint).

    Records without a "mol" field are skipped.  If the field `FP_b64`
    (e.g. pre-calculated) is present it is decoded and used, otherwise the
    Murcko scaffold fingerprint of the record's "mol" is generated
    on-the-fly (much slower).  Passing records get a "Sim" field holding the
    similarity in percent, rounded to two decimals.

    If `summary` is given, `summary[comp_id]` is updated with the running
    number of passing records.  When the query SMILES cannot be parsed an
    error is printed and nothing is yielded.
    """

    def scaffold_fp(mol):
        # Fingerprint of the Murcko scaffold, of the type selected by USE_FP.
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        if USE_FP == "morgan":
            return Desc.rdMolDescriptors.GetMorganFingerprintAsBitVect(scaffold, 2)
        if USE_FP == "avalon":
            return pyAv.GetAvalonFP(scaffold, 1024)
        return FingerprintMols.FingerprintMol(scaffold)

    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(comp_id))
        return None

    query_fp = scaffold_fp(query_mol)
    n_passed = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            rec_fp = scaffold_fp(rec["mol"])
        else:
            # NOTE(review): unpickling is only safe if the stream comes from
            # a trusted source.
            rec_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        sim = DataStructs.FingerprintSimilarity(query_fp, rec_fp)
        if sim * 100 >= cutoff:
            n_passed += 1
            rec["Sim"] = np.round(sim * 100, 2)
            if summary is not None:
                summary[comp_id] = n_passed
            yield rec
def score(end, start=None):
    """complexity score of a compound

    Should be high when that compound is difficult to synthesize and low
    when it is easy to synthesize.

    When `start` is given, the score is the negated fingerprint similarity
    between `end` and `start` (lower scores are better).  Without `start`,
    the score is 2 if `Hydrobromination.has_br(end)` is true and 1 otherwise.
    """
    if start:
        # Fingerprint only when there is something to compare against; the
        # original fingerprinted `start` unconditionally, so the default
        # start=None could never reach the fallback scores below.
        f_end = FingerprintMols.FingerprintMol(end)
        f_start = FingerprintMols.FingerprintMol(start)
        sim = DataStructs.FingerprintSimilarity(f_end, f_start)
        return -1 * sim  # lower scores are better

    if Hydrobromination.has_br(end):
        return 2
    return 1
def _ConnectToDatabase(details) -> DbConnect:
    """Open the database connection described by `details`.

    A connection is attempted only when both `details.dbName` and
    `details.tableName` are set; `details.dbUser` / `details.dbPassword`
    are applied to the connection when present.

    Returns the connection, or None when no database is configured or the
    connection attempt raises (the error is reported and the traceback
    printed).
    """
    if not (details.dbName and details.tableName):
        return None

    try:
        conn = DbConnect(details.dbName, details.tableName)
        for conn_attr, details_attr in (('user', 'dbUser'), ('password', 'dbPassword')):
            if hasattr(details, details_attr):
                setattr(conn, conn_attr, getattr(details, details_attr))
        return conn
    except Exception:
        import traceback
        FingerprintMols.error(f'Error: Problems establishing connection to '
                              f'database:{details.dbName}|{details.tableName}\n')
        traceback.print_exc()
    return None
def __post_init__(self):
    """Derive molecule and fingerprint lists from the SMILES fields.

    For each of `reactants`, `reagents` and `products` the SMILES are parsed
    into `mol_<field>` and those molecules are fingerprinted into
    `fp_<field>`.
    """

    def parse_all(smiles_list):
        return [Chem.MolFromSmiles(smi) for smi in smiles_list]

    def fingerprint_all(mols):
        return [FingerprintMols.FingerprintMol(mol) for mol in mols]

    # All molecules are parsed before any fingerprint is computed.
    self.mol_reactants = parse_all(self.reactants)
    self.mol_reagents = parse_all(self.reagents)
    self.mol_products = parse_all(self.products)

    self.fp_reactants = fingerprint_all(self.mol_reactants)
    self.fp_reagents = fingerprint_all(self.mol_reagents)
    self.fp_products = fingerprint_all(self.mol_products)
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X, simType):
    """Summarise the similarity of `predEx` to its X nearest training examples.

    `simType` selects the fingerprint and similarity measure:
    "Topological" (FingerprintMols fingerprint, FingerprintSimilarity),
    "Morgan" (Morgan fingerprint of radius 2, Dice similarity) or
    "MACCS" (MACCS keys, FingerprintSimilarity).

    The query molecule is parsed from `predEx[smilesAttrName].value`.  The
    i-th entry of `trainSmilesList` is fingerprinted and paired with the
    i-th example of `train`, whose name is `ex[nameAttr].value`.

    Returns the tuple (median, std, min, max, entropy, entropyClosest), each
    rounded to 3 decimals, computed over the X highest similarities; the two
    entropy values come from `getRespVar` on the top X and the top X // 2
    similarities respectively.

    Raises ValueError for an unknown `simType`.
    """
    # NOTE(review): the entries of trainSmilesList are passed straight to the
    # fingerprint functions, so they are presumably molecule objects rather
    # than SMILES strings despite the name - confirm against the caller.
    if simType == "Topological":
        make_fp = FingerprintMols.FingerprintMol
        similarity = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        def make_fp(mol):
            return AllChem.GetMorganFingerprint(mol, 2)
        similarity = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        make_fp = MACCSkeys.GenMACCSKeys
        similarity = DataStructs.FingerprintSimilarity
    else:
        # The original only printed this message and then failed later on
        # unbound variables; fail immediately with a clear error instead.
        raise ValueError("This type of sim is not implemented " + str(simType))

    fpsTrain = [make_fp(x) for x in trainSmilesList]
    fp = make_fp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = similarity(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer on Python 3.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def _pathFingerprintsClustering(rdkit_mols):
    """
    Returns the tanimoto distance matrix based on fingerprints method

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        The numpy array containing the tanimoto matrix
    """
    from rdkit.Chem.Fingerprints import FingerprintMols

    # compute the path fingerprints
    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in tqdm(rdkit_mols)]

    # One row per fingerprint: its distances to all fingerprints.
    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run_rows = executor(total=len(fingerprints), desc='PathFingerprints Distance')
    rows = run_rows(
        delayed(TanimotoDistances)(probe, fingerprints) for probe in fingerprints
    )
    return np.array(rows)
def CalculateDaylightFingerprint(mol):
    """
    #################################################################
    Calculate Daylight-like fingerprint or topological fingerprint.

    Usage:

        result=CalculateDaylightFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a tuple form. The first is the number of
        fingerprints (always reported as 2048). The second is a dict whose
        keys are the positions of the on-bits of this molecule's
        fingerprint, each mapped to 1. The third is the fingerprint object
        itself, which can be used for calculating similarity.
    #################################################################
    """
    fingerprint = FingerprintMols.FingerprintMol(mol)
    on_bits = dict.fromkeys(tuple(fingerprint.GetOnBits()), 1)
    return 2048, on_bits, fingerprint
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) of the cleaned SMILES.

    `typeFP` is one of "Mol", "MACCS", "pairs", "Torsion", "Morgan", or
    "All" for every type.  The molecule is parsed from `self.smiclean` and
    stored in `self.mol`; the fingerprints are stored in `self.FP`, a dict
    keyed by fingerprint type.

    Returns 0 on success.  If `self.smiclean` has not been set, a message is
    appended to `self.log` and 1 is returned.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    # (type name, fingerprint function), in the order they are computed.
    generators = (
        ("Mol", lambda mol: FingerprintMols.FingerprintMol(mol)),
        ("MACCS", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
        ("pairs", lambda mol: Pairs.GetAtomPairFingerprint(mol)),
        ("Torsion", lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol)),
        ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
    )
    self.FP = {
        name: generate(self.mol)
        for name, generate in generators
        if typeFP == name or typeFP == "All"
    }
    return 0
def get_fp(mols):
    """Return fingerprints of the truthy entries of `mols`.

    The fingerprint type is taken from `args.fpType`: 'ECFP4', 'ECFP6' and
    'ECFP12' are Morgan bit vectors of radius 2, 3 and 6; 'MACCS' are MACCS
    keys; 'Daylight' is the FingerprintMols fingerprint; 'AP' is a hashed
    atom-pair bit vector with 4096 bits.  Any other value gives an empty
    list.
    """
    generators = {
        'ECFP4': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2),
        'ECFP6': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3),
        'ECFP12': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6),
        'MACCS': lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        'Daylight': lambda mol: FingerprintMols.FingerprintMol(mol),
        'AP': lambda mol: GetHashedAtomPairFingerprintAsBitVect(mol, nBits=4096),
    }
    generate = generators.get(args.fpType)
    if generate is None:
        return []
    # Entries that are falsy (e.g. molecules that failed to parse) are skipped.
    return [generate(mol) for mol in mols if mol]
def ScreenFingerprints(details, data, mol=None, probeFp=None):
    """ Returns a list of results

    Each result is an (id, score) tuple.  `probeFp` is the probe
    fingerprint; when it is None it is built from `mol` using
    `details.__dict__` as keyword arguments.

    Entries of `data` are either (id, fingerprint) pairs or fingerprint
    objects carrying the id in `_fieldsFromDb[0]`; when `details.noPickle`
    is set they are (id, pickle) pairs and `details.metric` is called
    directly on the probe and the string form of the pickle.

    Depending on `details`, either the `details.topN` best hits are kept,
    or all hits with score >= `details.screenThresh` (when
    `details.doThreshold` is set), or every entry.  Screening stops after
    `details.stopAfter` entries when that attribute exists.

    An empty list is returned when the probe cannot be fingerprinted or the
    probe fingerprint is falsy.
    """
    if probeFp is None:
        try:
            # `apply` no longer exists in Python 3; call with ** instead.
            probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
        except Exception:
            import traceback
            FingerprintMols.error('Error: problems fingerprinting molecule.\n')
            traceback.print_exc()
            return []
    if not probeFp:
        return []

    if not details.doThreshold and details.topN > 0:
        topN = TopNContainer(details.topN)
    else:
        topN = []

    res = []
    count = 0
    for pt in data:
        if not details.noPickle:
            # isinstance replaces the Python-2-only types.TupleType /
            # types.ListType comparison.
            if isinstance(pt, (tuple, list)):
                ID, fp = pt
            else:
                fp = pt
                ID = pt._fieldsFromDb[0]
            score = DataStructs.FingerprintSimilarity(probeFp, fp, details.metric)
        else:
            ID, pkl = pt
            score = details.metric(probeFp, str(pkl))

        if topN:
            topN.Insert(score, ID)
        elif not details.doThreshold or score >= details.screenThresh:
            res.append((ID, score))

        count += 1
        if hasattr(details, 'stopAfter') and count >= details.stopAfter:
            break

    for score, ID in topN:
        res.append((ID, score))

    return res
def ScreenFromDetails(details, mol=None):
    """ Returns a list of results

    The probe molecule is `mol` if given, otherwise `details.probeMol`,
    otherwise it is parsed from `details.probeSmiles`.  Screening is done in
    the database (`ScreenInDb`) when `details.useDbSimilarity` is set and
    client side (`GetFingerprints` + `ScreenFingerprints`) otherwise.

    When `details.outFileName` is set, each result is also written to that
    file as one comma-separated line.

    Returns None when no probe molecule is available or the output file
    cannot be opened.
    """
    if not mol:
        if not details.probeMol:
            smi = details.probeSmiles
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                import traceback
                FingerprintMols.error('Error: problems generating molecule for smiles: %s\n' % (smi))
                traceback.print_exc()
                return
        else:
            mol = details.probeMol
    if not mol:
        return

    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'w+')
        except IOError:
            FingerprintMols.error("Error: could not open output file %s for writing\n" %
                                  (details.outFileName))
            return None

    # try/finally so the output file is closed even if screening fails; the
    # original never closed it.
    try:
        if not getattr(details, 'useDbSimilarity', False):
            data = GetFingerprints(details)
            res = ScreenFingerprints(details, data, mol)
        else:
            res = ScreenInDb(details, mol)

        if outF:
            for pt in res:
                outF.write(','.join([str(x) for x in pt]))
                outF.write('\n')
    finally:
        if outF:
            outF.close()
    return res
Default is *AutoFragmentFP* - --minPath=val: minimum path length to be included in fragment-based fingerprints. Default is *1*. - --maxPath=val: maximum path length to be included in fragment-based fingerprints. Default is *7*. - --nBitsPerHash: number of bits to be set in the output fingerprint for each fragment. Default is *4*. - --discrim: use of path-based discriminators to hash bits. Default is *false*. - -V: include valence information in the fingerprints Default is *false*. - -H: include Hs in the fingerprint Default is *false*. - --useMACCS: use the public MACCS keys to do the fingerprinting (instead of a daylight-type fingerprint) """ if __name__ == '__main__': FingerprintMols.message("This is MolSimilarity version %s\n\n"%(__VERSION_STRING)) FingerprintMols._usageDoc=_usageDoc details = FingerprintMols.ParseArgs() ScreenFromDetails(details)
Default is *AutoFragmentFP* - --minPath=val: minimum path length to be included in fragment-based fingerprints. Default is *1*. - --maxPath=val: maximum path length to be included in fragment-based fingerprints. Default is *7*. - --nBitsPerHash: number of bits to be set in the output fingerprint for each fragment. Default is *4*. - --discrim: use of path-based discriminators to hash bits. Default is *false*. - -V: include valence information in the fingerprints Default is *false*. - -H: include Hs in the fingerprint Default is *false*. - --useMACCS: use the public MACCS keys to do the fingerprinting (instead of a daylight-type fingerprint) """ if __name__ == '__main__': FingerprintMols.message("This is MolSimilarity\n\n") FingerprintMols._usageDoc = _usageDoc details = FingerprintMols.ParseArgs() ScreenFromDetails(details)