def GetMHFP6(mol, nBits=2048, radius=3):
    """Compute a folded MHFP6 fingerprint and return it as booleans.

    MHFP6 corresponds to the default ``radius=3``.

    Parameters
    ----------
    mol :
        Molecule handed unchanged to ``MHFPEncoder.encode_mol``
        (presumably an RDKit ``Mol`` -- confirm against callers).
    nBits : int
        Used twice: as the number of permutations of the encoder and as
        the length the hash values are folded to.
    radius : int
        Radius forwarded to ``encode_mol``.

    Returns
    -------
    The result of ``MHFPEncoder.fold`` converted with ``.astype(bool)``.
    """
    mhfp = MHFPEncoder(n_permutations=nBits)
    minhashes = mhfp.encode_mol(
        mol,
        radius=radius,
        rings=True,
        kekulize=True,
        min_radius=1,
    )
    return mhfp.fold(minhashes, nBits).astype(bool)
def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
    """Set up a MAP4 calculator.

    ``radius``, ``is_counted`` and ``is_folded`` are stored on the
    instance as given.  ``dimensions`` is not stored; it is only used to
    size the encoder, which is an ``MHFPEncoder`` when ``is_folded`` is
    truthy and a ``tm.Minhash`` otherwise.
    """
    self.radius = radius
    self.is_counted = is_counted
    self.is_folded = is_folded
    # Folded fingerprints go through MHFPEncoder, unfolded ones through
    # tmap's Minhash.
    self.encoder = (
        MHFPEncoder(dimensions) if self.is_folded else tm.Minhash(dimensions)
    )
def similarity_search(chos_fp, db, mol, number_of_hits, lf1, lf2, lf3,
                      findID1, findID2, findID3):
    """Return the nearest neighbours of a query molecule in one database.

    Arguments:
        chos_fp {string} -- fingerprint to use; 'MAP4' selects ``calc_map``,
            any other value selects ``calc_mhfp`` with a 512-permutation
            ``MHFPEncoder``
        db {string} -- 'ChEMBL' selects ``lf1``/``findID1``, 'SwissProt'
            selects ``lf2``/``findID2``, anything else ``lf3``/``findID3``
        mol -- query molecule, passed unchanged to the fingerprint helper
        number_of_hits {integer} -- number of required hits (converted
            with ``int``)
        lf1, lf2, lf3 -- objects exposing ``query_linear_scan(fp, k)``
        findID1, findID2, findID3 -- lookups indexed by the second element
            of each hit; every entry must be indexable at 0, 1 and 2

    Returns:
        list -- one ``[entry[1].split(';'), entry[0], value, entry[2]]`` row
            per hit, where ``value`` is the first element of the hit rounded
            to 3 decimals (presumably the distance -- confirm against
            ``query_linear_scan``)
    """
    if db == 'ChEMBL':
        lf, findID = lf1, findID1
    elif db == 'SwissProt':
        lf, findID = lf2, findID2
    else:
        lf, findID = lf3, findID3

    if chos_fp == 'MAP4':
        fp = calc_map(mol)
    else:
        # The MHFP encoder is only needed on this branch, so it is built
        # here rather than on every call.
        fp = calc_mhfp(MHFPEncoder(512), mol)

    results = []
    for NN in lf.query_linear_scan(fp, int(number_of_hits)):
        entry = findID[NN[1]]
        results.append([
            entry[1].split(';'),
            entry[0],
            round(NN[0], 3),
            entry[2],
        ])
    return results
def convert(subset):
    """Write MHFP6, MinHashed-ECFP4 and ECFP4 fingerprints for one subset.

    Reads the first space-separated column of
    ``/cluster/chembl/chembl.<subset>.smi`` as SMILES and writes three
    sibling files, one comma-separated fingerprint per line:

    * ``.mhfp6``   -- ``MHFPEncoder.encode_mol(mol)``
    * ``.mhecfp4`` -- ``MHFPEncoder.from_sparse_array`` applied to the
      non-zero element ids of the radius-2 Morgan fingerprint
    * ``.ecfp4``   -- the 2048-bit radius-2 Morgan bit vector

    SMILES that RDKit cannot parse are skipped in all three files.

    Each SMILES is parsed once and all three files are written in a single
    pass; the file contents are the same as when every file is produced by
    its own loop.

    Parameters
    ----------
    subset :
        Identifier converted with ``str`` and inserted into the file names.
    """
    prefix = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(prefix + '.smi', sep=' ', usecols=[0], header=None)
    mh = MHFPEncoder()

    with open(prefix + '.mhfp6', 'w') as mhfp6_file, \
            open(prefix + '.mhecfp4', 'w') as mhecfp4_file, \
            open(prefix + '.ecfp4', 'w') as ecfp4_file:
        # header=None keeps the integer label 0 for the only loaded column.
        for smiles in actives[0]:
            mol = AllChem.MolFromSmiles(smiles)
            if not mol:
                continue

            mhfp6 = mh.encode_mol(mol)
            mhfp6_file.write(','.join(map(str, mhfp6)) + '\n')

            morgan_ids = [*AllChem.GetMorganFingerprint(mol, 2).GetNonzeroElements()]
            mhecfp4 = mh.from_sparse_array(morgan_ids)
            mhecfp4_file.write(','.join(map(str, mhecfp4)) + '\n')

            ecfp4 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            ecfp4_file.write(','.join(map(str, ecfp4)) + '\n')
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint ``mols``, index them in an LSH forest and save both.

    Every molecule is encoded with a 1024-permutation ``MHFPEncoder`` via
    ``process_map`` (chunks of 100, ``num_workers`` workers).  The
    fingerprints are added to a ``tm.LSHForest(1024, 64)``, which is then
    indexed.

    Two files are written into ``outpath``: ``fps.pickle`` (the pickled
    fingerprint list) and ``lf.dat`` (the stored forest).

    Returns the indexed forest.
    """
    encoder = MHFPEncoder(1024)
    forest = tm.LSHForest(1024, 64)

    print("Number of mols to be hashed:", len(mols))
    fingerprints = process_map(
        encoder.encode_mol,
        mols,
        chunksize=100,
        max_workers=num_workers,
    )

    forest.batch_add([tm.VectorUint(fingerprint) for fingerprint in fingerprints])
    forest.index()

    # Persist the raw fingerprints next to the forest.
    pickle_path = os.path.join(outpath, "fps.pickle")
    with open(pickle_path, "wb") as fpfile:
        pickle.dump(fingerprints, fpfile)
    forest.store(os.path.join(outpath, "lf.dat"))

    print('LSH data files saved!')
    return forest
def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False, return_strings=False):
    """
    Parameters
    ----------
    dimensions : int (default = 1024)
        Number of entries in the output map4 fingerprint.
    radius : int (default = 2)
        Number of bonds away from atom centre to consider.
    is_counted : bool (default = False)
        Stored on the instance; not otherwise used by this constructor.
    is_folded : bool (default = False)
        If True the encoder is an ``MHFPEncoder``, otherwise a
        ``tm.Minhash``.
    return_strings : bool (default = False)
        If True then returns substructure strings rather than hashed fingerprint.
    """
    self.dimensions = int(dimensions)
    self.radius = int(radius)
    self.is_counted = bool(is_counted)
    self.is_folded = bool(is_folded)
    self.return_strings = bool(return_strings)

    # Size the encoder from the coerced value so that ``self.dimensions``
    # and the encoder always agree, even when the caller passes a float,
    # a numpy integer or a numeric string.
    if self.is_folded:
        self.encoder = MHFPEncoder(self.dimensions)
    else:
        self.encoder = tm.Minhash(self.dimensions)
def CalculateMinHashFingerprint(mol: Chem.Mol,
                                radius: int = 3,
                                rtype: str = 'bitstring',
                                bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate the MinHash Fingerprint (MHFP) of molecule.

    doi: 10.1186/s13321-018-0321-8.

    Despite the ``Tuple`` annotation, exactly one object is returned; its
    type depends on ``rtype``.

    :param mol: molecule to fingerprint
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  dict, for a dict of bits turned on
                  Any value other than 'numpy' or 'dict' is treated as
                  'bitstring'.
    :param bits: Number of folded bits (ignored if rtype != 'bitstring')
    :return: the hash values as returned by the encoder ('numpy'), a dict
             mapping each hash value to 1 ('dict'), or the folded
             fingerprint joined into one string (otherwise)
    """
    encoder = MHFPEncoder()
    # Positional flags are passed through as in the encoder call
    # (presumably rings, kekulize, min_radius -- confirm against mhfp).
    hashes = encoder.hash(encoder.shingling_from_mol(mol, radius, True, True, 1))

    if rtype == 'numpy':
        return hashes
    if rtype == 'dict':
        return dict.fromkeys(hashes.tolist(), 1)
    return ''.join(map(str, encoder.fold(hashes, bits)))
import pytest import numpy as np from scipy.spatial.distance import jaccard from rdkit.Chem import AllChem from mhfp.encoder import MHFPEncoder from mhfp.lsh_forest import LSHForestHelper # Keeping tests barebone and simple mhfp_encoder = MHFPEncoder() lfh = LSHForestHelper() drugbank = [] with open('test/drugbank.smi') as f: for line in f.readlines(): mol = AllChem.MolFromSmiles(line.strip().split()[0]) if mol: drugbank.append(mhfp_encoder.encode_mol(mol)) for i, fp in enumerate(drugbank): lfh.add(i, fp) lfh.index() def test_setup(): assert len(drugbank) == 226 def test_add():
type=int, help="Number of workers (CPU cores) to use for multiprocessing,\ default to the number of available CPU cores minus one", default=os.cpu_count() - 1) parser.add_argument("-d", "--dim", type=int, help="Fingerprint dimension, default to 1024", default=1024) a = parser.parse_args() outpath = os.path.abspath(a.output) mols = file_to_mols(a.filename) # Define a named properties tuple # To pickle a named tuple correctly: ## 1) The named tupple object has to be declared under __main__ ## 2) The declared variable for the named tuple has to match ## the tuple name in the quotation mark!! Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS']) # MinHash fingerprints (mhfp) encoder. This is a specialized molecular fingerprint scheme enc = MHFPEncoder(a.dim) # Locality Sensitive Hashing Forest lf = tm.LSHForest(a.dim, 64) MolsToLSHForest(mol_list=mols, save_path=outpath, worker=a.worker, batch_size=a.batch)
def main():
    """Build the DrugBank TMAP and render it with Faerun.

    Steps:

    1. Read ``drugbank.csv`` and drop rows without a ``SMILES`` value.
    2. For every molecule that parses, has more than 5 atoms and fewer
       than two ``.`` separators in its SMILES, compute an MHFP
       fingerprint plus a set of descriptors and substructure flags.
    3. Index the fingerprints in an LSH forest and compute a tree layout.
    4. Plot the scatter and the tree, writing the ``drugbank`` plot.

    CSV columns are read by position: 0 is the DrugBank id, 1 the name,
    3 a ``;``-separated group list and 6 the SMILES (as implied by how the
    values are used below).
    """

    def ranked(values):
        # Rank-normalise to (0, 1].  Ranks do not change under scaling by a
        # positive constant, so the values are ranked directly; dividing by
        # max(values) first would give NaN when the maximum is 0 and would
        # reverse the order when it is negative.
        return ss.rankdata(np.array(values)) / len(values)

    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)

    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Use .iloc for positional access: integer keys on a Series with
        # string labels rely on a deprecated pandas fallback.
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            drug_id = row.iloc[0]
            name = row.iloc[1]

            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drug_id}" target="_blank">{drug_id}</a>__{name}'
                .replace("'", ""))
            groups.append(row.iloc[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)

    tpsa_ranked = ranked(tpsa)
    logp_ranked = ranked(logp)
    mw_ranked = ranked(mw)
    h_acceptors_ranked = ranked(h_acceptors)
    h_donors_ranked = ranked(h_donors)
    ring_count_ranked = ranked(ring_count)

    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    # Eleven series: five categorical followed by six continuous.  Every
    # per-series list below is kept in this same order.
    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per series (11 in total).
        categorical=[
            True, True, True, True, True,
            False, False, False, False, False, False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")
from rdkit.Chem import AllChem # config = mstmap.LayoutConfiguration() # # config.merger = mstmap.Merger.Solar # # print(config) # # TODO: Fails for disconnected components! # u = mstmap.VectorUint([0, 1, 2, 3, 4]) # v = mstmap.VectorUint([1, 2, 0, 4, 3]) # w = mstmap.VectorFloat([1.0, 1.0, 1.0, 2.0, 6.0]) # x, y = mstmap.layout(5, u, v, config, w) # print(x) # print(y) enc = MHFPEncoder(512) fps = [] if not os.path.isfile('fps.dat'): with open('drugbank.smi', 'r') as f: i = 0 for line in f: smiles = line.split()[0].strip() mol = AllChem.MolFromSmiles(smiles) if mol: fps.append(enc.encode_mol(mol)) i += 1 if i > 2000: break pickle.dump(fps, open('fps.dat', 'wb')) else: