def test_distance():
    """Check MHFP distances from the reference fingerprint to three molecules.

    Each SMILES string is turned into a shingling and then into a fingerprint
    with the module-level ``mhfp_encoder``. The reference fingerprint ``fp`` is
    not created here; it is read from the enclosing scope.
    """

    def fingerprint(smi):
        # SMILES -> shingling -> MinHash fingerprint.
        shingles = mhfp_encoder.shingling_from_smiles(smi, sanitize=True)
        return mhfp_encoder.from_molecular_shingling(shingles)

    fp_a = fingerprint(
        'CCOC1=C(C=C(C=C1)S(=O)(=O)N(C)C)C2=NC(=O)C3=C(N2)C(=NN3C)C(C)(C)C')
    fp_b = fingerprint(
        'CCCC1=NN(C2=C1NC(=NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C')
    fp_c = fingerprint('O=C(OC)C(C1CCCCN1)C2=CC=CC=C2')

    # All fingerprints are built before the first assertion, and the
    # comparisons run in the order b, a, c.
    expectations = (
        (fp_b, 0.0),
        (fp_a, 0.45849609375),
        (fp_c, 0.97216796875),
    )
    for other, expected in expectations:
        assert MHFPEncoder.distance(fp, other) == expected
def GetMHFP6(mol, nBits=2048, radius=3):
    """Return the folded MHFP fingerprint of ``mol`` as a boolean array.

    With the default ``radius=3`` this is MHFP6.

    :param mol: molecule passed straight to ``MHFPEncoder.encode_mol``.
    :param nBits: used both as the encoder's ``n_permutations`` and as the
        length the hash values are folded to.
    :param radius: radius forwarded to ``encode_mol``.
    :return: the folded array converted with ``astype(bool)``.
    """
    mhfp = MHFPEncoder(n_permutations=nBits)
    folded = mhfp.fold(
        mhfp.encode_mol(
            mol, radius=radius, rings=True, kekulize=True, min_radius=1),
        nBits,
    )
    return folded.astype(bool)
def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
    """Set up a MAP4 calculator.

    Arguments:
        dimensions -- size handed to the encoder constructor
        radius -- stored on the instance as ``self.radius``
        is_counted -- stored on the instance as ``self.is_counted``
        is_folded -- when true the encoder is an ``MHFPEncoder``,
                     otherwise a ``tm.Minhash``
    """
    self.radius = radius
    self.is_counted = is_counted
    self.is_folded = is_folded

    # Only the selected encoder class is looked up and instantiated.
    encoder_cls = MHFPEncoder if self.is_folded else tm.Minhash
    self.encoder = encoder_cls(dimensions)
def CalculateSmilesExtendedConnectivityFingerprint(
        mol: Chem.Mol,
        radius: int = 2,
        rtype: str = 'bitstring',
        bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate SMILES extended connectivity fingerprint (SECFP),

    doi: 10.1186/s13321-018-0321-8.

    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict of bits turned on
    :param bits: Length of the folded fingerprint. It is passed to
                 ``secfp_from_mol`` for every ``rtype``.
    :return: one of the representations listed under ``rtype``.
             NOTE(review): the ``Tuple[...]`` annotation is kept for interface
             stability, but a single value is returned, not a tuple.
    """
    secfp = MHFPEncoder.secfp_from_mol(mol,
                                       length=bits,
                                       radius=radius,
                                       rings=True,
                                       kekulize=True,
                                       min_radius=1)
    if rtype == 'numpy':
        return secfp

    # Positions of the non-zero entries of the folded fingerprint.
    on_bits = [idx for idx, value in enumerate(secfp.tolist()) if value != 0]

    if rtype == 'dict':
        # Key on the bit *position*. Keying on the entry value (as before)
        # collapses every set bit onto the single key 1.
        return {idx: 1 for idx in on_bits}

    bv = DataStructs.ExplicitBitVect(bits)
    bv.SetBitsFromList(on_bits)
    if rtype == 'rdkit':
        return bv
    return bv.ToBitString()
def convert(subset):
    """Write three fingerprint files for one ChEMBL subset.

    Reads the first column of ``/cluster/chembl/chembl.<subset>.smi`` and, for
    every entry that ``AllChem.MolFromSmiles`` can parse, writes one
    comma-separated line to each of the ``.mhfp6``, ``.mhecfp4`` and ``.ecfp4``
    files. The files are produced one after the other, in that order.
    """
    prefix = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(prefix + '.smi', sep=' ', usecols=[0], header=None)
    mh = MHFPEncoder()

    def mhfp6(mol):
        return mh.encode_mol(mol)

    def mhecfp4(mol):
        # MinHash over the non-zero Morgan (radius 2) feature identifiers.
        return mh.from_sparse_array(
            [*AllChem.GetMorganFingerprint(mol, 2).GetNonzeroElements()])

    def ecfp4(mol):
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

    writers = (('.mhfp6', mhfp6), ('.mhecfp4', mhecfp4), ('.ecfp4', ecfp4))
    for suffix, fingerprint in writers:
        with open(prefix + suffix, 'w+') as out:
            for _, row in actives.iterrows():
                mol = AllChem.MolFromSmiles(row[0])
                if not mol:
                    # Unparseable SMILES are skipped silently.
                    continue
                out.write(','.join(map(str, fingerprint(mol))) + '\n')
def __init__(self,
             dimensions=1024,
             radius=2,
             is_counted=False,
             is_folded=False,
             return_strings=False):
    """Store the fingerprint settings and build the matching encoder.

    Parameters
    ----------
    dimensions : int (default = 1024)
        Number of entries in the output map4 fingerprint.
    radius : int (default = 2)
        Number of bonds away from atom centre to consider.
    is_counted : bool (default = False)
    is_folded : bool (default = False)
        Selects ``MHFPEncoder`` when true, ``tm.Minhash`` otherwise.
    return_strings : bool (default = False)
        If True then returns substructure strings rather than hashed
        fingerprint.
    """
    # Coerce every setting to its canonical type before storing it.
    self.dimensions = int(dimensions)
    self.radius = int(radius)
    self.is_counted = bool(is_counted)
    self.is_folded = bool(is_folded)
    self.return_strings = bool(return_strings)

    # The encoder receives the argument exactly as it was passed in.
    encoder_cls = MHFPEncoder if self.is_folded else tm.Minhash
    self.encoder = encoder_cls(dimensions)
def CalculateMinHashFingerprint(mol: Chem.Mol,
                                radius: int = 3,
                                rtype: str = 'bitstring',
                                bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate the MinHash Fingerprint (MHFP) of molecule.

    doi: 10.1186/s13321-018-0321-8.

    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  dict, for a dict of bits turned on
    :param bits: Number of folded bits (ignored if rtype != 'bitstring')
    """
    encoder = MHFPEncoder()
    hashed = encoder.hash(encoder.shingling_from_mol(mol, radius, True, True, 1))

    if rtype == 'numpy':
        return hashed
    if rtype == 'dict':
        return dict.fromkeys(hashed.tolist(), 1)

    # Any other rtype yields the folded fingerprint joined into one string.
    return ''.join(str(bit) for bit in encoder.fold(hashed, bits))
def _get_knn(query_mhfp, ann, k, data):
    """Pick the k best candidates out of a list of approximate neighbors.

    Every candidate is scored as ``1.0 - MHFPEncoder.distance(query, fp)`` and
    the candidates with the highest scores are returned.

    Keyword arguments:
        query_mhfp {numpy.ndarray} -- The query MHFP fingerprint.
        ann {list} -- Indices of the approximate nearest neighbors to be
                      searched exhaustively.
        k {int} -- The number of indices to return.
        data {dict} -- The MHFP values, looked up by the indices in ``ann``.

    Returns:
        list -- At most k indices from ``ann``, best score first.
    """
    scored = [
        (idx, 1.0 - MHFPEncoder.distance(query_mhfp, data[idx]))
        for idx in ann
    ]
    ranked = sorted(scored, key=itemgetter(1), reverse=True)
    return [entry[0] for entry in ranked[:k]]
def similarity_search(chos_fp, db, mol, number_of_hits, lf1, lf2, lf3,
                      findID1, findID2, findID3):
    """Return the hits of a linear-scan query for the given molecule.

    Arguments:
        chos_fp {string} -- 'MAP4' selects ``calc_map``; anything else
                            selects ``calc_mhfp``
        db {string} -- 'ChEMBL' uses (lf1, findID1), 'SwissProt' uses
                       (lf2, findID2), anything else uses (lf3, findID3)
        mol -- query molecule handed to the fingerprint function
        number_of_hits {integer} -- number of required hits
        lf1, lf2, lf3 -- objects providing ``query_linear_scan``
        findID1, findID2, findID3 -- lookups indexed by the hit key

    Returns:
        list -- one ``[field 1 split on ';', field 0, score rounded to 3
                digits, field 2]`` entry per hit
    """
    if db == 'ChEMBL':
        lf, findID = lf1, findID1
    elif db == 'SwissProt':
        lf, findID = lf2, findID2
    else:
        lf, findID = lf3, findID3

    # The encoder is always created, even when the MAP4 branch is taken.
    mhfp_encoder = MHFPEncoder(512)
    fp = calc_map(mol) if chos_fp == 'MAP4' else calc_mhfp(mhfp_encoder, mol)

    results = []
    for hit in lf.query_linear_scan(fp, int(number_of_hits)):
        key = hit[1]
        results.append([
            findID[key][1].split(';'),
            findID[key][0],
            round(hit[0], 3),
            findID[key][2],
        ])
    return results
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint ``mols``, index them in an LSH forest and save both.

    The fingerprints are pickled to ``<outpath>/fps.pickle`` and the forest is
    stored at ``<outpath>/lf.dat``.

    :param mols: molecules passed one by one to ``MHFPEncoder.encode_mol``.
    :param outpath: directory receiving the two output files.
    :param num_workers: ``max_workers`` value for ``process_map``.
    :return: the indexed ``tm.LSHForest``.
    """
    # MinHash fingerprint encoder and a forest of matching dimensionality.
    encoder = MHFPEncoder(1024)
    forest = tm.LSHForest(1024, 64)

    print("Number of mols to be hashed:", len(mols))
    fps = process_map(encoder.encode_mol,
                      mols,
                      chunksize=100,
                      max_workers=num_workers)

    forest.batch_add([tm.VectorUint(fp) for fp in fps])
    forest.index()

    # Persist the raw fingerprints first, then the forest itself.
    fps_path = os.path.join(outpath, "fps.pickle")
    with open(fps_path, "wb") as handle:
        pickle.dump(fps, handle)
    forest.store(os.path.join(outpath, "lf.dat"))
    print('LSH data files saved!')

    return forest
import pytest import numpy as np from scipy.spatial.distance import jaccard from rdkit.Chem import AllChem from mhfp.encoder import MHFPEncoder from mhfp.lsh_forest import LSHForestHelper # Keeping tests barebone and simple mhfp_encoder = MHFPEncoder() lfh = LSHForestHelper() drugbank = [] with open('test/drugbank.smi') as f: for line in f.readlines(): mol = AllChem.MolFromSmiles(line.strip().split()[0]) if mol: drugbank.append(mhfp_encoder.encode_mol(mol)) for i, fp in enumerate(drugbank): lfh.add(i, fp) lfh.index() def test_setup(): assert len(drugbank) == 226 def test_add():
type=int, help="Number of workers (CPU cores) to use for multiprocessing,\ default to the number of available CPU cores minus one", default=os.cpu_count() - 1) parser.add_argument("-d", "--dim", type=int, help="Fingerprint dimension, default to 1024", default=1024) a = parser.parse_args() outpath = os.path.abspath(a.output) mols = file_to_mols(a.filename) # Define a named properties tuple # To pickle a named tuple correctly: ## 1) The named tupple object has to be declared under __main__ ## 2) The declared variable for the named tuple has to match ## the tuple name in the quotation mark!! Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS']) # MinHash fingerprints (mhfp) encoder. This is a specialized molecular fingerprint scheme enc = MHFPEncoder(a.dim) # Locality Sensitive Hashing Forest lf = tm.LSHForest(a.dim, 64) MolsToLSHForest(mol_list=mols, save_path=outpath, worker=a.worker, batch_size=a.batch)
from rdkit.Chem import AllChem # config = mstmap.LayoutConfiguration() # # config.merger = mstmap.Merger.Solar # # print(config) # # TODO: Fails for disconnected components! # u = mstmap.VectorUint([0, 1, 2, 3, 4]) # v = mstmap.VectorUint([1, 2, 0, 4, 3]) # w = mstmap.VectorFloat([1.0, 1.0, 1.0, 2.0, 6.0]) # x, y = mstmap.layout(5, u, v, config, w) # print(x) # print(y) enc = MHFPEncoder(512) fps = [] if not os.path.isfile('fps.dat'): with open('drugbank.smi', 'r') as f: i = 0 for line in f: smiles = line.split()[0].strip() mol = AllChem.MolFromSmiles(smiles) if mol: fps.append(enc.encode_mol(mol)) i += 1 if i > 2000: break pickle.dump(fps, open('fps.dat', 'wb')) else:
class MAP4Calculator:
    """Calculator for the MAP4 atom-pair MinHashed fingerprint.

    Depending on ``is_folded`` the shingles are either MinHashed with
    ``tm.Minhash`` or hashed and folded with ``MHFPEncoder``.
    """

    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        MAP4 calculator class

        Arguments:
            dimensions -- size handed to the encoder and, for folded
                          fingerprints, the length the hash is folded to
            radius -- largest atom-environment radius that is generated
            is_counted -- append an occurrence counter to every shingle
            is_folded -- use ``MHFPEncoder`` hashing + folding instead of
                         ``tm.Minhash``
        """
        # Kept so that _fold can honour the requested fingerprint size.
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded

        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object

        Returns:
            tmap VectorUint -- minhashed fingerprint
            (the result of ``_fold`` when ``is_folded`` is set)
        """
        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mols -- list of mols

        Returns:
            list of tmap VectorUint -- minhashed fingerprints list
            (a list of ``_fold`` results when ``is_folded`` is set)
        """
        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        """Return the shingles of all atom pairs of ``mol``."""
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        """Hash the distinct shingles and fold them to ``self.dimensions``."""
        fp_hash = self.encoder.hash(set(pairs))
        # Pass the length explicitly; otherwise fold() falls back to its own
        # default and ignores the ``dimensions`` this calculator was built with.
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map every atom index to its environments for radius 1..self.radius."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                atoms_env.setdefault(idx, []).append(
                    MAP4Calculator._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of atom ``idx``, rooted at it.

        An empty string is returned when the atom is missing from the
        sub-molecule's atom map.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx in atom_map:
            return Chem.MolToSmiles(submol,
                                    rootedAtAtom=atom_map[idx],
                                    canonical=True,
                                    isomericSmiles=False)
        return ''

    def _all_pairs(self, mol, atoms_env):
        """Build the distinct ``env|distance|env`` shingles for all atom pairs.

        The two environments are sorted so the shingle does not depend on the
        order of the atoms. With ``is_counted`` a running occurrence number is
        appended, which keeps repeated shingles distinct.
        """
        atom_pairs = []
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_dict = defaultdict(int)

        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                env_a = atoms_env[idx1][i]
                env_b = atoms_env[idx2][i]

                ordered = sorted([env_a, env_b])
                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])

                if self.is_counted:
                    shingle_dict[shingle] += 1
                    # The counter is looked up with the un-suffixed shingle.
                    shingle += '|' + str(shingle_dict[shingle])

                atom_pairs.append(shingle.encode('utf-8'))
        return list(set(atom_pairs))
def test_secfp():
    """SECFP computed from the reference SMILES must equal the stored array.

    ``smiles`` and ``secfp`` are not defined here; they are read from the
    enclosing scope.
    """
    computed = MHFPEncoder.secfp_from_smiles(smiles)
    assert np.array_equal(computed, secfp)
import pytest import numpy as np from scipy.spatial.distance import jaccard from rdkit.Chem import AllChem from mhfp.encoder import MHFPEncoder # Keeping tests barebone and simple mhfp_encoder = MHFPEncoder() smiles = 'CCCC1=NN(C2=C1NC(=NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C' mol = AllChem.MolFromSmiles(smiles) shingling = sorted([ b'c1cnnc1', b'Cn(nc)c(c)c', b'C(C)Oc', b'c1(OCC)ccccc1-c([nH])n', b'S(c)(N)(=O)=O', b'c(nc)([nH]c)-c(c)c', b'CN(C)C', b'n(c(-c)[nH])c(c)=O', b'C(C)O', b'N(C)(C)S', b'S(=O)(=O)(c(cc)cc)N(CC)CC', b'CC', b'c1(CCC)nn(C)c(c)c1[nH]c', b'c1(S(=O)(=O)N(C)C)cccc(-c)c1', b'N(C)(C)C', b'c(cc)c(c)S', b'N1(S(=O)(=O)c(c)c)CCNCC1', b'c(c)(c)[nH]', b'O=c(nc)c(c)n', b'N(C)(CC)CC', b'n(c(c)C)n(c)C', b'C1CNCCN1', b'c1ccccc1', b'C(C)N', b'n(c)(C)n', b'C(CC)c(c)n', b'c(c(c)-c)c(c)S', b'O(c)C', b'CCO', b'CN(CC)CC', b'[nH](c)c', b'n(c)c', b'n1c(-c(c)c)[nH]cc(n)c1=O', b'N(CC)(CC)S(c)(=O)=O', b'C(CN)N(C)C', b'S(=O)(=O)(c(c)c)N(C)C', b'O(CC)c(cc)c(c)-c', b'C(C)Oc(c)c', b'C(C)Cc', b'C(CN)N(C)S', b'c1(-c(nc)[nH]c)cc(S)ccc1OC', b'c(cc)(OC)c(c)-c', b'n1c(CC)c([nH])c(c)n1C', b'c(c)(c)-c', b'c([nH]c)(c(C)n)c(c)n', b'c(c)(c)O', b'c1cc(O)ccc1S(N)(=O)=O', b'O=S(c)(N)=O', b'c12[nH]c(-c)nc(=O)c1n(C)nc2CC', b'c(-c)([nH])n', b'c1(=O)nc(-c)[nH]c(c)c1n(C)n', b'c1cc(S)cc(-c)c1OC', b'O(CC)c(c)c', b'c(c)(c)n', b'C(C)C', b'n(c)n', b'CCOc', b'c(cc)(-c([nH])n)c(c)O', b'c(CC)(nn)c(c)[nH]', b'c(c)(c)S', b'CCC', b'N1(C)CCNCC1', b'c(c(n)=O)(c(c)[nH])n(C)n', b'n(C)(nc)c(c)c', b'c(c)(n)=O', b'Cn(c)n', b'c(cc)(cc)S(N)(=O)=O', b'O=c(c)n', b'c(c)c', b'n1(C)nc(C)c([nH])c1c(n)=O',
class Map4Fingerprint:
    """Callable that computes the atom pair MinHashed fingerprint of a molecule.

    Fingerprint is as described by `DOI: 10.1186/1758-2946-5-26` and implemented
    in the [corresponding repository](https://github.com/reymond-group/map4).
    NOTE(review): confirm that the DOI above is the MAP4 publication.
    """

    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int (default = 1024)
            Number of entries in the output map4 fingerprint.
        radius : int (default = 2)
            Number of bonds away from atom centre to consider.
        is_counted : bool (default = False)
        is_folded : bool (default = False)
            Selects ``MHFPEncoder`` when true, ``tm.Minhash`` otherwise.
        return_strings : bool (default = False)
            If True then returns substructure strings rather than hashed
            fingerprint.
        """
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # The encoder receives the argument exactly as it was passed in.
        encoder_cls = MHFPEncoder if self.is_folded else tm.Minhash
        self.encoder = encoder_cls(dimensions)

    def __call__(self, mol):
        """Compute the fingerprint of one molecule.

        ``is_folded`` takes precedence over ``return_strings``.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            `rdkit` mol object.

        Returns
        -------
        fp_arr : np.ndarray
            The folded fingerprint, the raw shingles, or the MinHash produced
            by the encoder, wrapped with ``np.asarray``.
        """
        pairs = self._all_pairs(mol, self._get_atom_envs(mol))

        if self.is_folded:
            return np.asarray(self._fold(pairs))
        if self.return_strings:
            return np.asarray(pairs)
        return np.asarray(self.encoder.from_string_array(pairs))

    def _fold(self, pairs):
        """Hash the distinct shingles and fold them to ``self.dimensions``."""
        hashed = self.encoder.hash(set(pairs))
        return self.encoder.fold(hashed, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map every atom index to its environments for radius 1..self.radius."""
        envs_by_atom = {}
        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            for rad in range(1, self.radius + 1):
                envs_by_atom.setdefault(atom_idx, []).append(
                    Map4Fingerprint._find_env(mol, atom_idx, rad))
        return envs_by_atom

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of atom ``idx``, rooted at it.

        An empty string is returned when the atom is missing from the
        sub-molecule's atom map.
        """
        bond_path = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}
        submol = Chem.PathToSubmol(mol, bond_path, atomMap=atom_map)

        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol,
                                rootedAtAtom=atom_map[idx],
                                canonical=True,
                                isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the distinct ``env|distance|env`` shingles for all atom pairs.

        The two environments are sorted so the shingle does not depend on the
        order of the atoms. With ``is_counted`` a running occurrence number is
        appended, which keeps repeated shingles distinct.
        """
        distances = GetDistanceMatrix(mol)
        atom_count = mol.GetNumAtoms()
        seen = defaultdict(int)
        shingles = []

        for first, second in itertools.combinations(range(atom_count), 2):
            dist = str(int(distances[first][second]))

            for level in range(self.radius):
                low, high = sorted(
                    [atoms_env[first][level], atoms_env[second][level]])
                shingle = '{}|{}|{}'.format(low, dist, high)

                if self.is_counted:
                    seen[shingle] += 1
                    # The counter is looked up with the un-suffixed shingle.
                    shingle = shingle + '|' + str(seen[shingle])

                shingles.append(shingle.encode('utf-8'))

        return list(set(shingles))
def _rank_normalize(values):
    """Return the ranks of ``values`` (``ss.rankdata``) divided by their count."""
    return ss.rankdata(np.array(values) / max(values)) / len(values)


def main():
    """ The main function

    Reads ``drugbank.csv``, encodes every usable molecule with MHFP, lays the
    fingerprints out with an LSH forest and writes a Faerun plot named
    ``drugbank`` with eleven colour series (five categorical, six ranked).
    """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)

    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Columns are addressed by position. ``.iloc`` makes that explicit;
        # plain ``row[6]`` on a string-labelled Series is deprecated in pandas.
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        # Skip unparseable entries, molecules with five atoms or fewer, and
        # SMILES containing two or more '.' separators.
        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{row.iloc[0]}" target="_blank">{row.iloc[0]}</a>__{row.iloc[1]}'
                .replace("'", ""))
            groups.append(row.iloc[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)
    tpsa_ranked = _rank_normalize(tpsa)
    logp_ranked = _rank_normalize(logp)
    mw_ranked = _rank_normalize(mw)
    h_acceptors_ranked = _rank_normalize(h_acceptors)
    h_donors_ranked = _rank_normalize(h_donors)
    ring_count_ranked = _rank_normalize(ring_count)

    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per series: the first five are categorical, the remaining
        # six (TPSA .. Ring Count) are continuous.
        categorical=[
            True, True, True, True, True, False, False, False, False, False,
            False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")
    f.plot("drugbank", template="smiles")
# Import Modules
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from db_config import url, properties
from mhfp.encoder import MHFPEncoder

# System prerequisite. This is a shell command and must be run outside of
# Python (as a bare statement in this file it is a SyntaxError):
#   sudo apt-get install libboost-all-dev

mhfp_encoder = MHFPEncoder()
spark = SparkSession.builder.getOrCreate()


# Defined multiple UDF via PySpark Sql appfunctions.py module
def filename(path):
    """Return ``path`` unchanged (identity helper wrapped as a UDF below)."""
    return path


# Counts the characters 'c'/'C' in the string form of the value.
countCarbons = F.udf(lambda x: str(x).lower().count('c'), IntegerType())
sourceFile = F.udf(filename, StringType())
# NOTE(review): the UDF is declared as StringType while it returns whatever
# MHFPEncoder.encode produces -- confirm the intended column type.
mhfp_smiles = F.udf(
    lambda x: mhfp_encoder.encode(
        x, radius=3, rings=True, kekulize=True, sanitize=True),
    StringType())

# Created DataFrames here with the new columns that I required and dropped the duplicates
df = spark.read.format('csv').option('delimiter', '\t').option('header', 'false')\
    .load('s3a://zincdata/zinc/AA/AAAA.txt')
# NOTE(review): the file is loaded with header=false, yet a column named
# 'smiles' is referenced below -- verify the column name against the data.
df = df.withColumn('mhfp', mhfp_smiles('smiles'))
df = df.dropDuplicates(['smiles'])
df.show()

# Performed my dataframe write with the help of jdbc
#df.write.jdbc(url='jdbc:%s' % url, table="zincmap", mode='append', properties=properties)