def benchmark_fprinting(smiles, sdf_file, name, fprint_params=None):
    """Time 2D Morgan fingerprinting against 3D fingerprinting for one molecule.

    The molecule is loaded with ``mol_from_sdf``; a Morgan bit-vector
    fingerprint (radius 2, 1024 bits) and the ``fprints_from_mol``
    fingerprints are then generated, each under its own timer.

    Args:
        smiles: Unused by this function; kept so existing callers keep working.
        sdf_file: SDF input handed to ``mol_from_sdf``.
        name: Unused by this function; kept so existing callers keep working.
        fprint_params (dict, optional): Parameters forwarded to
            ``fprints_from_mol``. Its ``'first'`` entry (if any) is also
            passed to ``mol_from_sdf`` as ``conf_num``. ``None`` (the
            default) is treated as an empty dict.

    Returns:
        tuple: ``(fprint_2d_time, fprint_3d_time, num_heavy, num_confs,
        num_rot)`` where both times are elapsed seconds.
    """
    # Avoid a shared mutable default: build a fresh dict on every call.
    if fprint_params is None:
        fprint_params = {}

    mol = mol_from_sdf(sdf_file, conf_num=fprint_params.get('first'))
    num_confs = mol.GetNumConformers()
    num_rot = AllChem.CalcNumRotatableBonds(mol)
    num_heavy = mol.GetNumHeavyAtoms()

    # perf_counter is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, 1024)
    fprint_2d_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    fprints_from_mol(mol, fprint_params=fprint_params, save=False)
    fprint_3d_time = time.perf_counter() - start_time

    return (fprint_2d_time, fprint_3d_time, num_heavy, num_confs, num_rot)
def native_tuples_from_mol(mol, fprint_params=None, save=False):
    """Fingerprint molecule and convert to native encoding.

    Args:
        mol: Molecule object exposing ``HasProp``; it must carry a
            ``'_Name'`` property.
        fprint_params (dict, optional): Parameters forwarded to
            ``fprints_from_mol``. ``None`` (the default) is treated as an
            empty dict.
        save (bool): Forwarded to ``fprints_from_mol``.

    Returns:
        list: One ``fprint_to_native_tuple`` result per fingerprint returned
        by ``fprints_from_mol``, in the same order.

    Raises:
        ValueError: If ``mol`` has no ``'_Name'`` property.
    """
    if not mol.HasProp("_Name"):
        # This function takes no `name` argument, so the message only
        # points at the property the caller can actually set.
        raise ValueError("mol must have a '_Name' property")

    # Avoid a shared mutable default: build a fresh dict on every call.
    if fprint_params is None:
        fprint_params = {}

    fprints_list = fprints_from_mol(mol, fprint_params=fprint_params,
                                    save=save)
    return [fprint_to_native_tuple(fprint) for fprint in fprints_list]
def get_e3fp(mol, bits, smiles=None):
    """
    Compute an E3FP fingerprint for an RDKit mol as a flat integer array.

    The mol's ``_Name`` property is overwritten with ``smiles`` (or with the
    SMILES generated from ``mol`` when ``smiles`` is None) before
    fingerprinting. Only the first fingerprint returned by
    ``fprints_from_mol`` is used.

    Args:
        mol (rdkit.Chem.rdchem.Mol): RDKit mol object
        bits (int): Number of bits in fingerprint
        smiles (str, optional): Name to assign to the mol; derived from
            ``mol`` when omitted
    Returns:
        fp (np.array): fingerprint as a 1-D integer array
    """
    mol_name = Chem.MolToSmiles(mol) if smiles is None else smiles
    mol.SetProp("_Name", mol_name)

    first_fprint = fprints_from_mol(mol, fprint_params={"bits": bits})[0]
    dense = first_fprint.to_vector().toarray()
    return dense.astype(int).reshape(-1)
def gen_mol_blocks_from_confs(mols, num_confs, ref, ref_mol_block):
    """Embed conformers for each molecule and tabulate similarity to a reference.

    For every molecule, hydrogens are added, ``num_confs`` conformers are
    requested from ``AllChem.EmbedMultipleConfs`` (fixed random seed),
    fingerprints are generated with ``fprints_from_mol`` and folded to RDKit
    form, and each is compared to ``ref`` with
    ``DataStructs.FingerprintSimilarity``.

    Args:
        mols: Iterable of RDKit molecules. The input is deep-copied; each
            molecule's ``'_Name'`` property is read to build row names.
        num_confs (int): Number of conformers requested per molecule and
            number of rows emitted per molecule.
        ref: Reference fingerprint passed as the first argument to
            ``DataStructs.FingerprintSimilarity``.
        ref_mol_block: Reference mol block; stored unchanged in the
            ``'ref_mol_block'`` column of every row.

    Returns:
        pandas.DataFrame: Columns ``'name'``, ``'mol_blocks'``, ``'fps'``
        (similarity to ``ref``) and ``'ref_mol_block'``, one row per
        conformer.
    """
    mols_b = copy.deepcopy(mols)
    names = []
    mol_blocks = []
    fps = []
    # Separate accumulator: the original rebound `ref_mol_block` to a list
    # and appended that list to itself, discarding the caller's value.
    ref_mol_blocks = []

    for mol in mols_b:
        mol = AllChem.AddHs(mol)
        AllChem.EmbedMultipleConfs(mol,
                                   numConfs=num_confs,
                                   ignoreSmoothingFailures=True,
                                   pruneRmsThresh=-1.0,
                                   maxAttempts=10 * num_confs,
                                   randomSeed=0xf00d)
        # Built per molecule so fprints_from_mol always gets a fresh dict.
        fprint_params = {
            'bits': 4096,
            'first': num_confs,
            'radius_multiplier': 1.5,
            'rdkit_invariants': True
        }
        fprints = fprints_from_mol(mol, fprint_params=fprint_params)
        binfp = [fp.fold().to_rdkit() for fp in fprints]
        similarity_efcp4 = [
            DataStructs.FingerprintSimilarity(ref, x) for x in binfp
        ]
        for i in range(num_confs):
            sub_name = mol.GetProp('_Name')
            names.append(f'{sub_name}_confnum_{i}')
            mol_blocks.append(Chem.MolToMolBlock(mol, confId=i))
            fps.append(similarity_efcp4[i])
            ref_mol_blocks.append(ref_mol_block)

    df = pd.DataFrame(list(zip(names, mol_blocks, fps, ref_mol_blocks)),
                      columns=['name', 'mol_blocks', 'fps', 'ref_mol_block'])
    return df
def get_e3fp_tc(mol):
    """Return ``tanimoto`` applied to the first two fingerprints of ``mol``.

    Fingerprints come from ``fprints_from_mol`` called with the module-level
    ``FPRINT_PARAMS``; the entries at index 0 and index 1 are compared.
    """
    mol_fprints = fprints_from_mol(mol, fprint_params=FPRINT_PARAMS)
    first_fprint = mol_fprints[0]
    second_fprint = mol_fprints[1]
    return tanimoto(first_fprint, second_fprint)
# --- Script section: fingerprint the reference molecule, then run the
# --- chunked CSV extraction against it.

# Chunking configuration handed to csv_chunk_extractor.
# NOTE(review): true division makes chunksize a float (104.8576) - confirm
# that csv_chunk_extractor accepts a non-integer chunk size.
chunksize = 1048576 / 10000
chunks = 10

# Load every record of the SDF file, keeping explicit hydrogens.
suppl = list(
    AllChem.SDMolSupplier(
        '/Users/tom/code_test_repository/arrow_testing/cdk2.sdf',
        removeHs=False))

# The first record of the file is the reference molecule.
ref_mol = suppl[0]
ref_mol_block = Chem.MolToMolBlock(ref_mol)

# Fingerprint the reference and fold each fingerprint to RDKit form.
fprint_params = dict(bits=4096, radius_multiplier=1.5, rdkit_invariants=True)
fprints = fprints_from_mol(ref_mol, fprint_params=fprint_params)
binfp = [fprint.fold().to_rdkit() for fprint in fprints]

# Copy the first folded fingerprint into a numpy int8 array.
arr = np.zeros((0, ), dtype=np.int8)
DataStructs.ConvertToNumpyArray(binfp[0], arr)

# ref_fprint = fprints_from_mol(ref_mol, fprint_params=fprint_params)
# ref = ref_fprint[0].fold().to_rdkit()
# ref_smiles = 'CCC1=CC(Cl)=C(OC)C(C(NC[C@H]2C[C@H](OC)CN2CC)=O)=C1O'

include_columns = ['SMILES', 'Name']
table_list = csv_chunk_extractor(chunks, chunksize, include_columns, arr,
                                 ref_mol_block)

ray.shutdown()
print('finished alignment')

# table = Chem.MolFromMolBlock(mol_block_array[0])
# print(f' the mol from the array is not a mol true or false: {m_out_of_array is None}')