def search_by_mols(self, mols, topk=10):
    """Look up the ``topk`` nearest library entries for each query molecule.

    Every query is turned into a MACCS keys fingerprint, passed through
    ``self.vec2bytes`` and searched in ``self.index``. Each returned hit is
    then re-scored against its query with ``self.calc_similarity``.

    :param mols: iterable of molecules accepted by
        ``rdMolDescriptors.GetMACCSKeysFingerprint``
    :param topk: number of neighbours requested from the index per query
    :return: one list per query, in query order, shaped like
        ``[[{"id": xx, "smiles": xx, "score": xx}, {}, ...], []]`` with the
        hits of each query sorted by descending ``score``; ``[]`` when
        ``mols`` is empty
    """
    # Materialise once: the queries are walked twice (encoding and scoring),
    # which would silently exhaust a generator.
    mols = list(mols)
    if not mols:
        # Nothing to search for; avoid handing a shape-(0,) array to the index.
        return []

    query_vecs = []
    for mol in mols:
        bits = np.array([])
        # ConvertToNumpyArray writes the fingerprint into ``bits`` in place.
        DataStructs.ConvertToNumpyArray(
            rdMolDescriptors.GetMACCSKeysFingerprint(mol), bits)
        query_vecs.append(self.vec2bytes(bits))

    # The distances reported by the index are not used; hits are re-scored below.
    _, ret_ids = self.index.search(
        np.array(query_vecs).astype("uint8"), topk)

    rets = []
    for mol, ids in zip(mols, ret_ids):
        hits = []
        for idx in ids:
            # NOTE(review): FAISS-style indexes pad missing neighbours with
            # -1, which ``iloc`` would read as "last row" and return an
            # unrelated entry. Skip such ids -- confirm the index type.
            if idx < 0:
                continue
            row = self.df_zinc.iloc[idx]  # fetch the row once per hit
            smiles = row["smiles"]
            hits.append({
                "id": row["zinc_id"],
                "smiles": smiles,
                "score": self.calc_similarity(
                    mol, Chem.MolFromSmiles(smiles)),
            })
        hits.sort(key=lambda item: item["score"], reverse=True)
        rets.append(hits)
    return rets
def GenerateMACCS166KeysFingerprints(Mols):
    """Compute a MACCS keys fingerprint for every molecule in Mols.

    Reports the run through MiscUtil.PrintInfo, then returns a list with the
    result of rdMolDescriptors.GetMACCSKeysFingerprint for each molecule, in
    input order.
    """

    FingerprintsType = OptionsInfo["SpecifiedFingerprintsType"]
    MiscUtil.PrintInfo("\nGenerating MACCS166Keys %s fingerprints..." % FingerprintsType)

    # One fingerprint object per input molecule...
    MolsFingerprints = []
    for Mol in Mols:
        MolsFingerprints.append(rdMolDescriptors.GetMACCSKeysFingerprint(Mol))

    return MolsFingerprints
def get_maccs(molecule):
    """Return the MACCS keys fingerprint of ``molecule``, or NaN on failure.

    Any exception raised while fingerprinting is printed, followed by a line
    naming the offending molecule, and ``np.nan`` is returned in place of the
    fingerprint.
    """
    try:
        # The returned fingerprint object does not have a length.
        return rdMolDescriptors.GetMACCSKeysFingerprint(molecule)
    except Exception as err:
        print(err)
        print("error" + str(molecule))
    return np.nan
def maccs_keys(smiles):
    """Return the MACCS keys fingerprint of a SMILES string as a NumPy array.

    The SMILES is parsed with Chem.MolFromSmiles, fingerprinted with
    rdMolDescriptors.GetMACCSKeysFingerprint, and the fingerprint is handed
    to np.array.
    """
    # Earlier variants, kept for reference:
    #   mol = Chem.MolFromSmiles(row['smiles'])  # this is where the SMILES come in
    #   res = fingerprint.CalculateMACCSFingerprint(mol)  # would apply if this were not a vector
    #   result_maccs.append(res)
    #   DataStructs.ConvertToNumpyArray(desc, arr)
    molecule = Chem.MolFromSmiles(smiles)
    fingerprint_bits = rdMolDescriptors.GetMACCSKeysFingerprint(molecule)
    return np.array(fingerprint_bits)
def build_mol_features(in_file, out_file):
    """Fingerprint every SMILES in a zipped CSV and save the stacked result.

    ``in_file`` is read with pandas as a zip-compressed CSV. A MACCS keys
    fingerprint is computed for each entry of its ``smiles`` column, and the
    per-molecule arrays are stacked and written to ``out_file`` with
    ``np.save``. Nothing is returned.
    """
    zinc_table = pd.read_csv(in_file, compression="zip")
    smiles_column = zinc_table["smiles"]

    fingerprints = []
    for smiles in tqdm.tqdm(smiles_column, total=len(zinc_table)):
        bits = np.array([])
        molecule = Chem.MolFromSmiles(smiles)
        bit_vect = rdMolDescriptors.GetMACCSKeysFingerprint(molecule)
        # The converter writes into ``bits``; its return value is not used.
        DataStructs.ConvertToNumpyArray(bit_vect, bits)
        fingerprints.append(bits)

    stacked = np.array(fingerprints)
    np.save(out_file, stacked)
def _encode(smi: str, fingerprint: str, radius: int, length: int) -> T_comp:
    """Encode one SMILES string with the requested fingerprint type.

    Fingerprint functions are wrapped in this static function so that they
    can be pickled for parallel processing.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to encode
    fingerprint : str
        fingerprint type: 'morgan', 'pair', 'rdkit', 'maccs' or 'map4'
    radius : int
        fingerprint radius; the 'pair' and 'rdkit' types use ``1 + radius``
        as their maximum length/path, and 'maccs' ignores it
    length : int
        fingerprint length (bits or dimensions); ignored by 'maccs'

    Returns
    -------
    T_comp
        the compressed feature representation of the molecule

    Raises
    ------
    NotImplementedError
        if ``fingerprint`` is not one of the recognized types
    """
    mol = Chem.MolFromSmiles(smi)

    if fingerprint == 'morgan':
        encoded = rdmd.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=length, useChirality=True
        )
    elif fingerprint == 'pair':
        encoded = rdmd.GetHashedAtomPairFingerprintAsBitVect(
            mol, minLength=1, maxLength=1 + radius, nBits=length
        )
    elif fingerprint == 'rdkit':
        # NOTE(review): RDKFingerprint is usually reached as
        # rdkit.Chem.RDKFingerprint -- confirm that `rdmd` provides it.
        encoded = rdmd.RDKFingerprint(
            mol, minPath=1, maxPath=1 + radius, fpSize=length
        )
    elif fingerprint == 'maccs':
        encoded = rdmd.GetMACCSKeysFingerprint(mol)
    elif fingerprint == 'map4':
        calculator = map4.MAP4Calculator(
            dimensions=length, radius=radius, is_folded=True
        )
        encoded = calculator.calculate(mol)
    else:
        raise NotImplementedError(f'Unrecognized fingerprint: "{fingerprint}"')

    return encoded
def calc_similarity(self, mol1, mol2):
    """Return the Tanimoto similarity of two molecules' MACCS keys fingerprints.

    :param mol1: first molecule
    :param mol2: second molecule
    :return: value of ``DataStructs.TanimotoSimilarity`` for the two fingerprints
    """
    fingerprints = [
        rdMolDescriptors.GetMACCSKeysFingerprint(mol) for mol in (mol1, mol2)
    ]
    return DataStructs.TanimotoSimilarity(*fingerprints)
def MACCS_keys(smiles):
    """Return the MACCS keys fingerprint of a SMILES string as an int32 array.

    The SMILES is parsed with Chem.MolFromSmiles, and the fingerprint is
    written into a NumPy array through cDataStructs.ConvertToNumpyArray.
    """
    # Start from an empty int32 buffer; the converter fills it in place.
    bits = np.zeros((0,), dtype=np.int32)
    molecule = Chem.MolFromSmiles(smiles)
    fingerprint = rdMolDescriptors.GetMACCSKeysFingerprint(molecule)
    cDataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits
#structure embed_fn = np.nan_to_num(fngroups.values) embed_graph = graph.values #molecular fingerprint #https://www.rdkit.org/UGM/2012/Landrum_RDKit_UGM.Fingerprints.Final.pptx.pdf finger_mqn = [] finger_morgan = [] finger_maccs = [] finger_ap = [] for i in smiles: mol = AllChem.MolFromSmiles(i) finger_mqn.append(np.array(Descriptors.MQNs_(mol))) finger_maccs.append(np.array(Descriptors.GetMACCSKeysFingerprint((mol)))) #finger_morgan.append(np.array(Descriptors.GetMorganFingerprint((mol)))) finger_ap.append(np.array(Descriptors.GetAtomPairFingerprint((mol)))) ### names = 'vec_spec,vec_smiles,embed_fn,finger_mqn,finger_maccs,finger_ap,embed_graph'.split( ',') data = [ vec_spec, vec_smiles, embed_fn, finger_mqn, finger_maccs, finger_ap, embed_graph ] counter = 0 for i in data: try: res = do_pca(i) plt.scatter(res[0], res[1], label=counter, alpha=.4)