def has_carboxylate(SMILES):
    """Report whether RDKit finds a carboxylate fragment in a SMILES string.

    Args:
        SMILES: SMILES string, parsed with ``Chem.MolFromSmiles``.

    Returns:
        True when ``Fragments.fr_COO`` reports at least one match for the
        parsed molecule, False otherwise.
    """
    parsed = Chem.MolFromSmiles(SMILES)
    return Fragments.fr_COO(parsed) > 0
def recursive_dance(mols, check=None, thres=0.4):
    """Expand a set of molecules by repeatedly applying ``dance_atom``.

    Every molecule produced by ``dance_atom(mol)`` is recorded (as a SMILES
    string) in ``check`` when the ratio ``fr_Ar_N / number of aromatic atoms``
    is below ``thres``. As long as a pass adds new SMILES, the function
    recurses on the molecules rebuilt from everything collected so far.

    Args:
        mols: Iterable of RDKit molecules to start from.
        check: Set of SMILES strings collected so far. It is updated in
            place. When omitted, a fresh set is created for this call (the
            default is no longer a single set shared by every call).
        thres: Upper bound (exclusive) on the aromatic-nitrogen ratio.

    Returns:
        List of molecules rebuilt from every SMILES string in ``check``.

    Raises:
        ZeroDivisionError: If ``dance_atom`` yields a result for a molecule
            that has no aromatic atoms.
    """
    if check is None:
        check = set()

    before_n = len(check)
    print(before_n)

    for mol in mols:
        danced_mols = dance_atom(mol)
        for mol_conv in danced_mols:
            # NOTE(review): the ratio is computed on the parent ``mol``, not
            # on ``mol_conv``; kept as in the original - confirm intent.
            aro_n = Fragments.fr_Ar_N(mol)
            aro_a = len(mol.GetAromaticAtoms())
            ratio = float(aro_n) / float(aro_a)
            if ratio < thres:
                check.add(Chem.MolToSmiles(mol_conv))

    after_n = len(check)
    # Single formatted string so the output is identical on Python 2 and 3.
    print("{} {}".format(before_n, after_n))

    if before_n < after_n:
        next_mols = [Chem.MolFromSmiles(smi) for smi in check]
        # Propagate ``thres``; previously deeper levels fell back to 0.4.
        recursive_dance(next_mols, check=check, thres=thres)

    return [Chem.MolFromSmiles(smi) for smi in check]
def recursive_dance(mols, check=None, thres=0.4):
    """Expand a set of molecules by repeatedly applying ``dance_atom``.

    On each pass, every molecule produced by ``dance_atom(mol)`` is recorded
    (as a SMILES string) in ``check`` when the ratio
    ``fr_Ar_N / number of aromatic atoms`` is below ``thres``. Passes repeat
    on the molecules rebuilt from everything collected so far until a pass
    adds nothing new.

    Args:
        mols: Iterable of RDKit molecules to start from.
        check: Set of SMILES strings collected so far. It is updated in
            place. When omitted, a fresh set is created for this call rather
            than reusing one default set across calls.
        thres: Upper bound (exclusive) on the aromatic-nitrogen ratio; it
            applies to every pass.

    Returns:
        List of molecules rebuilt from every SMILES string in ``check``.

    Raises:
        ZeroDivisionError: If ``dance_atom`` yields a result for a molecule
            that has no aromatic atoms.
    """
    if check is None:
        check = set()

    # Iterative form of the original self-recursion: same passes and same
    # printed progress, without recursion depth limits.
    while True:
        before_n = len(check)
        print(before_n)

        for mol in mols:
            for mol_conv in dance_atom(mol):
                # NOTE(review): the ratio uses the parent ``mol`` rather than
                # ``mol_conv``; kept as in the original - confirm intent.
                aro_n = Fragments.fr_Ar_N(mol)
                aro_a = len(mol.GetAromaticAtoms())
                if float(aro_n) / float(aro_a) < thres:
                    check.add(Chem.MolToSmiles(mol_conv))

        after_n = len(check)
        # Single formatted string so the output is identical on Python 2 and 3.
        print("{} {}".format(before_n, after_n))

        if after_n <= before_n:
            break
        mols = [Chem.MolFromSmiles(smi) for smi in check]

    return [Chem.MolFromSmiles(smi) for smi in check]
def calc_rdkit(mol):
    """Evaluate a fixed battery of RDKit descriptors for one molecule.

    The descriptors are evaluated in the order of the table below and the
    results are packed into a NumPy array wrapped in a ``pandas.Series``
    with the default integer index (no descriptor names are attached).

    Args:
        mol: RDKit molecule passed unchanged to every descriptor function.

    Returns:
        pandas.Series holding one value per descriptor, in table order.
    """
    descriptor_table = (
        (Crippen, (
            "MolLogP", "MolMR",
        )),
        (Descriptors, (
            "FpDensityMorgan1", "FpDensityMorgan2", "FpDensityMorgan3",
            "FractionCSP3", "HeavyAtomMolWt", "MaxAbsPartialCharge",
            "MaxPartialCharge", "MinAbsPartialCharge", "MinPartialCharge",
            "MolWt", "NumRadicalElectrons", "NumValenceElectrons",
        )),
        (EState.EState, (
            "MaxAbsEStateIndex", "MaxEStateIndex", "MinAbsEStateIndex",
            "MinEStateIndex",
        )),
        (EState.EState_VSA, (
            "EState_VSA1", "EState_VSA10", "EState_VSA11", "EState_VSA2",
            "EState_VSA3", "EState_VSA4", "EState_VSA5", "EState_VSA6",
            "EState_VSA7", "EState_VSA8", "EState_VSA9",
        )),
        (Fragments, (
            "fr_Al_COO", "fr_Al_OH", "fr_Al_OH_noTert", "fr_aldehyde",
            "fr_alkyl_carbamate", "fr_alkyl_halide", "fr_allylic_oxid",
            "fr_amide", "fr_amidine", "fr_aniline", "fr_Ar_COO", "fr_Ar_N",
            "fr_Ar_NH", "fr_Ar_OH", "fr_ArN", "fr_aryl_methyl", "fr_azide",
            "fr_azo", "fr_barbitur", "fr_benzene", "fr_benzodiazepine",
            "fr_bicyclic", "fr_C_O", "fr_C_O_noCOO", "fr_C_S", "fr_COO",
            "fr_COO2", "fr_diazo", "fr_dihydropyridine", "fr_epoxide",
            "fr_ester", "fr_ether", "fr_furan", "fr_guanido", "fr_halogen",
            "fr_hdrzine", "fr_hdrzone", "fr_HOCCN", "fr_imidazole",
            "fr_imide", "fr_Imine", "fr_isocyan", "fr_isothiocyan",
            "fr_ketone", "fr_ketone_Topliss", "fr_lactam", "fr_lactone",
            "fr_methoxy", "fr_morpholine", "fr_N_O", "fr_Ndealkylation1",
            "fr_Ndealkylation2", "fr_NH0", "fr_NH1", "fr_NH2",
            "fr_Nhpyrrole", "fr_nitrile", "fr_nitro", "fr_nitro_arom",
            "fr_nitro_arom_nonortho", "fr_nitroso", "fr_oxazole", "fr_oxime",
            "fr_para_hydroxylation", "fr_phenol", "fr_phenol_noOrthoHbond",
            "fr_phos_acid", "fr_phos_ester", "fr_piperdine", "fr_piperzine",
            "fr_priamide", "fr_prisulfonamd", "fr_pyridine", "fr_quatN",
            "fr_SH", "fr_sulfide", "fr_sulfonamd", "fr_sulfone",
            "fr_term_acetylene", "fr_tetrazole", "fr_thiazole",
            "fr_thiocyan", "fr_thiophene", "fr_unbrch_alkane", "fr_urea",
        )),
        (GraphDescriptors, (
            "BalabanJ", "BertzCT", "Chi0", "Chi0n", "Chi0v", "Chi1", "Chi1n",
            "Chi1v", "Chi2n", "Chi2v", "Chi3n", "Chi3v", "Chi4n", "Chi4v",
            "HallKierAlpha", "Ipc", "Kappa1", "Kappa2", "Kappa3",
        )),
        (Lipinski, (
            "HeavyAtomCount", "NHOHCount", "NOCount",
            "NumAliphaticCarbocycles", "NumAliphaticHeterocycles",
            "NumAliphaticRings", "NumAromaticCarbocycles",
            "NumAromaticHeterocycles", "NumAromaticRings", "NumHAcceptors",
            "NumHDonors", "NumHeteroatoms", "NumRotatableBonds",
            "NumSaturatedCarbocycles", "NumSaturatedHeterocycles",
            "NumSaturatedRings", "RingCount",
        )),
        (MolSurf, (
            "LabuteASA",
            "PEOE_VSA1", "PEOE_VSA10", "PEOE_VSA11", "PEOE_VSA12",
            "PEOE_VSA13", "PEOE_VSA14", "PEOE_VSA2", "PEOE_VSA3",
            "PEOE_VSA4", "PEOE_VSA5", "PEOE_VSA6", "PEOE_VSA7", "PEOE_VSA8",
            "PEOE_VSA9",
            "SlogP_VSA1", "SlogP_VSA10", "SlogP_VSA11", "SlogP_VSA12",
            "SlogP_VSA2", "SlogP_VSA3", "SlogP_VSA4", "SlogP_VSA5",
            "SlogP_VSA6", "SlogP_VSA7", "SlogP_VSA8", "SlogP_VSA9",
            "SMR_VSA1", "SMR_VSA10", "SMR_VSA2", "SMR_VSA3", "SMR_VSA4",
            "SMR_VSA5", "SMR_VSA6", "SMR_VSA7", "SMR_VSA8", "SMR_VSA9",
            "TPSA",
        )),
    )

    values = []
    for module, names in descriptor_table:
        for name in names:
            values.append(getattr(module, name)(mol))

    return pd.Series(np.array(values))
def COOH(mol):
    """Return the ``Fragments.fr_Ar_COO`` count for ``mol``.

    NOTE(review): only the aromatic carboxylic-acid fragment is counted
    here, although the function name is generic - confirm this is intended.
    """
    count = Fragments.fr_Ar_COO(mol)
    return count
def NO2(mol):
    """Return the ``Fragments.fr_nitro`` count for ``mol``."""
    count = Fragments.fr_nitro(mol)
    return count
def SR(mol):
    """Return the ``Fragments.fr_sulfide`` count for ``mol``."""
    count = Fragments.fr_sulfide(mol)
    return count
def SH(mol):
    """Return the ``Fragments.fr_SH`` count for ``mol``."""
    count = Fragments.fr_SH(mol)
    return count
def OH(mol):
    """Return the combined hydroxyl count for ``mol``.

    The result is the sum of ``Fragments.fr_Ar_OH`` and
    ``Fragments.fr_Al_OH`` for the molecule.
    """
    return Fragments.fr_Ar_OH(mol) + Fragments.fr_Al_OH(mol)
def NH2(mol):
    """Return the ``Fragments.fr_NH2`` count for ``mol``."""
    count = Fragments.fr_NH2(mol)
    return count
def _write_feature_file(in_path, out_path, include_target):
    """Convert one raw molecule CSV into a feature CSV.

    The first line of ``in_path`` is skipped as a header. Each remaining line
    is split on commas; the first eight fields are copied through, the SMILES
    string is read from field 10 and, when ``include_target`` is true, the
    target is read from field 11.

    Args:
        in_path: Path of the raw input CSV.
        out_path: Path of the feature CSV to create (overwritten).
        include_target: Whether to emit the trailing ``target`` column.
    """
    header = [
        'index', 'Maximum Degree', 'Minimum Degree', 'Molecular Weight',
        'Number of H-Bond Donors', 'Number of Rings',
        'Number of Rotatable Bonds', 'Polar Surface Area', 'fr_phos',
        'aromatic_carbocycles', 'MolLogP', 'PEOE_VSA1', 'Fingerprint',
        'smiles',
    ]
    if include_target:
        header.append('target')

    # Both handles are context-managed so the input file is closed even when
    # a row fails to process (it previously leaked on any exception).
    # NOTE(review): on Python 3 the csv docs recommend opening the output
    # with newline=''; left unchanged because the target Python version of
    # this file is not visible here.
    with open(in_path, 'r') as infile, open(out_path, 'w') as outfile:
        infile.readline()  # skip the header row of the raw file
        writer = csv.writer(outfile)
        writer.writerow(header)

        for raw_line in infile:
            fields = raw_line.strip('\n\r ').split(",")
            smiles = fields[10].strip()
            mol = Chem.MolFromSmiles(smiles)

            fingerprint_bit_string = GetMorganFingerprintAsBitVect(
                mol, 2).ToBitString()
            fr_phos = (Fragments.fr_phos_acid(mol)
                       + Fragments.fr_phos_ester(mol))
            aromatic_cc = Lipinski.NumAromaticCarbocycles(mol)
            mol_log_p = Crippen.MolLogP(mol)
            peoe_vsa1 = MolSurf.PEOE_VSA1(mol)

            row = fields[:8] + [
                fr_phos, aromatic_cc, mol_log_p, peoe_vsa1,
                fingerprint_bit_string, fields[10],
            ]
            if include_target:
                row.append(fields[11])
            writer.writerow(row)


def main():
    """Build the train and test feature CSV files.

    Reads ``molecule_training.csv`` and ``molecule_TestFeatures.csv`` from
    the working directory and writes ``train_molecule_new_features.csv``
    (with a ``target`` column) and ``test_molecule_new_features.csv``
    (without one).
    """
    _write_feature_file(
        "molecule_training.csv",
        'train_molecule_new_features.csv',
        include_target=True,
    )
    _write_feature_file(
        "molecule_TestFeatures.csv",
        'test_molecule_new_features.csv',
        include_target=False,
    )
def makeFeatures(fileName):
    """Write per-molecule feature records for FKBP12 binders to ``fileName``.

    Molecules are read from ``FKBP12_binders.sdf``. The molecule at position
    ``i`` is paired with ``gaussian_files/drug_<i>.log``, which is parsed with
    ``gaussian.parseGaussianLog``. A record is written for a molecule only
    when the log reports ``converged == "True"``, the molecule's
    "BindingDB Target Chain Sequence" equals the expected sequence, and its
    "BindingDB Ligand Name" has not already been written.

    Each record is one value per line (volume, Labute ASA, atom/bond/heavy
    atom counts, the multipole and solvation values from the log, then the
    fragment counts), followed by a ``KI:`` line and a blank separator line.

    Side effects: rebinds the module-level ``featuresFile`` to the output
    file object (closed on return) and prints two summary lines.

    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; in particular ``convergedCount`` is assumed
    to count every converged molecule with the expected sequence, including
    duplicates - confirm against the original file.

    Args:
        fileName: Path of the features file to create (overwritten).
    """
    from rdkit import Chem
    from rdkit.Chem import Fragments
    from rdkit.Chem import AllChem
    from rdkit.Chem import MolSurf

    # ``numFeatures`` is declared global as in the original although this
    # function never assigns it.
    global featuresFile, numFeatures

    target_sequence = (
        "MGVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQ"
        "MSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE"
    )

    featuresFile = open(fileName, 'w')  # Molecule features output file
    try:
        # open database file
        drugDB = Chem.SDMolSupplier("FKBP12_binders.sdf")

        if debug:
            print("\n\tNo features data file found. Writing new features data file.\n")

        convergedCount = 0
        converged_and_different = 0
        seen_ligand_names = set()  # ligand names already written

        # load fragment descriptor
        Fragments._LoadPatterns(fileName='/usr/local/anaconda/pkgs/rdkit-2015.03.1-np19py27_0/share/RDKit/Data/FragmentDescriptors.csv')

        # Fragment counters, in output order.
        fragment_counters = (
            Fragments.fr_Al_OH,       # aliphatic alcohols
            Fragments.fr_Ar_OH,       # aromatic alcohols
            Fragments.fr_ketone,      # number of ketones
            Fragments.fr_ether,       # number of ether oxygens
            Fragments.fr_ester,       # number of esters
            Fragments.fr_aldehyde,    # number of aldehydes
            Fragments.fr_COO,         # number of carboxylic acids
            Fragments.fr_benzene,     # number of benzenes
            Fragments.fr_Ar_N,        # number of aromatic nitrogens
            Fragments.fr_NH0,         # number of tertiary amines
            Fragments.fr_NH1,         # number of secondary amines
            Fragments.fr_NH2,         # number of primary amines
            Fragments.fr_amide,       # number of amides
            Fragments.fr_SH,          # number of thiol groups
            Fragments.fr_nitro,       # number of nitro groups
            Fragments.fr_furan,       # number of furan rings
            Fragments.fr_imidazole,   # number of imidazole rings
            Fragments.fr_oxazole,     # number of oxazole rings
            Fragments.fr_morpholine,  # number of morpholine rings
            Fragments.fr_halogen,     # number of halogens
        )

        # Select features of interest
        for molCount, mol in enumerate(drugDB):
            gaussian_log_file = "gaussian_files/drug_" + str(molCount) + ".log"
            (converged, dipole, quadrupole, octapole, hexadecapole,
             dg_solv) = gaussian.parseGaussianLog(gaussian_log_file)

            if converged != "True":
                continue
            if mol.GetProp("BindingDB Target Chain Sequence") != target_sequence:
                continue
            convergedCount += 1

            ligand_name = mol.GetProp("BindingDB Ligand Name")
            if ligand_name in seen_ligand_names:
                continue  # duplicate ligand: counted above, not written
            seen_ligand_names.add(ligand_name)

            values = [
                AllChem.ComputeMolVolume(mol),
                MolSurf.pyLabuteASA(mol),
                mol.GetNumAtoms(),
                mol.GetNumBonds(),
                mol.GetNumHeavyAtoms(),
                dipole,
                quadrupole,
                octapole,
                hexadecapole,
                dg_solv,
            ]
            values.extend(counter(mol) for counter in fragment_counters)

            record = "".join("{}\n".format(value) for value in values)
            record += "\nKI: {}\n".format(mol.GetProp("Ki (nM)"))
            record += "\n"  # Use a blank line to divide molecule data
            featuresFile.write(record)
            converged_and_different += 1

        # Formatted into single strings so the output is the same on
        # Python 2 and Python 3.
        print("Number of molecules with converged gaussian log files and correct sequence: {} \n".format(convergedCount))
        print("Number of overlap drugs: {}".format(convergedCount - converged_and_different))
    finally:
        # Close the output even if parsing or a property lookup fails.
        featuresFile.close()
def smiles_to_all_labels(df):
    """Append fragment, Lipinski and size descriptors to a copy of ``df``.

    Every function found by ``inspect.getmembers(..., inspect.isfunction)``
    in the modules bound to ``f`` and ``l`` is applied to each molecule
    parsed from ``df['SMILES']`` and stored in a column named after the
    function. Functions whose name starts with ``_`` are skipped, as are
    functions whose name matches a column of ``df`` that holds a single
    distinct value ("bad features"). The columns ``fr_Al_COO``,
    ``HeavyAtomCount``, ``GetNumAtoms`` and ``CalcExactMolWt`` are always
    (re)computed at the end.

    Args:
        df: DataFrame with a ``SMILES`` column. It is not modified.

    Returns:
        A copy of ``df`` with the descriptor columns added.
    """
    smilesList = df['SMILES']
    feature_df = df.copy()

    # get all functions of modules
    all_lipinski = inspect.getmembers(l, inspect.isfunction)
    all_fragments = inspect.getmembers(f, inspect.isfunction)

    # bad features have the same value for all our compounds
    # (``items`` replaces ``iteritems``, which pandas 2.0 removed).
    bad_features = {
        column_name
        for column_name, column_data in df.items()
        if len(set(column_data.values)) == 1
    }

    # Parse each SMILES once instead of once per descriptor column.
    molecules = [chem.MolFromSmiles(smiles) for smiles in smilesList]

    # add fragment features
    for name, func in all_fragments:
        if name.startswith('_') or name in bad_features:
            continue
        feature_df[name] = [func(molecule) for molecule in molecules]
    print('fragments over')

    # add lipinski features
    for name, func in all_lipinski:
        # The bad-feature test uses the Lipinski function's own name; it
        # previously looked up the fragment list at the same position.
        if name.startswith('_') or name in bad_features:
            continue
        feature_df[name] = [func(molecule) for molecule in molecules]
    print('lipinski over')

    feature_df["fr_Al_COO"] = [
        f.fr_Al_COO(molecule) for molecule in molecules
    ]
    feature_df["HeavyAtomCount"] = [
        l.HeavyAtomCount(molecule) for molecule in molecules
    ]
    # add getnumatoms as feature
    feature_df["GetNumAtoms"] = [
        molecule.GetNumAtoms() for molecule in molecules
    ]
    # add CalcExactMolWt as feature
    feature_df["CalcExactMolWt"] = [
        molDesc.CalcExactMolWt(molecule) for molecule in molecules
    ]
    return feature_df
def create_features(data, types="train"):
    """Build descriptor, Morgan fingerprint and mol2vec feature tables.

    Args:
        data: DataFrame with a ``SMILES`` column and, for ``types="train"``,
            an ``ACTIVE`` column convertible to int. It is not modified.
        types: ``"train"`` to also extract labels, ``"test"`` for no labels.

    Returns:
        dict with keys
        ``"Morgan"`` (512-bit Morgan fingerprint DataFrame, columns
        ``morgan_<i>``), ``"Despcritor"`` (atom counts, exact mass,
        ``fr_Al_COO`` and bond-type counts), ``"molvec"`` (mol2vec vectors,
        columns ``m2v_<i>``) and ``'y'`` (label array, or None for test).

    Raises:
        ValueError: If ``types`` is neither ``"train"`` nor ``"test"``.
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        # Previously ``y`` stayed unbound and the function failed only at the
        # very end, after all the expensive work.
        raise ValueError(
            "types must be 'train' or 'test', got {!r}".format(types))

    # Work on a copy so the column assignments below never write into a
    # slice of the caller's frame.
    data = data[["SMILES"]].copy()
    data["SMILES_str"] = data["SMILES"]
    data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
    data["NumAtoms"] = data["SMILES"].apply(lambda x: x.GetNumAtoms())
    data["ExactMolWt"] = data["SMILES"].apply(lambda x: d.CalcExactMolWt(x))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda x: f.fr_Al_COO(x))
    # to have the hydrogens explicitly present
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda x: Chem.AddHs(x).GetNumAtoms())

    # One space-separated "document" of bond-type names per molecule.
    bond_docs = [
        " ".join(str(bond.GetBondType()) for bond in mol.GetBonds())
        for mol in data["SMILES"]
    ]
    vec = CountVectorizer().fit(bond_docs)
    # convert to a plain dense matrix
    bond_counts = vec.transform(bond_docs).toarray()
    vocabulary = vec.vocabulary_
    # ``vocabulary_`` maps term -> column index; order the labels by that
    # index so every column gets its own term (dict order is not guaranteed
    # to match the column order).
    bond_columns = sorted(vocabulary, key=vocabulary.get)
    # Reuse ``data``'s index so the concat below aligns row by row even when
    # the caller's frame does not have a default 0..n-1 index.
    bond_counts = pd.DataFrame(
        bond_counts, columns=bond_columns, index=data.index)
    data = pd.concat([data, bond_counts], axis=1)

    # A bond type that never occurs in this data set has no column yet;
    # ``reindex`` adds it filled with 0 instead of raising KeyError.
    traindata = data.reindex(
        columns=[
            'NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms', 'double',
            'single', 'aromatic', 'triple'
        ],
        fill_value=0,
    )

    finger = pd.DataFrame([
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=512))
        for x in data["SMILES"]
    ])
    finger.columns = ["morgan_" + str(x) for x in finger.columns]

    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
    sentence_vectors = [
        DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = pd.DataFrame(np.array([x.vec for x in sentence_vectors]))
    m2v.columns = ["m2v_" + str(x) for x in m2v.columns]

    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }
    return datadict