def get_descriptors(df):
    """Compute a table of RDKit descriptors for the molecules in ``df``.

    PandasTools molecule rendering is switched to the ``'String'`` renderer
    before the molecules are read.

    Args:
        df: DataFrame whose ``'ROMol'`` column holds the molecules to
            describe.

    Returns:
        A DataFrame sharing ``df``'s index, one row per molecule, with the
        columns MW, LogP, TPSA, LabuteASA, HBA, HBD, FCSP3, MQN8, MQN10,
        NAR and NRB.  MW, LogP, TPSA and LabuteASA are rounded to one
        decimal place.  An empty ``df`` yields an empty table that still
        carries these columns.
    """
    columns = [
        'MW', 'LogP', 'TPSA', 'LabuteASA', 'HBA', 'HBD', 'FCSP3', 'MQN8',
        'MQN10', 'NAR', 'NRB'
    ]
    PandasTools.ChangeMoleculeRendering(renderer='String')

    rows = []
    for m in df['ROMol']:
        # Compute the chemical properties.  The MQN vector is evaluated
        # once and indexed twice instead of being recomputed per entry.
        mqns = rdMolDescriptors.MQNs_(m)
        rows.append([
            round(Descriptors.ExactMolWt(m), 1),
            round(Descriptors.MolLogP(m), 1),
            round(Descriptors.TPSA(m), 1),
            round(Descriptors.LabuteASA(m), 1),
            Descriptors.NumHAcceptors(m),
            Descriptors.NumHDonors(m),
            Lipinski.FractionCSP3(m),
            mqns[7],
            mqns[9],
            Lipinski.NumAromaticRings(m),
            Descriptors.NumRotatableBonds(m),
        ])

    # Naming the columns in the constructor (rather than assigning them
    # afterwards) keeps an empty input from failing with a length mismatch.
    return pd.DataFrame(rows, columns=columns, index=df.index)
def get_descriptors(mol, write=False):
    """Return 20 ring, count and charge descriptors of ``mol`` as a list.

    Args:
        mol: Molecule passed to each ``Lipinski`` / ``Descriptors`` function.
        write: Accepted for interface compatibility; this function never
            reads it.

    Returns:
        A list of 20 values in the order of ``calculators`` below, with any
        NaN result replaced by 0.
    """
    calculators = (
        Lipinski.NumAromaticHeterocycles,
        Lipinski.NumAromaticRings,
        Lipinski.NumHDonors,
        Lipinski.RingCount,
        Lipinski.NHOHCount,
        Lipinski.NumHeteroatoms,
        Lipinski.NumAliphaticCarbocycles,
        Lipinski.NumSaturatedCarbocycles,
        Lipinski.NumAliphaticHeterocycles,
        Lipinski.NumHAcceptors,
        Lipinski.NumSaturatedHeterocycles,
        Lipinski.NumAliphaticRings,
        Descriptors.NumRadicalElectrons,
        Descriptors.MaxPartialCharge,
        Descriptors.NumValenceElectrons,
        Lipinski.FractionCSP3,
        Descriptors.MaxAbsPartialCharge,
        Lipinski.NumAromaticCarbocycles,
        Lipinski.NumSaturatedRings,
        Lipinski.NumRotatableBonds,
    )
    values = [calculate(mol) for calculate in calculators]
    # A float NaN compares unequal to itself, which is what flags it here.
    return [0 if value != value else value for value in values]
def mole_proper(mol):
    """Return seven basic properties of ``mol`` as a list.

    The order is: H-bond donors, H-bond acceptors, rotatable bonds,
    aromatic rings, molecular weight (``Descriptors.MolWt``), Crippen logP
    and TPSA.
    """
    return [
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumRotatableBonds(mol),
        Lipinski.NumAromaticRings(mol),
        Descriptors.MolWt(mol),
        Crippen.MolLogP(mol),
        Descriptors.TPSA(mol),
    ]
def properties(fnames, labels, is_active=False):
    """Calculate six structural properties for each molecule in each file.

    The properties are the numbers of hydrogen bond acceptors, hydrogen
    bond donors, rotatable bonds, aliphatic rings, aromatic rings and
    heterocycles.

    Each file is read with ``pd.read_table`` and de-duplicated on its
    ``CANONICAL_SMILES`` column; if more than 1e5 molecules remain, a random
    sample of 1e5 is used.  SMILES that RDKit cannot parse are skipped.

    Arguments:
        fnames (list): the file paths of the molecule tables.
        labels (list): the label for each file in ``fnames``.
        is_active (bool, optional): if True, keep only molecules with
            SCORE > 0.5 (when a SCORE column exists) or otherwise
            PCHEMBL_VALUE >= 6.5 (when that column exists).  If False the
            thresholds are SCORE > 0 and PCHEMBL_VALUE >= 0, so rows with a
            SCORE of 0 or below are still dropped. (Default: False)

    Returns:
        df (DataFrame): a table with three columns; 'Set' is the label of
            the file the molecule belongs to, 'Property' is the name of one
            of the six properties, 'Number' is the property value.
    """
    max_molecules = int(1e5)
    props = []
    for i, fname in enumerate(fnames):
        df = pd.read_table(fname)
        if 'SCORE' in df.columns:
            df = df[df.SCORE > (0.5 if is_active else 0)]
        elif 'PCHEMBL_VALUE' in df.columns:
            df = df[df.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        df = df.drop_duplicates(subset='CANONICAL_SMILES')
        if len(df) > max_molecules:
            df = df.sample(max_molecules)
        for smile in tqdm(df.CANONICAL_SMILES):
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                # MolFromSmiles returns None for SMILES it cannot parse;
                # skip those instead of crashing in the descriptor calls.
                continue
            label = labels[i]
            props.extend([
                [label, 'Hydrogen Bond\nAcceptor',
                 Lipinski.NumHAcceptors(mol)],
                [label, 'Hydrogen\nBond Donor', Lipinski.NumHDonors(mol)],
                [label, 'Rotatable\nBond', Lipinski.NumRotatableBonds(mol)],
                [label, 'Aliphatic\nRing',
                 AllChem.CalcNumAliphaticRings(mol)],
                [label, 'Aromatic\nRing', Lipinski.NumAromaticRings(mol)],
                [label, 'Heterocycle', AllChem.CalcNumHeterocycles(mol)],
            ])
    return pd.DataFrame(props, columns=['Set', 'Property', 'Number'])
def extract(x, from_smiles):
    """Compute a fixed-length descriptor vector for one molecule.

    ``include_3D`` is not a parameter: it is read from the surrounding
    scope and selects whether five 3D shape descriptors are appended.

    Args:
        x: A SMILES string when ``from_smiles`` is true, otherwise a
            molecule object (or ``None``).
        from_smiles: Whether ``x`` must first be parsed with
            ``Chem.MolFromSmiles``.

    Returns:
        A list of 24 values (logP, molar refractivity, three weights, atom,
        bond and ring counts, TPSA), followed by eccentricity, asphericity,
        spherocity index, inertial shape factor and radius of gyration when
        ``include_3D`` is true (29 values).  A missing or atom-less molecule
        yields all zeros; if no 3D conformer can be embedded, only the five
        shape descriptors are zero.
    """
    n_2d, n_3d = 24, 5
    mol = Chem.MolFromSmiles(x) if from_smiles else x

    if (mol is None) or (len(mol.GetAtoms()) == 0):
        return [0] * (n_2d + n_3d if include_3D else n_2d)

    features = [
        Crippen.MolLogP(mol),                    # logP
        Crippen.MolMR(mol),                      # molar refractivity
        Descriptors.MolWt(mol),                  # average weight
        Descriptors.ExactMolWt(mol),             # exact weight
        Descriptors.HeavyAtomMolWt(mol),         # heavy-atom weight
        Lipinski.HeavyAtomCount(mol),
        Lipinski.NHOHCount(mol),
        Lipinski.NOCount(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHeteroatoms(mol),
        Lipinski.NumRotatableBonds(mol),
        Descriptors.NumValenceElectrons(mol),
        rdMolDescriptors.CalcNumAmideBonds(mol),
        Lipinski.NumAliphaticRings(mol),
        Lipinski.NumAromaticRings(mol),
        Lipinski.NumSaturatedRings(mol),
        Lipinski.NumAliphaticCarbocycles(mol),
        Lipinski.NumAliphaticHeterocycles(mol),
        Lipinski.NumAromaticCarbocycles(mol),
        Lipinski.NumAromaticHeterocycles(mol),
        Lipinski.NumSaturatedCarbocycles(mol),
        Lipinski.NumSaturatedHeterocycles(mol),
        rdMolDescriptors.CalcTPSA(mol),
    ]

    if include_3D:
        mol_3D = Chem.AddHs(mol)
        # EmbedMolecule returns -1 when no conformer could be generated;
        # optimising or measuring a conformer-less molecule would raise.
        if AllChem.EmbedMolecule(mol_3D) < 0:
            features.extend([0] * n_3d)
        else:
            AllChem.MMFFOptimizeMolecule(mol_3D)
            features.extend([
                rdMolDescriptors.CalcEccentricity(mol_3D),
                rdMolDescriptors.CalcAsphericity(mol_3D),
                rdMolDescriptors.CalcSpherocityIndex(mol_3D),
                rdMolDescriptors.CalcInertialShapeFactor(mol_3D),
                rdMolDescriptors.CalcRadiusOfGyration(mol_3D),
            ])

    return features
## Bertz topological complexity (with a progress bar)
org_tcs = list(map(BertzCT, tqdm(org_mols)))
## topological polar surface area
org_tpsa = list(map(Descriptors.TPSA, org_mols))
## QED; a molecule whose QED computation overflows is left out, so
## org_qed can be shorter than org_mols
org_qed = []
for mol in org_mols:
    try:
        org_qed.append(Descriptors.qed(mol))
    except OverflowError:
        continue
## ring counts: all rings, aliphatic rings, aromatic rings
org_rings1 = list(map(Lipinski.RingCount, tqdm(org_mols)))
org_rings2 = list(map(Lipinski.NumAliphaticRings, tqdm(org_mols)))
org_rings3 = list(map(Lipinski.NumAromaticRings, tqdm(org_mols)))
## synthetic accessibility score; molecules that raise OverflowError or
## ZeroDivisionError are left out in the same way as for QED
org_SA = []
for mol in tqdm(org_mols):
    try:
        org_SA.append(sascorer.calculateScore(mol))
    except (OverflowError, ZeroDivisionError):
        continue
## NP-likeness, scored against the model loaded by readNPModel()
fscore = npscorer.readNPModel()
org_NP = [npscorer.scoreMol(m, fscore) for m in tqdm(org_mols)]
## fraction of sp3 carbons
org_sp3 = list(map(Lipinski.FractionCSP3, org_mols))
## rotatable-bond share, as computed by pct_rotatable_bonds
org_rot = list(map(pct_rotatable_bonds, org_mols))
def calc_rdkit(mol):
    """Evaluate a fixed panel of RDKit descriptors on ``mol``.

    Every function in ``descriptor_fns`` is called with ``mol``, in the
    order listed; the results are packed into a NumPy array and returned
    as a pandas Series with the default positional index.
    """
    descriptor_fns = (
        # Crippen logP and molar refractivity
        Crippen.MolLogP,
        Crippen.MolMR,
        # general descriptors
        Descriptors.FpDensityMorgan1,
        Descriptors.FpDensityMorgan2,
        Descriptors.FpDensityMorgan3,
        Descriptors.FractionCSP3,
        Descriptors.HeavyAtomMolWt,
        Descriptors.MaxAbsPartialCharge,
        Descriptors.MaxPartialCharge,
        Descriptors.MinAbsPartialCharge,
        Descriptors.MinPartialCharge,
        Descriptors.MolWt,
        Descriptors.NumRadicalElectrons,
        Descriptors.NumValenceElectrons,
        # EState indices
        EState.EState.MaxAbsEStateIndex,
        EState.EState.MaxEStateIndex,
        EState.EState.MinAbsEStateIndex,
        EState.EState.MinEStateIndex,
        # EState VSA bins
        EState.EState_VSA.EState_VSA1,
        EState.EState_VSA.EState_VSA10,
        EState.EState_VSA.EState_VSA11,
        EState.EState_VSA.EState_VSA2,
        EState.EState_VSA.EState_VSA3,
        EState.EState_VSA.EState_VSA4,
        EState.EState_VSA.EState_VSA5,
        EState.EState_VSA.EState_VSA6,
        EState.EState_VSA.EState_VSA7,
        EState.EState_VSA.EState_VSA8,
        EState.EState_VSA.EState_VSA9,
        # fragment counts
        Fragments.fr_Al_COO,
        Fragments.fr_Al_OH,
        Fragments.fr_Al_OH_noTert,
        Fragments.fr_aldehyde,
        Fragments.fr_alkyl_carbamate,
        Fragments.fr_alkyl_halide,
        Fragments.fr_allylic_oxid,
        Fragments.fr_amide,
        Fragments.fr_amidine,
        Fragments.fr_aniline,
        Fragments.fr_Ar_COO,
        Fragments.fr_Ar_N,
        Fragments.fr_Ar_NH,
        Fragments.fr_Ar_OH,
        Fragments.fr_ArN,
        Fragments.fr_aryl_methyl,
        Fragments.fr_azide,
        Fragments.fr_azo,
        Fragments.fr_barbitur,
        Fragments.fr_benzene,
        Fragments.fr_benzodiazepine,
        Fragments.fr_bicyclic,
        Fragments.fr_C_O,
        Fragments.fr_C_O_noCOO,
        Fragments.fr_C_S,
        Fragments.fr_COO,
        Fragments.fr_COO2,
        Fragments.fr_diazo,
        Fragments.fr_dihydropyridine,
        Fragments.fr_epoxide,
        Fragments.fr_ester,
        Fragments.fr_ether,
        Fragments.fr_furan,
        Fragments.fr_guanido,
        Fragments.fr_halogen,
        Fragments.fr_hdrzine,
        Fragments.fr_hdrzone,
        Fragments.fr_HOCCN,
        Fragments.fr_imidazole,
        Fragments.fr_imide,
        Fragments.fr_Imine,
        Fragments.fr_isocyan,
        Fragments.fr_isothiocyan,
        Fragments.fr_ketone,
        Fragments.fr_ketone_Topliss,
        Fragments.fr_lactam,
        Fragments.fr_lactone,
        Fragments.fr_methoxy,
        Fragments.fr_morpholine,
        Fragments.fr_N_O,
        Fragments.fr_Ndealkylation1,
        Fragments.fr_Ndealkylation2,
        Fragments.fr_NH0,
        Fragments.fr_NH1,
        Fragments.fr_NH2,
        Fragments.fr_Nhpyrrole,
        Fragments.fr_nitrile,
        Fragments.fr_nitro,
        Fragments.fr_nitro_arom,
        Fragments.fr_nitro_arom_nonortho,
        Fragments.fr_nitroso,
        Fragments.fr_oxazole,
        Fragments.fr_oxime,
        Fragments.fr_para_hydroxylation,
        Fragments.fr_phenol,
        Fragments.fr_phenol_noOrthoHbond,
        Fragments.fr_phos_acid,
        Fragments.fr_phos_ester,
        Fragments.fr_piperdine,
        Fragments.fr_piperzine,
        Fragments.fr_priamide,
        Fragments.fr_prisulfonamd,
        Fragments.fr_pyridine,
        Fragments.fr_quatN,
        Fragments.fr_SH,
        Fragments.fr_sulfide,
        Fragments.fr_sulfonamd,
        Fragments.fr_sulfone,
        Fragments.fr_term_acetylene,
        Fragments.fr_tetrazole,
        Fragments.fr_thiazole,
        Fragments.fr_thiocyan,
        Fragments.fr_thiophene,
        Fragments.fr_unbrch_alkane,
        Fragments.fr_urea,
        # graph / topological descriptors
        GraphDescriptors.BalabanJ,
        GraphDescriptors.BertzCT,
        GraphDescriptors.Chi0,
        GraphDescriptors.Chi0n,
        GraphDescriptors.Chi0v,
        GraphDescriptors.Chi1,
        GraphDescriptors.Chi1n,
        GraphDescriptors.Chi1v,
        GraphDescriptors.Chi2n,
        GraphDescriptors.Chi2v,
        GraphDescriptors.Chi3n,
        GraphDescriptors.Chi3v,
        GraphDescriptors.Chi4n,
        GraphDescriptors.Chi4v,
        GraphDescriptors.HallKierAlpha,
        GraphDescriptors.Ipc,
        GraphDescriptors.Kappa1,
        GraphDescriptors.Kappa2,
        GraphDescriptors.Kappa3,
        # Lipinski-style counts
        Lipinski.HeavyAtomCount,
        Lipinski.NHOHCount,
        Lipinski.NOCount,
        Lipinski.NumAliphaticCarbocycles,
        Lipinski.NumAliphaticHeterocycles,
        Lipinski.NumAliphaticRings,
        Lipinski.NumAromaticCarbocycles,
        Lipinski.NumAromaticHeterocycles,
        Lipinski.NumAromaticRings,
        Lipinski.NumHAcceptors,
        Lipinski.NumHDonors,
        Lipinski.NumHeteroatoms,
        Lipinski.NumRotatableBonds,
        Lipinski.NumSaturatedCarbocycles,
        Lipinski.NumSaturatedHeterocycles,
        Lipinski.NumSaturatedRings,
        Lipinski.RingCount,
        # surface-area descriptors
        MolSurf.LabuteASA,
        MolSurf.PEOE_VSA1,
        MolSurf.PEOE_VSA10,
        MolSurf.PEOE_VSA11,
        MolSurf.PEOE_VSA12,
        MolSurf.PEOE_VSA13,
        MolSurf.PEOE_VSA14,
        MolSurf.PEOE_VSA2,
        MolSurf.PEOE_VSA3,
        MolSurf.PEOE_VSA4,
        MolSurf.PEOE_VSA5,
        MolSurf.PEOE_VSA6,
        MolSurf.PEOE_VSA7,
        MolSurf.PEOE_VSA8,
        MolSurf.PEOE_VSA9,
        MolSurf.SlogP_VSA1,
        MolSurf.SlogP_VSA10,
        MolSurf.SlogP_VSA11,
        MolSurf.SlogP_VSA12,
        MolSurf.SlogP_VSA2,
        MolSurf.SlogP_VSA3,
        MolSurf.SlogP_VSA4,
        MolSurf.SlogP_VSA5,
        MolSurf.SlogP_VSA6,
        MolSurf.SlogP_VSA7,
        MolSurf.SlogP_VSA8,
        MolSurf.SlogP_VSA9,
        MolSurf.SMR_VSA1,
        MolSurf.SMR_VSA10,
        MolSurf.SMR_VSA2,
        MolSurf.SMR_VSA3,
        MolSurf.SMR_VSA4,
        MolSurf.SMR_VSA5,
        MolSurf.SMR_VSA6,
        MolSurf.SMR_VSA7,
        MolSurf.SMR_VSA8,
        MolSurf.SMR_VSA9,
        MolSurf.TPSA,
    )
    values = [compute(mol) for compute in descriptor_fns]
    return pd.Series(np.array(values))
def main():
    """Command-line entry point: write descriptors of a SMILES file to CSV.

    Each molecule yielded by ``RobustSmilesMolSupplier`` is written as one
    CSV row: its name followed by MolW, HA, cLogP, AR, MR, TPSA, RotB, HBA,
    HBD and FC.  Molecules that come back as ``None`` are counted as errors
    and skipped.  Molecules flagged by ``is_alien`` are reported on stderr;
    they are still written unless ``--remove-aliens`` is given.  A summary
    of the counts is printed to stderr at the end.

    Exits with status 1 after printing the help when called without any
    argument; ``-i`` and ``-o`` are both required.
    """
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description="Project molecules read from a SMILES file into a 10D "
                    "space whose dimensions are molecular descriptors: "
                    "(MolW, HA, cLogP, AR, MR, TPSA, RotB, HBA, HBD, FC)")
    # Both paths are mandatory: without them the run would only fail later
    # with an unhelpful TypeError when opening ``None``.
    parser.add_argument("-i", metavar="input_smi", dest="input_smi",
                        required=True, help="input SMILES file")
    parser.add_argument("-o", metavar="output_csv", dest="output_csv",
                        required=True, help="output CSV file")
    parser.add_argument('--no-header', dest='no_header',
                        action='store_true', default=False,
                        help="no CSV header in output file")
    # just warn about aliens by default
    parser.add_argument('--remove-aliens', dest='rm_aliens',
                        action='store_true', default=False,
                        help="don't allow aliens in output file")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_smi = args.input_smi
    output_csv = args.output_csv
    rm_aliens = args.rm_aliens
    no_header = args.no_header

    out_count = 0
    alien_count = 0
    error_count = 0
    with open(output_csv, 'w') as out_file:
        if not no_header:
            print("#name,MolW,HA,cLogP,AR,MR,TPSA,RotB,HBA,HBD,FC",
                  file=out_file)
        for i, mol, name in RobustSmilesMolSupplier(input_smi):
            if mol is None:
                error_count += 1
                continue
            MolW = Descriptors.MolWt(mol)
            HA = Lipinski.HeavyAtomCount(mol)
            cLogP = Descriptors.MolLogP(mol)
            AR = Lipinski.NumAromaticRings(mol)
            MR = Descriptors.MolMR(mol)
            TPSA = Descriptors.TPSA(mol)
            RotB = Descriptors.NumRotatableBonds(mol)
            HBA = Descriptors.NumHAcceptors(mol)
            HBD = Descriptors.NumHDonors(mol)
            FC = Chem.rdmolops.GetFormalCharge(mol)
            if is_alien(MolW, cLogP, TPSA, RotB, HBA, HBD, FC):
                alien_str = alien_diagnose(i, name, MolW, cLogP, TPSA,
                                           RotB, HBA, HBD, FC)
                print("WARN: %s" % alien_str, file=sys.stderr)
                alien_count += 1
                if rm_aliens:
                    # aliens are only dropped on request
                    continue
            csv_line = "%s,%g,%d,%g,%d,%g,%g,%d,%d,%d,%d" % \
                       (name, MolW, HA, cLogP, AR, MR, TPSA, RotB, HBA,
                        HBD, FC)
            print(csv_line, file=out_file)
            out_count += 1

    # Aliens that were kept are already part of out_count; only removed
    # ones have to be added back to reach the number of molecules read.
    total_count = out_count + error_count
    if rm_aliens:
        total_count += alien_count
    print("encoded: %d aliens: %d errors: %d total: %d" % \
          (out_count, alien_count, error_count, total_count),
          file=sys.stderr)
def calculate_metrics(mol):
    """Collect chemical descriptors of ``mol`` into a dictionary.

    ``np_mod`` and ``sa_mod`` are not parameters; they are read from the
    surrounding scope and passed to the NP-likeness and synthetic
    accessibility scorers.  ``Chem.AssignStereochemistry`` is called on
    ``mol`` before the stereocentres are counted.

    Args:
        mol: The molecule to describe.

    Returns:
        A dict keyed by human-readable descriptor names, plus one
        ``'Atoms with symbol <X>'`` entry per element symbol present.
        ``'% rotatable bonds'`` and ``'% heteroatoms'`` hold ratios
        (count divided by the number of bonds / atoms), and are 0 when the
        molecule has no bonds / no atoms.  The NP-likeness and synthetic
        accessibility scores are ``None`` when their scorer raises
        ``ValueError``.
    """
    # calculate chemical descriptors
    ## fraction of sp3 carbons
    pct_sp3 = Lipinski.FractionCSP3(mol)
    ## H bond donors/acceptors
    h_acceptor = Lipinski.NumHAcceptors(mol)
    h_donor = Lipinski.NumHDonors(mol)
    ## rotatable bonds, relative to all bonds
    n_bonds = mol.GetNumBonds()
    if n_bonds > 0:
        rot_bonds = Lipinski.NumRotatableBonds(mol) / n_bonds
    else:
        rot_bonds = 0
    ## number of rings, aromatic and aliphatic
    n_rings = Lipinski.RingCount(mol)
    n_rings_ali = Lipinski.NumAliphaticRings(mol)
    n_rings_aro = Lipinski.NumAromaticRings(mol)
    ## number of stereocentres
    Chem.AssignStereochemistry(mol)
    n_stereo = CalcNumAtomStereoCenters(mol)
    ## polarity
    # Descriptors.TPSA is used rather than Chem.CalcTPSA: CalcTPSA lives in
    # rdMolDescriptors and is not re-exported by rdkit.Chem itself.
    tpsa = Descriptors.TPSA(mol)
    ## hydrophobicity
    logP = Descriptors.MolLogP(mol)
    ## molecular weight
    mw = Descriptors.MolWt(mol)
    ## in Lipinski space?
    Ro5 = in_Ro5(mol)
    ## heteroatoms, relative to all atoms (guarded like the bond ratio so
    ## an atom-less molecule does not divide by zero)
    n_atoms = len(mol.GetAtoms())
    if n_atoms > 0:
        pct_hetero = Lipinski.NumHeteroatoms(mol) / n_atoms
    else:
        pct_hetero = 0
    ## number of each atom
    atom_counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
    ## Murcko scaffolds
    murcko = Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
    ## NP-likeness
    try:
        np_score = calculateNPScore(mol, np_mod)
    except ValueError:
        np_score = None
    ## synthetic accessibility
    try:
        sa_score = calculateSAScore(mol, sa_mod)
    except ValueError:
        sa_score = None
    ## topological complexity
    bertz_idx = BertzCT(mol)

    # create dict
    metrics = {
        '% sp3 carbons': pct_sp3,
        'H bond acceptors': h_acceptor,
        'H bond donors': h_donor,
        '% rotatable bonds': rot_bonds,
        'Rings': n_rings,
        'Rings, aliphatic': n_rings_ali,
        'Rings, aromatic': n_rings_aro,
        'Stereocentres': n_stereo,
        'Topological polar surface area': tpsa,
        'LogP': logP,
        'Molecular weight': mw,
        'Lipinski rule of 5': Ro5,
        '% heteroatoms': pct_hetero,
        'Murcko scaffold': murcko,
        'NP-likeness score': np_score,
        'Synthetic accessibility score': sa_score,
        'Bertz topological complexity': bertz_idx
    }

    # append atom counts
    for symbol, count in atom_counts.items():
        metrics['Atoms with symbol ' + symbol] = count

    return metrics