Esempio n. 1
0
def _calculateDescriptors(mol):
    """Compute a one-row table of RDKit descriptors for a molecule.

    Parameters
    ----------
    mol
        RDKit molecule handed to every descriptor function.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) holding, in order, the scalar
        descriptors followed by ``slogp_VSA1..12``, ``smr_VSA1..10``,
        ``peoe_VSA1..14`` and ``MQN1..42``.
    """
    # One Crippen evaluation provides both SlogP (index 0) and SMR (index 1).
    crippen = rdMolDescriptors.CalcCrippenDescriptors(mol)

    # Collect everything in a plain dict and build the frame once at the end
    # instead of growing a DataFrame one column at a time.
    row = {
        "SlogP": crippen[0],
        "SMR": crippen[1],
        "LabuteASA": rdMolDescriptors.CalcLabuteASA(mol),
        "TPSA": Descriptors.TPSA(mol),
        "AMW": Descriptors.MolWt(mol),
        "ExactMW": rdMolDescriptors.CalcExactMolWt(mol),
        "NumLipinskiHBA": rdMolDescriptors.CalcNumLipinskiHBA(mol),
        "NumLipinskiHBD": rdMolDescriptors.CalcNumLipinskiHBD(mol),
        "NumRotatableBonds": rdMolDescriptors.CalcNumRotatableBonds(mol),
        "NumHBD": rdMolDescriptors.CalcNumHBD(mol),
        "NumHBA": rdMolDescriptors.CalcNumHBA(mol),
        "NumAmideBonds": rdMolDescriptors.CalcNumAmideBonds(mol),
        "NumHeteroAtoms": rdMolDescriptors.CalcNumHeteroatoms(mol),
        "NumHeavyAtoms": mol.GetNumHeavyAtoms(),
        "NumAtoms": mol.GetNumAtoms(),
        "NumRings": rdMolDescriptors.CalcNumRings(mol),
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings(mol),
        "NumSaturatedRings": rdMolDescriptors.CalcNumSaturatedRings(mol),
        "NumAliphaticRings": rdMolDescriptors.CalcNumAliphaticRings(mol),
        "NumAromaticHeterocycles":
            rdMolDescriptors.CalcNumAromaticHeterocycles(mol),
        "NumSaturatedHeterocycles":
            rdMolDescriptors.CalcNumSaturatedHeterocycles(mol),
        "NumAliphaticHeterocycles":
            rdMolDescriptors.CalcNumAliphaticHeterocycles(mol),
        "NumAromaticCarbocycles":
            rdMolDescriptors.CalcNumAromaticCarbocycles(mol),
        "NumSaturatedCarbocycles":
            rdMolDescriptors.CalcNumSaturatedCarbocycles(mol),
        "NumAliphaticCarbocycles":
            rdMolDescriptors.CalcNumAliphaticCarbocycles(mol),
        "FractionCSP3": rdMolDescriptors.CalcFractionCSP3(mol),
        "Chi0v": rdMolDescriptors.CalcChi0v(mol),
        "Chi1v": rdMolDescriptors.CalcChi1v(mol),
        "Chi2v": rdMolDescriptors.CalcChi2v(mol),
        "Chi3v": rdMolDescriptors.CalcChi3v(mol),
        "Chi4v": rdMolDescriptors.CalcChi4v(mol),
        "Chi1n": rdMolDescriptors.CalcChi1n(mol),
        "Chi2n": rdMolDescriptors.CalcChi2n(mol),
        "Chi3n": rdMolDescriptors.CalcChi3n(mol),
        "Chi4n": rdMolDescriptors.CalcChi4n(mol),
        "HallKierAlpha": rdMolDescriptors.CalcHallKierAlpha(mol),
        "kappa1": rdMolDescriptors.CalcKappa1(mol),
        "kappa2": rdMolDescriptors.CalcKappa2(mol),
        "kappa3": rdMolDescriptors.CalcKappa3(mol),
    }

    # Vector-valued descriptors: (column prefix, number of columns, function).
    # Columns are numbered from 1; zip() pairs names with returned values.
    vector_descriptors = (
        ("slogp_VSA", 12, rdMolDescriptors.SlogP_VSA_),
        ("smr_VSA", 10, rdMolDescriptors.SMR_VSA_),
        ("peoe_VSA", 14, rdMolDescriptors.PEOE_VSA_),
        ("MQN", 42, rdMolDescriptors.MQNs_),
    )
    for prefix, count, calc in vector_descriptors:
        names = [prefix + str(i) for i in range(1, count + 1)]
        row.update(zip(names, calc(mol)))

    return pd.DataFrame(row, index=[0])
Esempio n. 2
0
    def __init__(self, configuration: StatsExtractionConfig):
        """Store the configuration and build the per-SMILES UDFs.

        Every ``psf.udf`` created here wraps a callable taking one value
        ``x`` together with the ``pst`` return type given next to it.

        :param configuration: extraction settings; only its
            ``standardisation_config`` attribute is read here, and it is
            iterated with ``.items()`` as a name -> parameters mapping.
        """
        # Enum classes are stored as-is (not instantiated) for later lookup.
        self._filters = FilterTypesEnum

        self._columns = DataframeColumnsEnum
        self._stats = StatsExtractionEnum
        self._purging = PurgingEnum
        self._configuration = configuration
        # Turn the name -> parameters mapping into FilterConfiguration objects.
        standardisation_config_dict = self._configuration.standardisation_config
        standardisation_config = [
            FilterConfiguration(name=name, parameters=params)
            for name, params in standardisation_config_dict.items()
        ]

        # Copied into locals so the lambdas below reference these values
        # rather than ``self`` (presumably to keep the instance out of the
        # UDF closures -- TODO confirm).
        dec_separator = self._stats.DECORATION_SEPARATOR_TOKEN
        attachment_token = self._stats.ATTACHMENT_POINT_TOKEN
        # Descriptor UDFs: each parses the SMILES string with
        # Chem.MolFromSmiles and applies a single RDKit call to the result.
        self._mol_wts_udf = psf.udf(
            lambda x: ExactMolWt(Chem.MolFromSmiles(x)), pst.FloatType())
        self._num_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        # Despite the attribute name, this returns the heavy-atom count.
        self._num_atoms_udf = psf.udf(
            lambda x: Chem.MolFromSmiles(x).GetNumHeavyAtoms(),
            pst.IntegerType())
        self._num_aromatic_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumAromaticRings(
                Chem.MolFromSmiles(x)), pst.IntegerType())
        self._hbond_donors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBD(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._hbond_acceptors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBA(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        # Number of atoms with atomic number 6 divided by the heavy-atom
        # count; the SMILES is parsed twice per call.
        # NOTE(review): atomic number 6 is carbon, so this is a carbon
        # fraction even though the attribute is called "hetero atom ratio"
        # -- confirm which one is intended before changing it.
        self._hetero_atom_ratio_udf = psf.udf(
            lambda x: len([
                atom for atom in Chem.MolFromSmiles(x).GetAtoms()
                if atom.GetAtomicNum() == 6
            ]) / Chem.MolFromSmiles(x).GetNumHeavyAtoms(), pst.FloatType())
        # Round-trips the SMILES through RDKit (parse, then write back out).
        self._make_canonical_udf = psf.udf(
            lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)),
            pst.StringType())
        # A new RDKitStandardizer is constructed on every invocation.
        self._standardise_smiles_udf = psf.udf(
            lambda x: RDKitStandardizer(standardisation_config, None).
            apply_filter(x), pst.StringType())
        # Tokeniser: returns every match of the REGEX_TOKENS pattern.
        pattern = self._stats.REGEX_TOKENS
        self.regex = re.compile(pattern)
        self._tokeniser_udf = psf.udf(self.regex.findall,
                                      pst.ArrayType(pst.StringType()))
        # Splits the input string on the decoration separator token.
        self._decoration_split_udf = psf.udf(lambda x: x.split(dec_separator),
                                             pst.ArrayType(pst.StringType()))
        # Counts the elements of the input equal to the attachment token.
        self._count_decorations_udf = psf.udf(
            lambda s: list(s).count(attachment_token), pst.IntegerType())
def main(in_file, output):
    """Tabulate basic descriptors for the molecules read from ``in_file``.

    One row per molecule is written to ``output + '.csv'`` and
    ``output + '.xlsx'``. Rows are keyed by the first whitespace-separated
    token of the molecule's ``_Name`` property, so molecules sharing that
    token overwrite each other.

    Parameters
    ----------
    in_file
        Input path handed (inside a one-element list) to ``rdkit_open``.
    output
        Output path prefix; the file extensions are appended to it.
    """
    InMols = rdkit_open([in_file])
    print('\n # Number of input molecule: {0}'.format(len(InMols)))

    Cmpds = {}
    for mol in InMols:
        name = mol.GetProp('_Name').split()[0]

        # A single Crippen call gives both logP (0) and molar refractivity (1).
        crippen = rd.CalcCrippenDescriptors(mol)

        Cmpds[name] = {
            'Name': name,
            'Formula': rd.CalcMolFormula(mol),
            'MW': rd._CalcMolWt(mol),                # Molecular Weight
            'logP': crippen[0],                      # Partition coefficient
            'HDon': rd.CalcNumLipinskiHBD(mol),      # Lipinski Hbond donor
            'HAcc': rd.CalcNumLipinskiHBA(mol),      # Lipinski Hbond acceptor
            'TPSA': rd.CalcTPSA(mol),                # Topological polar surface area
            'Rotat': rd.CalcNumRotatableBonds(mol, strict=True),  # Rotatable bond
            'MolRef': crippen[1],                    # Molar refractivity
            'AliRing': rd.CalcNumAliphaticRings(mol),  # Aliphatic ring number
            'AroRing': rd.CalcNumAromaticRings(mol),   # Aromatic ring number
            # Stereo-centre counts ('Stereo', 'UnspStereo') are disabled.
            # SMILES is generated once; an earlier duplicate call whose
            # result was immediately overwritten has been removed.
            'SMILES': Chem.MolToSmiles(mol, isomericSmiles=True,
                                       allHsExplicit=False),
        }

    ####################################

    df = pd.DataFrame.from_dict(Cmpds, orient='index')
    df.index.name = 'Name'

    # Columns of data to print out
    Columns = ['Formula',
               'MW', 'logP', 'HDon', 'HAcc', 'TPSA',
               'Rotat', 'MolRef', 'AliRing', 'AroRing',
               'SMILES']
    # reindex() selects the same columns as df[Columns] but also copes with
    # an empty input, where the frame has no columns at all and plain
    # indexing would raise KeyError.
    reorder = df.reindex(columns=Columns)

    # Output to CSV
    reorder.to_csv(output + '.csv', sep=',', na_rep='NA', encoding='utf-8',
                   float_format='%.5f', header=True)

    # Output to Excel
    reorder.to_excel(output + '.xlsx', header=True, na_rep='NA')
Esempio n. 4
0
    def assign_ring_order(self, skeleton, substituents):
        """
        Shifts the digits used in substituent SMILES by the number of
        aromatic rings found in the skeleton.

        Arguments
        ---------

        skeleton : :class:`str`
            SMILES string of the molecular skeleton. Its aromatic ring count
            is the offset added to the substituent digits.

        substituents : :class:`list`
            Substituent SMILES strings. The first and last character of each
            entry are dropped before it is parsed for ring counting. The
            list is modified in place.

        Returns
        -------

        substituents : :class:`list`
            The same list object, where every entry containing at least one
            aromatic ring has had its digit characters renumbered.

        """

        skeleton_mol = rdkit.MolFromSmiles(skeleton)
        offset = rdMolDescriptors.CalcNumAromaticRings(skeleton_mol)

        for index in range(len(substituents)):
            smiles = substituents[index]
            ring_count = rdMolDescriptors.CalcNumAromaticRings(
                rdkit.MolFromSmiles(smiles[1:-1]))
            if ring_count <= 0:
                continue
            # Walk from the highest digit down to 0 so that a digit that has
            # already been shifted is not picked up again by a later pass.
            for digit in range(ring_count, -1, -1):
                smiles = smiles.replace(str(digit), str(digit + offset))
            substituents[index] = smiles

        return substituents
Esempio n. 5
0
def model_process_fun(model_out, visdom, n):
    """Plot statistics for the highest-reward molecule of a batch.

    ``model_out`` is indexed with ``'info'`` (unpacked into the SMILES
    collection and a validity tensor) and ``'rewards'``. Rewards with more
    than one dimension are summed over axis 1 before the best entry is
    chosen. Nothing is plotted when the best SMILES cannot be parsed.
    ``n`` is used as the x coordinate of every line plot.
    """
    # TODO: rephrase this to return a dict, instead of calling visdom directly
    from rdkit import Chem
    from rdkit.Chem.Draw import MolToFile

    smiles, valid = model_out['info']
    total_rewards = model_out['rewards']
    if len(total_rewards.shape) > 1:
        total_rewards = total_rewards.sum(1)

    best_ind = torch.argmax(total_rewards).data.item()
    this_smile = smiles[best_ind]
    mol = Chem.MolFromSmiles(this_smile)
    pic_save_path = os.path.realpath(root_location + '/images/' + 'tmp.svg')
    if mol is None:
        return

    step = np.array([n])

    # Render to a temporary SVG, strip the 'svg:' prefixes and show it.
    try:
        MolToFile(mol, pic_save_path, imageType='svg')
        with open(pic_save_path, 'r') as myfile:
            data = myfile.read()
        data = data.replace('svg:', '')
        visdom.append('best molecule of batch', 'svg', svgstr=data)
    except Exception as e:
        print(e)

    scores, norm_scores = scorer.get_scores([this_smile])
    components = list(norm_scores[0])
    components += [norm_scores[0].sum()]
    components += [scores[0].sum()]
    components += [desc.CalcNumAromaticRings(mol)]
    legend = [
        'logP', 'SA', 'cycle', 'norm_reward', 'reward', 'Aromatic rings'
    ]
    visdom.append('score component',
                  'line',
                  X=step,
                  Y=np.array([components]),
                  opts={'legend': legend})

    best_reward = total_rewards[best_ind].item()
    visdom.append('reward', 'line', X=np.array([n]), Y=np.array([best_reward]))

    valid_fraction = valid.mean().data.item()
    visdom.append('fraction valid',
                  'line',
                  X=np.array([n]),
                  Y=np.array([valid_fraction]))

    atom_count = len(mol.GetAtoms())
    visdom.append('num atoms',
                  'line',
                  X=np.array([n]),
                  Y=np.array([atom_count]))
Esempio n. 6
0
 def calculate_properties(self, smiles=None, mol=None, props=None):
     """Calculate the requested basic properties and store them in self.data.

     smiles : SMILES string, parsed only when ``mol`` is not supplied
     mol    : already-built RDKit molecule (takes precedence over smiles)
     props  : collection of property keys to compute, e.g. 'py_mw'

     returns : error (bool) -- True when no properties were requested or no
     molecule could be obtained (neither ``mol`` nor ``smiles`` given, or
     the SMILES failed to parse); False once the properties are stored.
     """
     # None replaces a shared mutable default; an empty request is reported
     # as an error exactly as before.
     if props is None or len(props) == 0:
         return True
     if mol is None:
         # Nothing to parse: report the error instead of handing None to RDKit.
         if smiles is None:
             return True
         mol = Chem.MolFromSmiles(smiles)
     if mol is None:
         return True

     # (key, calculator) pairs in the order results are written to self.data.
     # Lambdas keep each RDKit lookup lazy, so only requested ones are touched.
     calculators = (
         ('py_formula', lambda m: desc.CalcMolFormula(m)),
         ('py_em', lambda m: round(desc.CalcExactMolWt(m), 5)),
         ('py_n_Cl_Br', lambda m: sum(
             1 for atom in m.GetAtoms()
             if atom.GetSymbol() in ('Cl', 'Br'))),
         ('py_na', lambda m: m.GetNumAtoms()),
         ('py_mw', lambda m: desc._CalcMolWt(m)),
         ('py_fsp3', lambda m: desc.CalcFractionCSP3(m)),
         ('py_rb', lambda m: desc.CalcNumRotatableBonds(m)),
         ('py_tpsa', lambda m: desc.CalcTPSA(m)),
         ('py_clogp', lambda m: desc.CalcCrippenDescriptors(m)[0]),
         ('py_nar', lambda m: desc.CalcNumAromaticRings(m)),
         ('py_nhba', lambda m: desc.CalcNumHBA(m)),
         ('py_nhbd', lambda m: desc.CalcNumHBD(m)),
     )
     for key, compute in calculators:
         if key in props:
             self.data[key] = compute(mol)
     return False
Esempio n. 7
0
def feature_fp(smiles):
    """Build a numeric feature vector for a SMILES string.

    The vector starts with the values returned by ``MQNs_`` and is followed
    by one value per descriptor function listed below, in that order.
    """
    mol = Chem.MolFromSmiles(smiles)

    # CalcNumRotatableBonds appears twice on purpose: the vector layout is
    # kept exactly as it has always been produced.
    extra_descriptors = (
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcExactMolWt,
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcFractionCSP3,
        rdMolDescriptors.CalcNumAliphaticCarbocycles,
        rdMolDescriptors.CalcNumAliphaticHeterocycles,
        rdMolDescriptors.CalcNumAliphaticRings,
        rdMolDescriptors.CalcNumAromaticCarbocycles,
        rdMolDescriptors.CalcNumAromaticHeterocycles,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumBridgeheadAtoms,
        rdMolDescriptors.CalcNumRings,
        rdMolDescriptors.CalcNumAmideBonds,
        rdMolDescriptors.CalcNumHeterocycles,
        rdMolDescriptors.CalcNumSpiroAtoms,
        rdMolDescriptors.CalcTPSA,
    )

    features = rdMolDescriptors.MQNs_(mol)
    for descriptor in extra_descriptors:
        features.append(descriptor(mol))

    return np.array(features)
 def model_process_fun(model_out, visdom, n):
     """Plot statistics for the highest-reward molecule of a batch.

     ``model_out`` must unpack into five items
     ``(actions, logits, rewards, terminals, info)``; only ``rewards``
     (summed over axis 1) and ``info`` (the SMILES collection plus a
     validity tensor) are used. Nothing is plotted when the best SMILES
     cannot be parsed. ``n`` is the x coordinate of every line plot.
     """
     from rdkit import Chem
     from rdkit.Chem.Draw import MolToFile
     # The five-way unpacking is kept so a malformed model_out still fails.
     _actions, _logits, rewards, _terminals, info = model_out
     smiles, valid = info
     total_rewards = rewards.sum(1)
     best_ind = torch.argmax(total_rewards).data.item()
     this_smile = smiles[best_ind]
     mol = Chem.MolFromSmiles(this_smile)
     pic_save_path = root_location + 'images/' + 'test.svg'
     if mol is not None:
         try:
             MolToFile(mol, pic_save_path, imageType='svg')
             with open(pic_save_path, 'r') as myfile:
                 data = myfile.read()
             data = data.replace('svg:', '')
             visdom.append('best molecule of batch', 'svg', svgstr=data)
         except Exception as e:
             # Drawing stays best-effort, but the failure is reported instead
             # of being silently discarded, and KeyboardInterrupt/SystemExit
             # are no longer swallowed as they were by a bare ``except``.
             print(e)
         scores, norm_scores = scorer.get_scores([this_smile])
         visdom.append(
             'score component',
             'line',
             X=np.array([n]),
             Y=np.array(
                 [list(norm_scores[0]) + [norm_scores[0].sum()] +
                  [scores[0].sum()] + [desc.CalcNumAromaticRings(mol)]]),
             opts={
                 'legend': [
                     'logP', 'SA', 'cycle', 'norm_reward', 'reward',
                     'Aromatic rings'
                 ]
             })
         visdom.append('fraction valid',
                       'line',
                       X=np.array([n]),
                       Y=np.array([valid.mean().data.item()]))
Esempio n. 9
0
def numAromatic(x):
    """Return the aromatic ring count of the SMILES given by ``str(x)``."""
    smiles_text = str(x)
    parsed = Chem.MolFromSmiles(smiles_text)
    ring_total = rdMolDescriptors.CalcNumAromaticRings(parsed)
    return ring_total
    def get_global_features(self, mol):
        """Return a flat numpy array of whole-molecule features for ``mol``.

        Layout of the array, in order: atom count, bond count, exact
        molecular weight, heavy-atom molecular weight, valence electron
        count, the number of perceived features in each of six feature
        families, then one value per ``rdm`` descriptor listed below.
        """
        # Perceive chemical features with RDKit's base feature definitions.
        fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
        feats = ChemicalFeatures.BuildFeatureFactory(
            fdefName).GetFeaturesForMol(mol)

        # Basic size / weight features.
        # The four partial-charge descriptors (MaxAbsPartialCharge,
        # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) are left
        # out: they produced infinity for refcode_csd = YOLJUF. The
        # FpDensityMorgan1-3 descriptors are not used either.
        basic = [
            mol.GetNumAtoms(),
            mol.GetNumBonds(),
            Descriptors.ExactMolWt(mol),
            Descriptors.HeavyAtomMolWt(mol),
            Descriptors.NumValenceElectrons(mol),
        ]

        # Tally features per family; families outside this list are ignored.
        family_order = ('Acceptor', 'Donor', 'Hydrophobe', 'LumpedHydrophobe',
                        'PosIonizable', 'NegIonizable')
        tallies = dict.fromkeys(family_order, 0)
        for feat in feats:
            family = feat.GetFamily()
            if family in tallies:
                tallies[family] += 1
        family_counts = [tallies[family] for family in family_order]

        # Descriptors taken from rdMolDescriptors, evaluated in this order.
        rdm_calculators = (
            rdm.CalcNumRotatableBonds, rdm.CalcChi0n, rdm.CalcChi0v,
            rdm.CalcChi1n, rdm.CalcChi1v, rdm.CalcChi2n, rdm.CalcChi2v,
            rdm.CalcChi3n, rdm.CalcChi4n, rdm.CalcChi4v,
            rdm.CalcFractionCSP3, rdm.CalcHallKierAlpha, rdm.CalcKappa1,
            rdm.CalcKappa2, rdm.CalcLabuteASA,
            rdm.CalcNumAliphaticCarbocycles, rdm.CalcNumAliphaticHeterocycles,
            rdm.CalcNumAliphaticRings, rdm.CalcNumAmideBonds,
            rdm.CalcNumAromaticCarbocycles, rdm.CalcNumAromaticHeterocycles,
            rdm.CalcNumAromaticRings, rdm.CalcNumBridgeheadAtoms,
            rdm.CalcNumHBA, rdm.CalcNumHBD, rdm.CalcNumHeteroatoms,
            rdm.CalcNumHeterocycles, rdm.CalcNumLipinskiHBA,
            rdm.CalcNumLipinskiHBD, rdm.CalcNumRings,
            rdm.CalcNumSaturatedCarbocycles, rdm.CalcNumSaturatedHeterocycles,
            rdm.CalcNumSaturatedRings, rdm.CalcNumSpiroAtoms, rdm.CalcTPSA,
        )
        moreGlobalFeatures = [calc(mol) for calc in rdm_calculators]

        # NaN values, if any descriptor produces them, are passed through
        # unchanged; zero-filling them is deliberately not done here because
        # it could hide why a sample behaves as an outlier.
        combined = basic + family_counts + moreGlobalFeatures
        return np.array(combined).T
Esempio n. 11
0


#FRAGMENTS = {
#    "acyl_halide": Chem.MolFromSmarts('[#9,#17,#35,#53]=O'),  # C(=O)X
#    "anhydride": Chem.MolFromSmarts('[#6]-[#6](=O)-[#8]-[#6](-[#6])=O'),  # CC(=O)OC(=O)C
#    "peroxide": Chem.MolFromSmarts('[#8]-[#8]'),  # R-O-O-R'
#    "ab_unsaturated_ketone": Chem.MolFromSmarts('[#6]=[#6]-[#6]=O'),  # R=CC=O
#}

DESCRIPTORS = {
    # classical molecular descriptors
    "num_heavy_atoms": lambda x: x.GetNumAtoms(),
    "molecular_weight": lambda x: round(Desc.ExactMolWt(x), 4),
    "num_rings": lambda x: rdMolDesc.CalcNumRings(x),
    "num_rings_arom": lambda x: rdMolDesc.CalcNumAromaticRings(x),
    "num_rings_ali": lambda x: rdMolDesc.CalcNumAliphaticRings(x),
    "num_hbd": lambda x: rdMolDesc.CalcNumLipinskiHBD(x),
    "num_hba": lambda x: rdMolDesc.CalcNumLipinskiHBA(x),
    "slogp": lambda x: round(Crippen.MolLogP(x), 4),
    "tpsa": lambda x: round(rdMolDesc.CalcTPSA(x), 4),
    "num_rotatable_bond": lambda x: rdMolDesc.CalcNumRotatableBonds(x),
    "num_atoms_oxygen": lambda x: len(
        [a for a in x.GetAtoms() if a.GetAtomicNum() == 8]
    ),
    "num_atoms_nitrogen": lambda x: len(
        [a for a in x.GetAtoms() if a.GetAtomicNum() == 7]
    ),
    "num_atoms_halogen": Fragments.fr_halogen,
    "num_atoms_bridgehead": rdMolDesc.CalcNumBridgeheadAtoms,
    # custom molecular descriptors
Esempio n. 12
0
def num_aromatic_rings(mol: Mol) -> int:
    """Return the number of aromatic rings RDKit reports for ``mol``."""
    ring_total = rdMolDescriptors.CalcNumAromaticRings(mol)
    return ring_total
Esempio n. 13
0
 def calculate_number_aromatic_rings(self):
     '''
     Counts the aromatic rings of the molecule stored in ``self.mol``.
     :return: the value returned by rdMolDescriptors.CalcNumAromaticRings
     '''
     molecule = self.mol
     ring_total = rdMolDescriptors.CalcNumAromaticRings(molecule)
     return ring_total
Esempio n. 14
0
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as Des

# Load the training table; the first row of the file is skipped and the two
# columns are named explicitly.
df = pd.read_csv('Datasets/COVID/train.csv',
                 skiprows=1,
                 names=['smiles', 'aff'])

# Parse every SMILES string and add explicit hydrogens in one pass.
df['mol'] = df['smiles'].apply(
    lambda smi: Chem.AddHs(Chem.MolFromSmiles(smi)))

# Size features; counts are taken on the hydrogen-added molecules.
df['num_of_atoms'] = df['mol'].apply(lambda m: m.GetNumAtoms())
df['num_of_bonds'] = df['mol'].apply(lambda m: m.GetNumBonds())
df['num_of_bonds_sq'] = df['mol'].apply(lambda m: m.GetNumBonds() ** 2)

# Only the squared aromatic-ring and heavy-atom counts are used as features;
# the plain counts ('ringsar', 'num_of_heavy_atoms') are not added.
df['rings_sq'] = df['mol'].apply(
    lambda m: Des.CalcNumAromaticRings(m) ** 2)
df['n_h_a_sq'] = df['mol'].apply(lambda m: m.GetNumHeavyAtoms() ** 2)


def number_of_atoms(atom_list, df):
    """Add one ``num_of_{symbol}_atoms`` column per entry of ``atom_list``.

    Each entry is parsed as a SMILES string and used as a substructure
    query; the new column holds, for every molecule in ``df['mol']``, the
    number of matches returned by ``GetSubstructMatches``.

    ``df`` is modified in place and nothing is returned.
    """
    for symbol in atom_list:
        # Build the query once per symbol instead of once per row.
        query = Chem.MolFromSmiles(symbol)
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol: len(mol.GetSubstructMatches(query)))


# Per-molecule carbon, oxygen and nitrogen counts.
number_of_atoms(['C', 'O', 'N'], df)
############################################

# Training features: everything except the raw SMILES, the molecule objects
# and the 'aff' column.
train_df = df.drop(['smiles', 'mol', 'aff'], axis=1)
Esempio n. 15
0
def get_molecular_features(dataframe, mol_list):
    """Fill ``dataframe`` with molecule-level features, one row per molecule.

    The molecule at position ``i`` of ``mol_list`` is written to the row
    labelled ``i`` via ``df.at``. The frame is modified in place and also
    returned.

    Parameters
    ----------
    dataframe
        pandas DataFrame receiving the feature columns.
    mol_list
        Sequence of RDKit molecules.

    Returns
    -------
    The same ``dataframe`` object.
    """
    df = dataframe
    if len(mol_list) == 0:
        return df

    # The feature factory does not depend on the molecule, so it is built
    # once here rather than re-read from the .fdef file for every molecule.
    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    # Feature families that are counted; any other family is ignored.
    families = ("Acceptor", "Donor", "Hydrophobe", "LumpedHydrophobe",
                "PosIonizable", "NegIonizable")

    # (column name, rdMolDescriptors function) in output column order.
    # CalcKappa3 is intentionally not part of the feature set.
    descriptor_columns = (
        ("NumRotatableBonds", rdMolDescriptors.CalcNumRotatableBonds),
        ("CalcChi0n", rdMolDescriptors.CalcChi0n),
        ("CalcChi0v", rdMolDescriptors.CalcChi0v),
        ("CalcChi1n", rdMolDescriptors.CalcChi1n),
        ("CalcChi1v", rdMolDescriptors.CalcChi1v),
        ("CalcChi2n", rdMolDescriptors.CalcChi2n),
        ("CalcChi2v", rdMolDescriptors.CalcChi2v),
        ("CalcChi3n", rdMolDescriptors.CalcChi3n),
        ("CalcChi3v", rdMolDescriptors.CalcChi3v),
        ("CalcChi4n", rdMolDescriptors.CalcChi4n),
        ("CalcChi4v", rdMolDescriptors.CalcChi4v),
        ("CalcFractionCSP3", rdMolDescriptors.CalcFractionCSP3),
        ("CalcHallKierAlpha", rdMolDescriptors.CalcHallKierAlpha),
        ("CalcKappa1", rdMolDescriptors.CalcKappa1),
        ("CalcKappa2", rdMolDescriptors.CalcKappa2),
        ("CalcLabuteASA", rdMolDescriptors.CalcLabuteASA),
        ("CalcNumAliphaticCarbocycles",
         rdMolDescriptors.CalcNumAliphaticCarbocycles),
        ("CalcNumAliphaticHeterocycles",
         rdMolDescriptors.CalcNumAliphaticHeterocycles),
        ("CalcNumAliphaticRings", rdMolDescriptors.CalcNumAliphaticRings),
        ("CalcNumAmideBonds", rdMolDescriptors.CalcNumAmideBonds),
        ("CalcNumAromaticCarbocycles",
         rdMolDescriptors.CalcNumAromaticCarbocycles),
        ("CalcNumAromaticHeterocycles",
         rdMolDescriptors.CalcNumAromaticHeterocycles),
        ("CalcNumAromaticRings", rdMolDescriptors.CalcNumAromaticRings),
        ("CalcNumBridgeheadAtoms", rdMolDescriptors.CalcNumBridgeheadAtoms),
        ("CalcNumHBA", rdMolDescriptors.CalcNumHBA),
        ("CalcNumHBD", rdMolDescriptors.CalcNumHBD),
        ("CalcNumHeteroatoms", rdMolDescriptors.CalcNumHeteroatoms),
        ("CalcNumHeterocycles", rdMolDescriptors.CalcNumHeterocycles),
        ("CalcNumLipinskiHBA", rdMolDescriptors.CalcNumLipinskiHBA),
        ("CalcNumLipinskiHBD", rdMolDescriptors.CalcNumLipinskiHBD),
        ("CalcNumRings", rdMolDescriptors.CalcNumRings),
        ("CalcNumSaturatedCarbocycles",
         rdMolDescriptors.CalcNumSaturatedCarbocycles),
        ("CalcNumSaturatedHeterocycles",
         rdMolDescriptors.CalcNumSaturatedHeterocycles),
        ("CalcNumSaturatedRings", rdMolDescriptors.CalcNumSaturatedRings),
        ("CalcNumSpiroAtoms", rdMolDescriptors.CalcNumSpiroAtoms),
        ("CalcTPSA", rdMolDescriptors.CalcTPSA),
    )

    for i, mol in enumerate(mol_list):
        print("Getting molecular features for molecule: ", i)

        # Basic size / weight features.
        df.at[i, "NbrAtoms"] = mol.GetNumAtoms()
        df.at[i, "NbrBonds"] = mol.GetNumBonds()
        df.at[i, "mw"] = Descriptors.ExactMolWt(mol)
        df.at[i, 'HeavyAtomMolWt'] = Descriptors.HeavyAtomMolWt(mol)
        df.at[i, 'NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
        # The four partial-charge descriptors (MaxAbsPartialCharge,
        # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) stay
        # disabled: they produced infinity for refcode_csd = YOLJUF.
        df.at[i, 'FpDensityMorgan1'] = Descriptors.FpDensityMorgan1(mol)
        df.at[i, 'FpDensityMorgan2'] = Descriptors.FpDensityMorgan2(mol)
        df.at[i, 'FpDensityMorgan3'] = Descriptors.FpDensityMorgan3(mol)

        # Count perceived chemical features per family.
        tallies = dict.fromkeys(families, 0)
        for feat in factory.GetFeaturesForMol(mol):
            family = feat.GetFamily()
            if family in tallies:
                tallies[family] += 1
        for family in families:
            df.at[i, family] = tallies[family]

        # Additional descriptors from rdMolDescriptors.
        for column, calc in descriptor_columns:
            df.at[i, column] = calc(mol)

    return df
Esempio n. 16
0
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

# Write one CSV row (input line, logP, molecular weight, heavy-atom count,
# aromatic rings, rotatable bonds) for every line of rb/smiles.smi.
# Both files are opened with ``with`` so they are closed even if a
# descriptor call raises part-way through.
with open('rb/smiles.smi') as smiles, \
        open('rb/descriptors.csv', 'w') as salida:
    for line in smiles:
        print(line)
        mol = Chem.MolFromSmiles(line)
        if mol is None:
            # No molecule could be built from this line; report it and move
            # on instead of aborting the whole run on the descriptor calls.
            print('Skipping unparseable SMILES: ' + line.rstrip('\n'))
            continue
        logp = Descriptors.MolLogP(mol)
        molwt = Descriptors.MolWt(mol)
        hac = Descriptors.HeavyAtomCount(mol)
        ar = rdMolDescriptors.CalcNumAromaticRings(mol)
        rb = rdMolDescriptors.CalcNumRotatableBonds(mol)
        # rstrip('\n') removes only the line terminator, so a final line
        # without a trailing newline keeps its last character.
        lista = [line.rstrip('\n'), logp, molwt, hac, ar, rb]
        salida.write(','.join([str(elem) for elem in lista]) + '\n')