def Add_Mol_Chars(suppl):
    """Build a descriptor table for every usable molecule in ``suppl``.

    Args:
        suppl: iterable of molecule objects; ``None`` entries are skipped.
            Each molecule must expose ``GetProp('_Name')`` and be accepted by
            ``MolecularDescriptorCalculator.CalcDescriptors``.

    Returns:
        A list of rows. The first row is the header (``'_Name'`` followed by
        the descriptor names); every following row holds one molecule's
        ``_Name`` property followed by its descriptor values.
    """
    char_names = [
        'Chi0', 'Chi0n', 'Chi0v', 'Chi1',
        'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
        'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3',
        'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8',
        'EState_VSA9', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'Ipc',
        'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MolLogP', 'MolMR', 'MolWt',
        'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
        'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
        'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
        'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles',
        'NumSaturatedRings', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12',
        'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5',
        'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1',
        'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
        'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12',
        'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7',
        'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2',
        'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7',
        'VSA_EState8', 'VSA_EState9',
    ]
    calc = MolecularDescriptorCalculator(char_names)
    header = ['_Name'] + char_names

    # One row per molecule the supplier managed to provide.
    rows = [
        [mol.GetProp('_Name')] + list(calc.CalcDescriptors(mol))
        for mol in suppl
        if mol is not None
    ]
    return [header] + rows
Exemple #2
0
def compute_descriptors(df: DataFrame, to_compute: List[str]) -> DataFrame:
    """
    Computes all descriptors defined in `to_compute` for every molecule in `df.ROMol`
    and adds the results to the input DataFrame.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing a column named "`ROMol`" with RDKit mol objects to calculate the descriptors for

    to_compute : List[str]
        A list of descriptor names which have to be calculated. If it is empty
        (or otherwise falsy), `df` itself is returned unchanged.

    Returns
    -------
    DataFrame
        Contains the data from the input DataFrame and the newly calculated descriptors.
        The new columns share the index of `df`, so rows stay aligned even when
        `df` does not use a default 0..n-1 index.
    """

    if not to_compute:
        return df
    n_mols = len(df)
    logging.info(f'Computing {len(to_compute)} descriptors for {n_mols} molecules...')
    calc = MolecularDescriptorCalculator(to_compute)
    new_cols = {desc: [] for desc in to_compute}
    for ix, mol in enumerate(df.ROMol):
        values = calc.CalcDescriptors(mol)
        for name, value in zip(to_compute, values):
            new_cols[name].append(value)
        # Lightweight progress indicator, refreshed in place every 10 molecules.
        if ix % 10 == 0:
            print(f'\r{ix}/{n_mols}', end='')
    print('\r', end='')
    # Reuse df's index: concat(axis=1) aligns on the index, so a fresh
    # RangeIndex would scatter the values into extra NaN-filled rows whenever
    # df has been filtered, sorted or re-indexed.
    descriptor_frame = DataFrame(new_cols, index=df.index, copy=False)
    return pd.concat([df, descriptor_frame], axis=1, copy=False)
Exemple #3
0
def RDKit_descriptor_featurizer(mol_list,
                                descriptor_list=_descList,
                                return_names=True):
    """Compute RDKit descriptors for each molecule and drop all-zero columns.

    Args:
        mol_list: sized sequence of molecules accepted by
            ``MolecularDescriptorCalculator.CalcDescriptors``.
        descriptor_list: sequence of entries whose first element is the
            descriptor name (the layout of ``_descList``). Defaults to
            ``_descList``.
        return_names: when true, also return the names of the kept columns.

    Returns:
        ``(names, X)`` if ``return_names`` is true, otherwise ``X`` alone.
        ``X`` has one row per molecule and one column per kept descriptor;
        ``names`` is a NumPy array of the matching descriptor names.
    """
    # Read the names from the argument, not from the global _descList, so a
    # caller-supplied descriptor_list is actually honoured.
    descriptor_function_names = [entry[0] for entry in descriptor_list]

    mdc = MolecularDescriptorCalculator(simpleList=descriptor_function_names)

    X = np.zeros([len(mol_list), len(descriptor_function_names)])
    for row, mol in enumerate(mol_list):
        X[row, :] = np.array(mdc.CalcDescriptors(mol))

    # Drop descriptors that are zero for every molecule in the list. Testing
    # each value (rather than the column sum) keeps columns whose positive and
    # negative values merely cancel out.
    keep = ~np.all(X == 0, axis=0)
    X_truncated = X[:, keep]

    if return_names:
        return np.array(descriptor_function_names)[keep], X_truncated
    return X_truncated
Exemple #4
0
def molecular_descriptors(data):
    """
    Use RDKit to prepare the molecular descriptor

    Inputs
    ------
    data: dataframe, cleaned csv data. The columns read here are
        'A' and 'B' (SMILES strings), 'T', 'P', 'MOLFRC_A' and 'EC_value'.

    Returns
    ------
    prenorm_X: dataframe of input features (descriptors of A and B followed
        by T, P and MOLFRC_A); no normalization is applied in this function
    Y: the 'EC_value' column of `data`

    """

    n = data.shape[0]
    # Choose which molecular descriptor we want
    list_of_descriptors = [
        'NumHeteroatoms', 'ExactMolWt', 'NOCount', 'NumHDonors', 'RingCount',
        'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings'
    ]
    # Get the molecular descriptors and their dimension
    calc = Calculator(list_of_descriptors)
    D = len(list_of_descriptors)
    d = D * 2 + 4

    Y = data['EC_value']
    X = np.zeros((n, d))
    X[:, -3] = data['T']
    X[:, -2] = data['P']
    X[:, -1] = data['MOLFRC_A']
    smiles_a = data['A']
    smiles_b = data['B']
    for i in range(n):
        # Positional access keeps the rows in step with the positional
        # assignments above even if `data` has a non-default index.
        A = Chem.MolFromSmiles(smiles_a.iloc[i])
        B = Chem.MolFromSmiles(smiles_b.iloc[i])
        X[i][:D] = calc.CalcDescriptors(A)
        X[i][D:2 * D] = calc.CalcDescriptors(B)

    # NOTE(review): the descriptors are written to columns 0..2*D-1, but the
    # labels below start with 'NUM', so every descriptor label appears shifted
    # by one relative to the values (and column 2*D is never filled). Left
    # unchanged because downstream consumers may rely on the current layout;
    # confirm the intended layout before relying on the column names.
    prenorm_X = pd.DataFrame(
        X,
        columns=[
            'NUM', 'NumHeteroatoms_A', 'MolWt_A', 'NOCount_A', 'NumHDonors_A',
            'RingCount_A', 'NumAromaticRings_A', 'NumSaturatedRings_A',
            'NumAliphaticRings_A', 'NumHeteroatoms_B', 'MolWt_B', 'NOCount_B',
            'NumHDonors_B', 'RingCount_B', 'NumAromaticRings_B',
            'NumSaturatedRings_B', 'NumAliphaticRings_B', 'T', 'P', 'MOLFRC_A'
        ])

    # `columns=` instead of a positional axis: pandas >= 2.0 no longer accepts
    # `drop(label, 1)`.
    prenorm_X = prenorm_X.drop(
        columns=['NumAliphaticRings_A', 'NumAliphaticRings_B'])

    return prenorm_X, Y
Exemple #5
0
def GetMolecularDescriptor(molObject, descriptorName):
    """Compute the requested descriptors for a single molecule.

    Args:
        molObject: molecule object passed to ``CalcDescriptors``.
        descriptorName: descriptor names used to build the
            ``MolecularDescriptorCalculator``.

    Returns:
        list: the descriptor values returned by the calculator, as a list.
    """
    calculator = MolecularDescriptorCalculator(descriptorName)
    return list(calculator.CalcDescriptors(molObject))
    def write_rdkit_descriptors(smiles, csv, data):
        """Compute RDKit descriptors for ``data['SMILES']`` and save them gzipped.

        Nothing is computed unless the ``smiles`` file exists and ``{csv}.gz``
        does not exist yet. The output has a leading 'CID' column taken from
        ``data['CID']`` followed by one column per descriptor.
        """
        from rdkit import Chem
        from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

        output_path = f'{csv}.gz'
        if not os.path.isfile(smiles) or os.path.isfile(output_path):
            return

        # Parse every SMILES string and collect the descriptor names.
        molecules = [Chem.MolFromSmiles(smi) for smi in data['SMILES']]
        names = [entry[0] for entry in Chem.Descriptors.descList]

        # Status placeholder shown while computing; cleared at the end.
        status = st.text('Sit back! This may take a while...')
        calculator = MolecularDescriptorCalculator(names)
        rows = [calculator.CalcDescriptors(mol) for mol in molecules]

        table = pd.DataFrame(rows, columns=names)
        table.insert(0, column='CID', value=data['CID'].tolist())
        table.to_csv(output_path, index=False, compression='gzip')
        status.text('')
Exemple #7
0
    def __init__(
        self,
        descriptors: Optional[List[str]] = None,
        named_descriptor_set: str = 'all',
        fingerprint_extra_args: Optional[dict] = None,
        normalise: bool = False,
        subset_size: int = 200,
    ):
        """
        Args:
            descriptors: list of descriptor names -
                the subset given is validated to make sure they exist and will be used.
                ``None`` (the default) is treated as an empty list.
            named_descriptor_set: 'all' or 'simple' to use preset subsets
            fingerprint_extra_args: optional kwargs for `MolecularDescriptorCalculator`
            normalise: if True, a `PhyschemScaler` is built from the loaded
                distributions and stored as `self.scaler`
            subset_size: number of descriptors to return (or the size of the subset if that's smaller)
        """
        super().__init__()

        # A literal `[]` default would be one list object shared by every
        # call; create a fresh one per instance instead.
        if descriptors is None:
            descriptors = []
        if fingerprint_extra_args is None:
            fingerprint_extra_args = {}

        self.descriptors = self._get_descriptor_list(
            named_descriptor_set=named_descriptor_set,
            descriptor_list=descriptors,
            subset_size=subset_size)

        self.fingerprint_extra_args = fingerprint_extra_args
        self.calc = MolecularDescriptorCalculator(
            self.descriptors, **self.fingerprint_extra_args)
        self.normalise = normalise

        # The distributions file is looked up relative to this source file.
        distributions_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '../data/physchem_distributions.json')

        with open(distributions_path) as fp:
            self.distributions = json.load(fp)

        if self.normalise:
            self.scaler = PhyschemScaler(descriptor_list=self.descriptors,
                                         dists=self.distributions)
Exemple #8
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import Descriptors, PandasTools

# In[2]:

# NOTE(review): `original_df` is not defined in the visible part of this
# notebook export; presumably it is loaded in a cell that is not shown.
original_df = original_df.set_index('ID')

# Take the first element (used as the descriptor name) of every entry in
# `Descriptors._descList` and build one calculator covering all of them.
descriptor_list = [x[0] for x in Descriptors._descList]
calc = MolecularDescriptorCalculator(descriptor_list)
calc.ShowDescriptors()

# In[3]:

original_df.info()

# In[4]:


def gen_descriptors(df, calc):
    """Generate a new descriptor df"""
    values = []
    columns = ['ID'] + list(calc.GetDescriptorNames())
    count = 0
    for ID, row in df.iterrows():
        if (count % 1000 == 0):
Exemple #9
0
    mol_property_vals = map(float, mol_property_vals)
    double_bond = float(molstring.count('='))
    mol_property_vals.append(double_bond)
    mol_property_names.append('double_bond_count')
    triple_bond = float(molstring.count('#'))
    mol_property_vals.append(triple_bond)
    mol_property_names.append('triple_bond_count')
    #print mol_property_names
    return mol_property_vals

# Descriptor names computed for every molecule, in output order.
rdkit_descriptors = [
    'BalabanJ', 'BertzCT', 'FractionCSP3', 'HallKierAlpha',
    'HeavyAtomCount', 'HeavyAtomMolWt',
    'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA',
    'MaxAbsEStateIndex', 'MaxAbsPartialCharge',
    'MaxEStateIndex', 'MaxPartialCharge',
    'MinAbsEStateIndex', 'MinAbsPartialCharge',
    'MinEStateIndex', 'MinPartialCharge',
    'MolLogP', 'MolMR', 'NHOHCount', 'NOCount',
    'NumHeteroatoms', 'NumRotatableBonds', 'NumValenceElectrons', 'TPSA',
]
# Built once at import time and shared by all callers.
rdkit_mol_descrip_calculator = MolecularDescriptorCalculator(rdkit_descriptors)


def Calculate_rdkit_mol_descriptors(mol_string):
    """
    Calculate the descriptors listed in ``rdkit_descriptors`` for one molecule
    :param mol_string: SMILES string, or a string containing 'InChI=' (which is
        first converted with ``inchi2smiles``)
    :return: a list with the descriptor values of the molecule
    """
    smiles = inchi2smiles(mol_string) if 'InChI=' in mol_string else mol_string
    molecule = Chem.MolFromSmiles(smiles)
    return list(rdkit_mol_descrip_calculator.CalcDescriptors(molecule))
def GetMolecularDescriptor(molObject, descriptorName):
    """Return the descriptors named in ``descriptorName`` for ``molObject`` as a list."""
    calculator = MolecularDescriptorCalculator(descriptorName)
    return list(calculator.CalcDescriptors(molObject))
Exemple #11
0
def descCalc(mol):
    """Return the descriptors of ``mol``, or ``None`` if they cannot be computed.

    ``None`` is returned when the molecule has no heavy atoms, and also when
    any error occurs while processing it (a message is printed in that case).
    """
    try:
        if mol.GetNumHeavyAtoms() > 0:
            return MDC.CalcDescriptors(calc, mol)
    except Exception:
        # Best effort: report and fall through to the implicit None, but let
        # KeyboardInterrupt / SystemExit propagate.
        # print() with a single argument behaves the same on Python 2 and 3.
        print("Error while processing ")
    return None
Exemple #12
0
def build_model_from_md(df,
                        property_to_model,
                        temperature=[298.1, 299],
                        pressure=[101, 102],
                        output_ranges=[[200, 3000]],
                        md_temperature=298.15,
                        md_pressure=101.325):
    """
    creates new qspr models using md data

    Parameters
    ----------
    df : pandas DataFrame
        salt_log data from the genetic algorithm. Contains
        the headers 'Salt Smiles' and 'MD Calculation'. Current
        support is only for cpt and density
    property_to_model : list of str
        one of ['cpt'], ['density'] or ['cpt', 'density'] (the value is
        compared against exactly these lists)
    temperature : array, optional
        temperature bounds on experimental data to add. Default
        298.1, 299 K
    pressure : array, optional
        pressure bounds on experimental data to add. Default
        101, 102 kpa
    output_ranges : array, optional
        property bounds on experimental data to add. Default
        200, 3000 (kg/m3 or kj/molK)
    md_temperature : float, optional
        temperature used to generate the md data. Default
        298.15 K
    md_pressure : float, optional
        pressure used to generate the md data. Default
        101.325 kPa

    Returns
    -------
    newmodel : salt dev_model object
    new_MD_data_index : int
        start index of the newly incorporated MD data

    Raises
    ------
    ValueError
        if property_to_model is not one of the supported lists

    Summary
    -------
    Create 4 lists from df: cation/anion smiles, cpt, density
    Nans will be used for cation/anion name in the newmodel
    output
    """
    # NOTE: the list defaults above are only read and forwarded, never
    # mutated in this function.

    cpt = []
    density = []
    cation_smi = []
    anion_smi = []
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    number_pattern = re.compile(r"\d+\.\d+")
    for i in range(df.shape[0]):
        # The first decimal number in the MD output is taken as cpt, the
        # second as density.
        numbers = number_pattern.findall(df["MD Calculation"][i])
        cpt.append(numbers[0])
        density.append(numbers[1])
        # 'Salt Smiles' is split on '.', cation part first.
        ions = df['Salt Smiles'][i].split(".")
        cation_smi.append(ions[0])
        anion_smi.append(ions[1])

    module_path = dirname(__file__)
    data = df
    n = data.shape[0]
    # Context manager so the descriptor-list file is always closed.
    with open(join(module_path, 'data', 'Deslist'), 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    calc = Calculator(Deslist)
    D = len(Deslist)
    # 2*D descriptor columns + T, P, cpt, density + 4 name/smiles columns.
    d = len(Deslist) * 2 + 8
    X = np.zeros((n, d))
    X[:, -8] = md_temperature
    X[:, -7] = md_pressure
    for i in range(n):
        cation = Chem.MolFromSmiles(cation_smi[i])
        anion = Chem.MolFromSmiles(anion_smi[i])
        X[i][:D] = calc.CalcDescriptors(cation)
        X[i][D:2 * D] = calc.CalcDescriptors(anion)
    X[:, -5] = density
    X[:, -6] = cpt
    cols_cat = [s + "-cation" for s in Deslist]
    cols_ani = [s + "-anion" for s in Deslist]
    # The heat-capacity label must match the name used for `prop`/`to_drop`
    # below (with a space after the comma); otherwise the MD values end up in
    # a separate column that is discarded when selecting the model columns.
    cols = cols_cat + cols_ani + [
        "Temperature, K", "Pressure, kPa",
        "Heat capacity at constant pressure, J/K/mol",
        "Specific density, kg/m<SUP>3</SUP>", "name-anion",
        "smiles-anion", "name-cation", "smiles-cation"
    ]
    X = pd.DataFrame(X, columns=cols)
    X.iloc[:, -4] = np.nan
    X.iloc[:, -2] = np.nan
    X.iloc[:, -3] = anion_smi
    X.iloc[:, -1] = cation_smi  # X is the df with the new simulation data
    new_MD_data_index = X.shape[0]  # plot new predictions after re-training

    devmodel = salty.aggregate_data(property_to_model,
                                    T=temperature,
                                    P=pressure,
                                    data_ranges=output_ranges,
                                    scale_center=False)
    cols = devmodel.Data.columns
    new_data = pd.concat([devmodel.Data, X])  # have to sort in future version

    if property_to_model == ['density']:
        prop = "Specific density, kg/m<SUP>3</SUP>"
        to_drop = "Heat capacity at constant pressure, J/K/mol"
    elif property_to_model == ['cpt']:
        to_drop = "Specific density, kg/m<SUP>3</SUP>"
        prop = "Heat capacity at constant pressure, J/K/mol"
    elif property_to_model == ["cpt", "density"]:
        prop = [
            "Heat capacity at constant pressure, J/K/mol",
            "Specific density, kg/m<SUP>3</SUP>"
        ]
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            "property_to_model must be ['cpt'], ['density'] or "
            "['cpt', 'density'], got {!r}".format(property_to_model))

    if property_to_model != ["cpt", "density"]:
        new_data.drop(columns=[to_drop], inplace=True)

    new_data = new_data[cols]
    new_data.reset_index(inplace=True, drop=True)

    if property_to_model == ["cpt", "density"]:
        exp_data = [prop[0], prop[1], "Temperature, K", "Pressure, kPa"]
    else:
        exp_data = [prop, "Temperature, K", "Pressure, kPa"]

    # Build the summary table describing the merged data set.
    merged = new_data
    unique_salts = merged["smiles-cation"] + merged["smiles-anion"]
    unique_cations = repr(merged["smiles-cation"].unique())
    unique_anions = repr(merged["smiles-anion"].unique())
    actual_data_ranges = []
    for i in range(len(exp_data)):
        actual_data_ranges.append("{} - {}".format(
            str(merged[exp_data[i]].min()), str(merged[exp_data[i]].max())))
    a = np.array([
        len(unique_salts.unique()), unique_cations, unique_anions,
        len(unique_salts)
    ])
    a = np.concatenate((a, actual_data_ranges))
    cols1 = ["Unique salts", "Cations", "Anions", "Total datapoints"]
    cols = cols1 + exp_data
    data_summary = pd.DataFrame(a, cols)
    merged = new_data
    metaDf = merged.select_dtypes(include=["object"])
    dataDf = merged.select_dtypes(include=[np.number])
    cols = dataDf.columns.tolist()
    instance = StandardScaler()
    # The last len(property_to_model) numeric columns are log-transformed;
    # all remaining numeric columns are standard-scaled.
    for i in range(1, len(property_to_model) + 1):
        dataDf.iloc[:, -i] = dataDf.iloc[:, -i].apply(lambda x: log(float(x)))

    scaled_data = pd.DataFrame(instance.fit_transform(
        dataDf.iloc[:, :-len(property_to_model)]),
                               columns=cols[:-len(property_to_model)])
    df = pd.concat(
        [scaled_data, dataDf.iloc[:, -len(property_to_model):], metaDf],
        axis=1)  # may have to sort in future version
    mean_std_of_coeffs = pd.DataFrame([instance.mean_, instance.scale_],
                                      columns=cols[:-len(property_to_model)])
    new_model = salty.dev_model(mean_std_of_coeffs, data_summary, df)
    print(new_model.Data_summary)
    return new_model, new_MD_data_index
# def CalcDescriptorsErrorCheck(mol):
#     if mol.GetNumHeavyAtoms() > 0:
#         print MDC.CalcDescriptors(calc, mol)
#         return MDC.CalcDescriptors(calc, mol)

# pool1 = Pool(4)
# pool2 = Pool(4)
# activesdesc = pool1.map(CalcDescriptorsErrorCheck, smallActiveSet)
# pool1.close()
# inactivedesc = pool2.map(CalcDescriptorsErrorCheck, smallInactiveSet)
# pool2.close()

# Compute descriptors for every active molecule that has heavy atoms.
# Failures are reported and skipped so one bad molecule does not stop the run.
for i, mol in enumerate(activeMolecules):
    try:
        if mol.GetNumHeavyAtoms() > 0:
            activesdesc.append(MDC.CalcDescriptors(calc, mol))
            if (i % LOG_EVERY_N) == 0:
                # print() with one argument works on both Python 2 and 3.
                print(str(i) + " molecules processed")
    except Exception:
        # Catch ordinary errors only; KeyboardInterrupt / SystemExit propagate.
        print("Error while processing ")

# Same procedure for the inactive molecules.
inactivedesc = []
for i, mol in enumerate(inactiveMolecules):
    try:
        if mol.GetNumHeavyAtoms() > 0:
            inactivedesc.append(MDC.CalcDescriptors(calc, mol))
            if (i % LOG_EVERY_N) == 0:
                print(str(i) + " molecules processed")
    except Exception:
        print("Error while processing ")
Exemple #14
0
class PhysChemFeaturizer(RDKitFeaturizer):
    """
    MolFeaturizer that featurizes a molecule with an array of phys-chem properties.

    @see http://www.rdkit.org/Python_Docs/rdkit.ML.Descriptors.MoleculeDescriptors-module.html
    For available descriptors @see http://rdkit.org/docs/source/rdkit.ML.Descriptors.MoleculeDescriptors.html
    """
    def __init__(
        self,
        descriptors: Optional[List[str]] = None,
        named_descriptor_set: str = 'all',
        fingerprint_extra_args: Optional[dict] = None,
        normalise: bool = False,
        subset_size: int = 200,
    ):
        """
        Args:
            descriptors: list of descriptor names -
                the subset given is validated to make sure they exist and will be used.
                ``None`` (the default) is treated as an empty list.
            named_descriptor_set: 'all' or 'simple' to use preset subsets
            fingerprint_extra_args: optional kwargs for `MolecularDescriptorCalculator`
            normalise: if True, a `PhyschemScaler` is built from the loaded
                distributions and stored as `self.scaler`
            subset_size: number of descriptors to return (or the size of the subset if that's smaller)
        """
        super().__init__()

        # A literal `[]` default would be one list object shared by every
        # call; create a fresh one per instance instead.
        if descriptors is None:
            descriptors = []
        if fingerprint_extra_args is None:
            fingerprint_extra_args = {}

        self.descriptors = self._get_descriptor_list(
            named_descriptor_set=named_descriptor_set,
            descriptor_list=descriptors,
            subset_size=subset_size)

        self.fingerprint_extra_args = fingerprint_extra_args
        self.calc = MolecularDescriptorCalculator(
            self.descriptors, **self.fingerprint_extra_args)
        self.normalise = normalise

        # The distributions file is looked up relative to this source file.
        distributions_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '../data/physchem_distributions.json')

        with open(distributions_path) as fp:
            self.distributions = json.load(fp)

        if self.normalise:
            self.scaler = PhyschemScaler(descriptor_list=self.descriptors,
                                         dists=self.distributions)

    @staticmethod
    def get_descriptor_subset(subset: str, subset_size: int) -> List[str]:
        """Return the descriptor names of the preset ``subset``.

        For every subset except 'uncorrelated' the full preset list is fetched
        and cut to its first ``subset_size`` names; 'uncorrelated' passes
        ``subset_size`` to its getter instead.

        Raises:
            ValueError: if ``subset`` is not a recognised preset name.
        """
        # The one preset whose getter takes the size itself.
        if subset == 'uncorrelated':
            return PhysChemFeaturizer.get_uncorrelated_descriptor_subset(
                subset_size)

        # Preset name -> name of the static method providing the full list.
        sliced_getters = (
            ('all', 'get_all_descriptor_names'),
            ('simple', 'get_simple_descriptor_subset'),
            ('fragment', 'get_fragment_descriptor_subset'),
            ('graph', 'get_graph_descriptor_subset'),
            ('surface', 'get_surface_descriptor_subset'),
            ('druglikeness', 'get_druglikeness_descriptor_subset'),
            ('logp', 'get_logp_descriptor_subset'),
            ('refractivity', 'get_refractivity_descriptor_subset'),
            ('estate', 'get_estate_descriptor_subset'),
            ('charge', 'get_charge_descriptor_subset'),
            ('general', 'get_general_descriptor_subset'),
        )
        for preset, getter_name in sliced_getters:
            if subset == preset:
                getter = getattr(PhysChemFeaturizer, getter_name)
                return getter()[:subset_size]

        raise ValueError(
            f'Unrecognised descriptor subset: {subset} (should be "all", "simple",'
            f'"uncorrelated", "fragment", "graph", "logp", "refractivity",'
            f'"estate", "druglikeness", "surface", "charge", "general").')

    @property
    def output_size(self):
        return len(self.descriptors)

    def transform(self,
                  molecules: Sequence[str]) -> Tuple[np.ndarray, np.ndarray]:
        """Featurize ``molecules`` by delegating to the parent class.

        The ``(features, valids)`` pair produced by the parent's ``transform``
        is returned unchanged.
        """
        feature_matrix, valid_flags = super().transform(molecules)
        return feature_matrix, valid_flags

    def transform_single(self, molecule: str) -> Tuple[np.ndarray, bool]:
        """Featurize one molecule by delegating to the parent class.

        The ``(features, valid)`` pair produced by the parent's
        ``transform_single`` is returned unchanged.
        """
        feature_vector, is_valid = super().transform_single(molecule)
        return feature_vector, is_valid

    def transform_mol(self,
                      molecule: Chem.rdchem.Mol) -> Tuple[np.ndarray, bool]:
        """Compute the configured descriptors for one RDKit molecule.

        Non-finite values (NaN, +/-inf) are replaced by 0. When
        ``self.normalise`` is set, the result is passed through
        ``self.scaler.transform_single``. The validity flag is always True.
        """
        values = np.array(self.calc.CalcDescriptors(molecule))
        # Zero out every entry that is not a finite number.
        values[~np.isfinite(values)] = 0
        values = rdkit_dense_array_to_np(values, dtype=float)
        if self.normalise:
            values = self.scaler.transform_single(values)
        return values, True

    def is_valid_single(self, molecule: str) -> bool:
        """Return the validity flag that ``transform_single`` reports for ``molecule``."""
        _features, is_valid = self.transform_single(molecule)
        return is_valid

    # control pickling / unpickling
    def __getstate__(self):
        return {
            'descriptors': self.descriptors,
            'fingerprint_extra_args': self.fingerprint_extra_args,
            'normalise': self.normalise,
        }

    def __setstate__(self, saved_dict):
        """Rebuild the instance by re-running ``__init__`` with the saved arguments."""
        restored_kwargs = {
            key: saved_dict[key]
            for key in ('descriptors', 'fingerprint_extra_args', 'normalise')
        }
        # ignore mypy check: calling __init__ directly as a form of reflection during unpickling (not called by default)
        self.__init__(**restored_kwargs)  # type: ignore

    @staticmethod
    def get_all_descriptor_names() -> List[str]:
        """
        Return the sorted names of all entries in ``Descriptors._descList``.
        A custom subset of these names can be passed as the list of descriptors.
        """
        return sorted(entry[0] for entry in Descriptors._descList)

    @staticmethod
    def get_simple_descriptor_subset() -> List[str]:
        """Return the eight descriptor names of the 'simple' preset."""
        simple_names = (
            'FpDensityMorgan2 FractionCSP3 MolLogP MolWt '
            'NumHAcceptors NumHDonors NumRotatableBonds TPSA'
        )
        return simple_names.split()

    @staticmethod
    def get_refractivity_descriptor_subset() -> List[str]:
        """Return 'MolMR' followed by the ten SMR_VSA bin descriptor names."""
        # Sorting the names as strings yields the bin order 1, 10, 2, ..., 9.
        smr_vsa_bins = sorted(f'SMR_VSA{bin_no}' for bin_no in range(1, 11))
        return ['MolMR'] + smr_vsa_bins

    @staticmethod
    def get_logp_descriptor_subset() -> List[str]:
        """LogP descriptors and VSA/LogP descriptors
        SlogP_VSA: VSA of atoms contributing to a specified bin of SlogP
        """
        # Sorting the names as strings yields the bin order 1, 10, 11, 12, 2, ..., 9.
        slogp_vsa_bins = sorted(
            f'SlogP_VSA{bin_no}' for bin_no in range(1, 13))
        return ['MolLogP'] + slogp_vsa_bins

    @staticmethod
    def get_graph_descriptor_subset() -> List[str]:
        """ Graph descriptors (https://www.rdkit.org/docs/source/rdkit.Chem.GraphDescriptors.html) """
        leading = ['BalabanJ', 'BertzCT']
        chi_family = [
            'Chi0', 'Chi0n', 'Chi0v',
            'Chi1', 'Chi1n', 'Chi1v',
            'Chi2n', 'Chi2v',
            'Chi3n', 'Chi3v',
            'Chi4n', 'Chi4v',
        ]
        trailing = ['HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3']
        return leading + chi_family + trailing

    @staticmethod
    def get_surface_descriptor_subset() -> List[str]:
        """MOE-like surface descriptors
        EState_VSA: VSA (van der Waals surface area) of atoms contributing to a specified bin of e-state
        SlogP_VSA: VSA of atoms contributing to a specified bin of SlogP
        SMR_VSA: VSA of atoms contributing to a specified bin of molar refractivity
        PEOE_VSA: VSA of atoms contributing to a specified bin of partial charge (Gasteiger)
        LabuteASA: Labute's approximate surface area descriptor
        """

        def binned(prefix, bin_count):
            # Names for bins 1..bin_count, sorted as strings (1, 10, 11, ..., 2, ...).
            return sorted(f'{prefix}{bin_no}'
                          for bin_no in range(1, bin_count + 1))

        return (binned('SlogP_VSA', 12) + binned('SMR_VSA', 10) +
                binned('EState_VSA', 11) + ['LabuteASA'] +
                binned('PEOE_VSA', 14) + ['TPSA'])

    @staticmethod
    def get_druglikeness_descriptor_subset() -> List[str]:
        """ Descriptors commonly used to assess druglikeness"""
        whole_molecule = [
            'TPSA', 'MolLogP', 'MolMR', 'ExactMolWt', 'FractionCSP3',
            'HeavyAtomCount', 'MolWt',
        ]
        counts = [
            'NHOHCount', 'NOCount',
            'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
            'NumAliphaticRings',
            'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
            'NumAromaticRings',
            'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
            'NumRotatableBonds',
            'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles',
            'NumSaturatedRings',
            'RingCount',
        ]
        return whole_molecule + counts + ['qed']

    @staticmethod
    def get_fragment_descriptor_subset() -> List[str]:
        """Return the descriptor names of the 'fragment' preset.

        The list starts with atom/ring count descriptors and continues with
        the ``fr_*`` descriptor names. Order matters: callers slice this list
        to its first ``subset_size`` entries.
        """
        return [
            'NHOHCount',
            'NOCount',
            'NumAliphaticCarbocycles',
            'NumAliphaticHeterocycles',
            'NumAliphaticRings',
            'NumAromaticCarbocycles',
            'NumAromaticHeterocycles',
            'NumAromaticRings',
            'NumHAcceptors',
            'NumHDonors',
            'NumHeteroatoms',
            'NumRotatableBonds',
            'NumSaturatedCarbocycles',
            'NumSaturatedHeterocycles',
            'NumSaturatedRings',
            'RingCount',
            'fr_Al_COO',
            'fr_Al_OH',
            'fr_Al_OH_noTert',
            'fr_ArN',
            'fr_Ar_COO',
            'fr_Ar_N',
            'fr_Ar_NH',
            'fr_Ar_OH',
            'fr_COO',
            'fr_COO2',
            'fr_C_O',
            'fr_C_O_noCOO',
            'fr_C_S',
            'fr_HOCCN',
            'fr_Imine',
            'fr_NH0',
            'fr_NH1',
            'fr_NH2',
            'fr_N_O',
            'fr_Ndealkylation1',
            'fr_Ndealkylation2',
            'fr_Nhpyrrole',
            'fr_SH',
            'fr_aldehyde',
            'fr_alkyl_carbamate',
            'fr_alkyl_halide',
            'fr_allylic_oxid',
            'fr_amide',
            'fr_amidine',
            'fr_aniline',
            'fr_aryl_methyl',
            'fr_azide',
            'fr_azo',
            'fr_barbitur',
            'fr_benzene',
            'fr_benzodiazepine',
            'fr_bicyclic',
            'fr_diazo',
            'fr_dihydropyridine',
            'fr_epoxide',
            'fr_ester',
            'fr_ether',
            'fr_furan',
            'fr_guanido',
            'fr_halogen',
            'fr_hdrzine',
            'fr_hdrzone',
            'fr_imidazole',
            'fr_imide',
            'fr_isocyan',
            'fr_isothiocyan',
            'fr_ketone',
            'fr_ketone_Topliss',
            'fr_lactam',
            'fr_lactone',
            'fr_methoxy',
            'fr_morpholine',
            'fr_nitrile',
            'fr_nitro',
            'fr_nitro_arom',
            'fr_nitro_arom_nonortho',
            'fr_nitroso',
            'fr_oxazole',
            'fr_oxime',
            'fr_para_hydroxylation',
            'fr_phenol',
            'fr_phenol_noOrthoHbond',
            'fr_phos_acid',
            'fr_phos_ester',
            'fr_piperdine',
            'fr_piperzine',
            'fr_priamide',
            'fr_prisulfonamd',
            'fr_pyridine',
            'fr_quatN',
            'fr_sulfide',
            'fr_sulfonamd',
            'fr_sulfone',
            'fr_term_acetylene',
            'fr_tetrazole',
            'fr_thiazole',
            'fr_thiocyan',
            'fr_thiophene',
            'fr_unbrch_alkane',
            'fr_urea',
        ]

    @staticmethod
    def get_estate_descriptor_subset() -> List[str]:
        """Electrotopological state (e-state) and VSA/e-state descriptors
        EState_VSA: VSA (van der Waals surface area) of atoms contributing to a specified bin of e-state
        VSA_EState: e-state values of atoms contributing to a specific bin of VSA
        """
        # Sorting the names as strings yields the bin order 1, 10, 11, 2, ..., 9.
        estate_vsa_bins = sorted(
            f'EState_VSA{bin_no}' for bin_no in range(1, 12))
        vsa_estate_bins = sorted(
            f'VSA_EState{bin_no}' for bin_no in range(1, 11))
        estate_indices = [
            'MaxAbsEStateIndex',
            'MaxEStateIndex',
            'MinAbsEStateIndex',
            'MinEStateIndex',
        ]
        return estate_vsa_bins + vsa_estate_bins + estate_indices

    @staticmethod
    def get_charge_descriptor_subset() -> List[str]:
        """
        Names of the partial charge and VSA/charge descriptors.

        PEOE: Partial Equalization of Orbital Electronegativities (Gasteiger partial atomic charges)
        PEOE_VSA<n>: VSA of atoms contributing to partial-charge bin n

        The PEOE_VSA names come first, in lexicographic order ('PEOE_VSA10'
        before 'PEOE_VSA2'), followed by the max/min (absolute) partial charges.
        """
        # Plain string sorting yields the bin order 1, 10, 11, ..., 14, 2, ..., 9.
        peoe_vsa = sorted('PEOE_VSA%d' % bin_no for bin_no in range(1, 15))
        partial_charges = [
            'MaxAbsPartialCharge',
            'MaxPartialCharge',
            'MinAbsPartialCharge',
            'MinPartialCharge',
        ]
        return peoe_vsa + partial_charges

    @staticmethod
    def get_general_descriptor_subset() -> List[str]:
        """ Descriptors from https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html

        The names are assembled group by group; the resulting order is fixed.
        """
        partial_charges = [
            'MaxAbsPartialCharge',
            'MaxPartialCharge',
            'MinAbsPartialCharge',
            'MinPartialCharge',
        ]
        molecular_weights = ['ExactMolWt', 'MolWt']
        morgan_densities = ['FpDensityMorgan%d' % n for n in (1, 2, 3)]

        names = partial_charges + molecular_weights + morgan_densities
        names.extend(['HeavyAtomMolWt', 'NumRadicalElectrons', 'NumValenceElectrons'])
        return names

    @staticmethod
    def get_uncorrelated_descriptor_subset(subset_size: int) -> List[str]:
        """
        Column names are sorted starting with the non-informative descriptors, then the rest are ordered
        from most correlated to least correlated. This will return the n least correlated descriptors,
        i.e. the last `subset_size` entries of that ordering.

        Args:
            subset_size: how many to return. Values larger than the number of known
                descriptors return all of them; zero or a negative value returns an empty list.

        Returns:
            List of descriptors (a new list on every call)
        """
        columns_sorted_by_correlation = [
            'fr_sulfone',
            'MinPartialCharge',
            'fr_C_O_noCOO',
            'fr_hdrzine',
            'fr_Ndealkylation2',
            'NumAromaticHeterocycles',
            'fr_N_O',
            'fr_piperdine',
            'fr_HOCCN',
            'fr_Nhpyrrole',
            'NumHAcceptors',
            'NumHeteroatoms',
            'fr_C_O',
            'VSA_EState5',
            'fr_Al_OH',
            'SlogP_VSA9',
            'fr_benzodiazepine',
            'VSA_EState6',
            'fr_Ar_N',
            'VSA_EState7',
            'fr_COO2',
            'VSA_EState3',
            'fr_Imine',
            'fr_sulfide',
            'FractionCSP3',
            'fr_imidazole',
            'fr_azo',
            'NumHDonors',
            'fr_COO',
            'fr_ether',
            'fr_nitro',
            'NumSaturatedHeterocycles',
            'fr_lactam',
            'fr_aniline',
            'NumAliphaticCarbocycles',
            'fr_para_hydroxylation',
            'SMR_VSA2',
            'MaxAbsPartialCharge',
            'fr_thiocyan',
            'NHOHCount',
            'fr_ester',
            'fr_aldehyde',
            'SMR_VSA8',
            'fr_halogen',
            'fr_NH0',
            'fr_furan',
            'fr_tetrazole',
            'HeavyAtomCount',
            'NumRotatableBonds',
            'NumSaturatedCarbocycles',
            'fr_SH',
            'fr_Ar_NH',
            'SlogP_VSA7',
            'fr_ketone',
            'fr_alkyl_halide',
            'fr_NH1',
            'NumRadicalElectrons',
            'MaxPartialCharge',
            'fr_ArN',
            'fr_imide',
            'fr_priamide',
            'fr_hdrzone',
            'fr_azide',
            'NumAromaticCarbocycles',
            'NOCount',
            'fr_isocyan',
            'RingCount',
            'fr_nitroso',
            'EState_VSA11',
            'MinAbsPartialCharge',
            'fr_Ar_COO',
            'fr_prisulfonamd',
            'fr_sulfonamd',
            'VSA_EState4',
            'fr_quatN',
            'fr_NH2',
            'fr_epoxide',
            'fr_allylic_oxid',
            'fr_piperzine',
            'VSA_EState1',
            'NumAliphaticHeterocycles',
            'fr_Ndealkylation1',
            'fr_Al_OH_noTert',
            'fr_aryl_methyl',
            'NumAromaticRings',
            'fr_bicyclic',
            'fr_methoxy',
            'fr_oxazole',
            'fr_barbitur',
            'NumAliphaticRings',
            'fr_Ar_OH',
            'fr_phos_ester',
            'fr_thiophene',
            'fr_nitrile',
            'fr_dihydropyridine',
            'VSA_EState2',
            'fr_nitro_arom',
            'SlogP_VSA11',
            'fr_thiazole',
            'fr_ketone_Topliss',
            'fr_term_acetylene',
            'fr_isothiocyan',
            'fr_urea',
            'fr_nitro_arom_nonortho',
            'fr_lactone',
            'fr_diazo',
            'fr_amide',
            'fr_alkyl_carbamate',
            'fr_Al_COO',
            'fr_amidine',
            'fr_phos_acid',
            'fr_oxime',
            'fr_guanido',
            'fr_C_S',
            'NumSaturatedRings',
            'fr_benzene',
            'fr_phenol',
            'fr_unbrch_alkane',
            'fr_phenol_noOrthoHbond',
            'fr_pyridine',
            'fr_morpholine',
            'MaxAbsEStateIndex',
            'ExactMolWt',
            'MolWt',
            'Chi0',
            'LabuteASA',
            'Chi0n',
            'NumValenceElectrons',
            'Chi3n',
            'Chi0v',
            'Chi3v',
            'Chi1',
            'Chi1n',
            'Chi1v',
            'FpDensityMorgan2',
            'HeavyAtomMolWt',
            'Kappa1',
            'SMR_VSA7',
            'Chi2n',
            'Chi2v',
            'Kappa2',
            'Chi4n',
            'SMR_VSA5',
            'MolMR',
            'EState_VSA10',
            'BertzCT',
            'MinEStateIndex',
            'SMR_VSA1',
            'FpDensityMorgan1',
            'VSA_EState10',
            'SlogP_VSA2',
            'SMR_VSA10',
            'HallKierAlpha',
            'VSA_EState9',
            'TPSA',
            'MaxEStateIndex',
            'Chi4v',
            'SMR_VSA4',
            'MolLogP',
            'qed',
            'VSA_EState8',
            'EState_VSA2',
            'SMR_VSA6',
            'PEOE_VSA1',
            'EState_VSA1',
            'SlogP_VSA8',
            'SlogP_VSA6',
            'SlogP_VSA5',
            'SlogP_VSA10',
            'BalabanJ',
            'Kappa3',
            'EState_VSA4',
            'PEOE_VSA6',
            'EState_VSA9',
            'PEOE_VSA2',
            'PEOE_VSA5',
            'SMR_VSA3',
            'SlogP_VSA3',
            'EState_VSA7',
            'EState_VSA3',
            'PEOE_VSA7',
            'SlogP_VSA1',
            'SMR_VSA9',
            'EState_VSA8',
            'EState_VSA6',
            'PEOE_VSA3',
            'MinAbsEStateIndex',
            'PEOE_VSA14',
            'FpDensityMorgan3',
            'PEOE_VSA12',
            'SlogP_VSA4',
            'PEOE_VSA9',
            'PEOE_VSA13',
            'PEOE_VSA10',
            'PEOE_VSA8',
            'EState_VSA5',
            'SlogP_VSA12',
            'PEOE_VSA4',
            'Ipc',
            'PEOE_VSA11',
        ]

        # A bare `[-subset_size:]` would return the WHOLE list for subset_size == 0
        # (since -0 == 0) and a head-trimmed list for negative sizes. Computing the
        # start index explicitly and clamping it to the list bounds avoids both.
        total = len(columns_sorted_by_correlation)
        first_kept = min(max(total - subset_size, 0), total)
        return columns_sorted_by_correlation[first_kept:]

    @staticmethod
    def _get_descriptor_list(named_descriptor_set: str = 'all',
                             descriptor_list: List[str] = [],
                             subset_size: int = 200) -> List[str]:
        """Resolve the descriptors to use and return their names in sorted order.

        Args:
            named_descriptor_set: name of a descriptor set; only used when
                `descriptor_list` is empty, in which case it is forwarded to
                `PhysChemFeaturizer.get_descriptor_subset`
            descriptor_list: explicit descriptor names; when non-empty it takes
                precedence over `named_descriptor_set`
            subset_size: forwarded to `PhysChemFeaturizer.get_descriptor_subset`
                together with `named_descriptor_set`

        Returns:
            A new sorted list of descriptor names. The list passed in by the
            caller is never modified.

        Raises:
            AssertionError: if a non-empty `descriptor_list` is not a `list` or
                contains names missing from
                `PhysChemFeaturizer.get_all_descriptor_names()`
        """
        if len(descriptor_list) == 0:
            descriptor_list = PhysChemFeaturizer.get_descriptor_subset(
                named_descriptor_set, subset_size)
        else:  # else use the descriptor_list given by the user
            assert isinstance(descriptor_list, list), \
                'descriptor_list must be a list, got {}'.format(
                    type(descriptor_list).__name__)

            all_descriptors = set(
                PhysChemFeaturizer.get_all_descriptor_names())
            unknown = set(descriptor_list) - all_descriptors
            assert not unknown, \
                'Unknown descriptor names: {}'.format(unknown)

        # sorted() builds a fresh list, so neither the caller's list nor the
        # shared mutable default argument is reordered in place.
        return sorted(descriptor_list)
Exemple #15
0
                      'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO',
                      'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O',
                      'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde',
                      'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide',
                      'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azo', 'fr_barbitur',
                      'fr_benzene', 'fr_bicyclic', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester',
                      'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone',
                      'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone',
                      'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine',
                      'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitroso', 'fr_oxazole',
                      'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond',
                      'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_pyridine', 'fr_quatN',
                      'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
                      'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_urea']

# Module-level descriptor calculator built from the names in LigandDescriptors.
DescCalc = MolecularDescriptorCalculator(LigandDescriptors)


# ### An atom type from ECIF is defined as:
#     Atom symbol;
#     Explicit valence;
#     Attached heavy atoms;
#     Attached hydrogens;
#     Aromaticity;
#     Ring membership

# In[5]:


def GetAtomType(atom):
# This function takes an atom in a molecule and returns its type as defined for ECIF