Exemple #1
0
 def getChemicalDescriptors(
     self, data_file, destpath, filename
 ):  # datafile(csv file) destpath(where to save) # filename (name of file to save)
     test1 = pd.read_csv(data_file)
     for i in range(len(test1)):
         try:
             temp = test1["Smiles"][i]
             descriptors = from_smiles(temp)
         except RuntimeError:
             temp = test1["Smiles"][i]
             descriptors = from_smiles(temp, timeout=30)
         if i == 0:
             df = pd.DataFrame(descriptors,
                               columns=descriptors.keys(),
                               index=[0])
         else:
             df1 = pd.DataFrame(descriptors,
                                columns=descriptors.keys(),
                                index=[i])
         if i is 1:
             ff = pd.concat([df, df1], axis=0)
         if i > 1:
             ff = pd.concat([ff, df1], axis=0)
     ff = pd.concat([test1['Activation Status'], ff, test1['Ligand']],
                    axis=1)
     ff.to_csv(destpath + filename + ".csv", index=False)
Exemple #2
0
    def use(self, smiles: list, backend: str = 'padel'):
        ''' use: uses the trained project to predict values for supplied
        molecules

        Args:
            smiles (list): list of SMILES strings to predict for
            backend (str): backend software to use for QSPR generation; `padel`
                or `alvadesc`; default = `padel`; alvadesc requires valid
                license

        Returns:
            numpy.array: predicted values, averaged over all models of the
                project (mean along axis 0)

        Raises:
            ValueError: if `backend` is neither `padel` nor `alvadesc`
        '''

        if backend == 'alvadesc':
            mols = [smiles_to_descriptors(s) for s in smiles]
        elif backend == 'padel':
            mols = [from_smiles(s) for s in smiles]
        else:
            raise ValueError('Unknown backend software: {}'.format(backend))

        # Missing descriptor values ('na' or an empty string) are treated as
        # 0 for both backends, the same substitution create_db applies when
        # a database is built; float('') would otherwise raise.
        input_names = self._df._input_names
        rows = [[
            0.0 if mol[name] == 'na' or mol[name] == '' else float(mol[name])
            for name in input_names
        ] for mol in mols]

        # A fresh array is handed to every model, as in the original code.
        return mean([model.use(asarray(rows)) for model in self._models],
                    axis=0)
Exemple #3
0
def Pubchem_FP(smi, **kwargs):
    """Return the values produced by ``from_smiles`` as an 881-element list.

    Args:
        smi: a SMILES string, or any other object, which is first converted
            with ``MolToSmiles`` (presumably an RDKit molecule -- confirm
            against callers).
        **kwargs: forwarded unchanged to ``from_smiles``. NOTE(review): the
            length check only passes when exactly 881 values come back, so
            callers presumably request fingerprints only -- confirm.

    Returns:
        list: the values of the mapping returned by ``from_smiles``, in its
        iteration order.

    Raises:
        AssertionError: if the number of values is not 881.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which the
    # previous exact type comparison wrongly routed through MolToSmiles.
    if not isinstance(smi, str):
        smi = MolToSmiles(smi)
    fp = from_smiles(smi, **kwargs)
    lst_as_vect = list(fp.values())
    # Raised explicitly so the check is not stripped under ``python -O``;
    # the exception type is unchanged for existing callers.
    if len(lst_as_vect) != 881:
        raise AssertionError(
            'expected 881 fingerprint values, got {}'.format(
                len(lst_as_vect)))
    return lst_as_vect
Exemple #4
0
    def generate_padel_features_serially(self, java_path):
        """Compute PaDEL features one datapoint at a time.

        Calls ``from_smiles`` (``timeout=60``) on every value of the
        ``SMILES`` column of ``self.ml_pipeline.data``. A datapoint whose
        call raises ``RuntimeError`` is logged and skipped; because the
        result is joined to the ``CNAME`` / ``Activation Status`` columns on
        the index, such a datapoint keeps its row with missing feature
        values.

        Args:
            java_path: forwarded to ``from_smiles`` as ``java_path``.

        Returns:
            pandas.DataFrame: feature columns followed by ``CNAME`` and
            ``Activation Status``. The same frame is also stored back into
            ``self.ml_pipeline.data``.
        """
        self.jlogger.info("Inside generate_padel_features_serial")

        test1 = self.ml_pipeline.data

        # Successful results and their index labels are collected and
        # assembled once, so failures on the first datapoints (or inputs
        # with fewer than two rows) no longer leave variables unbound.
        rows = []
        labels = []
        for i, smi in test1["SMILES"].items():
            self.jlogger.debug(
                "Generating Padel Features for datapoint {}".format(i))
            try:
                descriptors = from_smiles(smi,
                                          timeout=60,
                                          java_path=java_path)
            except RuntimeError:
                self.jlogger.error(
                    "Padel feature generation failed with timeout of 60 secs, datapoint {}"
                    .format(i))
                continue
            rows.append(descriptors)
            labels.append(i)

        if rows:
            ff = pd.DataFrame(rows, index=labels)
        else:
            ff = pd.DataFrame(index=test1.index)

        ff = pd.concat([ff, test1[['CNAME', 'Activation Status']]], axis=1)

        self.ml_pipeline.data = ff

        return ff
Exemple #5
0
def lower_heating_value(smiles: list) -> tuple:
    """Estimate lower heating values from C/H/O/S mass fractions.

    The ``nC``, ``nH``, ``nO`` and ``nS`` entries returned by
    ``from_smiles`` are weighted by the atomic masses below, normalised to
    mass fractions and passed to ``_dulong``.

    Args:
        smiles: SMILES strings, handed to ``from_smiles`` in one call.

    Returns:
        tuple: ``(lhv, errors)``, two lists of equal length, where each
        error is 3.8 % of the matching value.
    """
    atom_masses = (('nC', 12.011), ('nH', 1.008), ('nO', 15.999),
                   ('nS', 32.06))

    descriptors = from_smiles(smiles)
    # Mass contributed by each element, per molecule.
    element_masses = [[int(d[count]) * mass for count, mass in atom_masses]
                      for d in descriptors]
    molecule_totals = [sum(parts) for parts in element_masses]
    fractions = [[part / total for part in parts]
                 for parts, total in zip(element_masses, molecule_totals)]

    lhv = [_dulong(c, h, o, s) for c, h, o, s in fractions]
    # error assumed at 3.8%
    errors = [0.038 * value for value in lhv]
    return (lhv, errors)
Exemple #6
0
def _qspr_from_padel(smiles: List[str], timeout: int = None) -> Tuple[List[List[float]], List[str]]:
    """
    Args:
        smiles (list[str]): list of SMILES strings
        timeout (int, optional): timeout for PaDEL-Descriptor process call; if None, uses
        max(15, len(smiles)) seconds; default = None

    Returns:
        Tuple[List[List[float]], List[str]]: (descriptors w/ shape (n_compounds, n_desc),
            descriptor names)
    """

    if timeout is None:
        timeout = len(smiles)
    desc = from_smiles(smiles, timeout=max(15, len(smiles)))
    keys = list(desc[0].keys())
    for idx, d in enumerate(desc):
        for k in keys:
            if d[k] == '':
                desc[idx][k] = 0.0
    desc = [[float(d[k]) for k in keys] for d in desc]
    return (desc, keys)
Exemple #7
0
    def test_from_smiles(self):
        """A single SMILES ('CCC') yields 1875 entries with known MW and nC."""
        propane = from_smiles('CCC')

        self.assertEqual(len(propane), 1875)

        molecular_weight = float(propane['MW'])
        self.assertAlmostEqual(molecular_weight, 44.0626, places=4)

        carbon_count = int(propane['nC'])
        self.assertEqual(carbon_count, 3)
Exemple #8
0
    def test_multiple_smiles(self):
        """A list of two SMILES yields two results; the first has 1875 entries."""
        results = from_smiles(['CCC', 'CCCC'])

        self.assertEqual(len(results), 2)

        first = results[0]
        self.assertEqual(len(first), 1875)
Exemple #9
0
nms = [x[0] for x in Descriptors._descList]
print('\n')
calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
f = open('/scratch/woon/b3lyp_2017/datasmile2.txt')
for i, line in enumerate(f):
    mydes = []
    mylist = []
    line = line.split(',')
    number = str(line[0])
    print(number)
    line = line[1]
    m = Chem.MolFromSmiles(line)
    try:
        time.sleep(1)
        des = from_smiles(line, fingerprints=True, timeout=180)
        des = str(des).split(',')
        for ii in range(len(des)):
            b = des[ii].split(',')
            b = des[ii].strip('[').strip(']')
            b = re.sub('[^.,a-zA-Z0-9 \n\.]', '', b)
            b = b.replace('[', ' ')
            b = b.replace(']', ' ')
            b = b.strip()
            b = b.split(' ')
            mylist.append(b[0])
            try:
                b = b[1]
            except:
                b = ''
            if bool(b) == True:
Exemple #10
0
def create_db(smiles: list,
              db_name: str,
              targets: list = None,
              id_prefix: str = '',
              extra_strings: dict = None,
              backend: str = 'padel',
              convert_mdl: bool = False):
    ''' create_db: creates an ECNet-formatted database from SMILES strings
    using either PaDEL-Descriptor or alvaDesc software; using alvaDesc
    requires a valid installation/license of alvaDesc

    Molecules whose PaDEL descriptor calculation raises RuntimeError are
    omitted from the database with a RuntimeWarning. The supplied lists are
    not modified.

    Args:
        smiles (list): list of SMILES strings
        db_name (str): name/path of database being created
        targets (list): target (experimental) values, align with SMILES
            strings; if None, each molecule keeps the default target of
            its _Molecule object
        id_prefix (str): prefix of molecule DATAID, if desired
        extra_strings (dict): extra STRING columns, label = name, value = list
            with length equal to number of SMILES strings; None (default)
            means no extra columns
        backend (str): software used to calculate QSPR descriptors, 'padel' or
            'alvadesc'
        convert_mdl (bool): if `True`, converts SMILES strings to MDL 3D
            format before calculating descriptors (PaDEL only)

    Raises:
        ValueError: if `targets` or any `extra_strings` value differs in
            length from `smiles`, or if `backend` is unknown
        ImportError: if `convert_mdl` is True and pybel is not available
    '''

    # None replaces the former mutable `{}` default.
    if extra_strings is None:
        extra_strings = {}

    if targets is not None and len(targets) != len(smiles):
        raise ValueError('Must supply same number of targets as SMILES '
                         'strings: {}, {}'.format(len(targets),
                                                  len(smiles)))

    for string, values in extra_strings.items():
        if len(values) != len(smiles):
            # The column name fills the first placeholder; it was missing
            # before, which made .format() raise IndexError instead.
            raise ValueError('Extra string values for {} not equal in length '
                             'to supplied SMILES: {}, {}'.format(
                                 string, len(values), len(smiles)))

    if backend not in ('alvadesc', 'padel'):
        raise ValueError('Unknown backend software: {}'.format(backend))

    # Descriptors are computed for every molecule; `kept` records the input
    # positions that succeeded so the parallel lists can be filtered
    # afterwards instead of deleting from them while iterating (which
    # skipped the molecule following each failure).
    mols = []
    kept = []
    for idx, mol in enumerate(smiles):
        if backend == 'alvadesc':
            desc = smiles_to_descriptors(mol)
        elif convert_mdl is True:
            if pybel is None:
                raise ImportError(
                    'pybel (Python Open Babel wrapper) not installed, '
                    'cannot convert SMILES to MDL')
            mdl = pybel.readstring('smi', mol)
            mdl.make3D()
            curr_time = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
            mdl_path = '{}.mdl'.format(curr_time)
            mdl.write('mdl', mdl_path)
            try:
                desc = from_mdl(mdl_path)[0]
            except RuntimeError:
                desc = None
            finally:
                # Always clean up the temporary MDL file.
                remove(mdl_path)
        else:
            try:
                desc = from_smiles(mol)
            except RuntimeError:
                desc = None

        if desc is None:
            warn(
                'Could not calculate descriptors for {}, omitting'.
                format(mol), RuntimeWarning)
            continue
        mols.append(desc)
        kept.append(idx)

    # Local, filtered copies aligned with `mols`.
    smiles = [smiles[idx] for idx in kept]
    if targets is not None:
        targets = [targets[idx] for idx in kept]
    extra_strings = {
        string: [values[idx] for idx in kept]
        for string, values in extra_strings.items()
    }

    type_row = ['DATAID', 'ASSIGNMENT', 'STRING', 'STRING']
    title_row = ['DATAID', 'ASSIGNMENT', 'Compound Name', 'SMILES']
    strings = list(extra_strings.keys())
    for string in strings:
        # 'Compound Name' already has a fixed column in the header.
        if string != 'Compound Name':
            type_row.append('STRING')
            title_row.append(string)
    type_row.append('TARGET')
    title_row.append('TARGET')
    # Descriptor names come from the first molecule; none if nothing was
    # computed.
    descriptor_keys = list(mols[0].keys()) if mols else []
    for key in descriptor_keys:
        type_row.append('INPUT')
        title_row.append(key)

    mol_rows = []
    for idx, desc in enumerate(mols):
        # Missing descriptor values are stored as 0.
        for key in descriptor_keys:
            if desc[key] == 'na' or desc[key] == '':
                desc[key] = 0
        mol = _Molecule('{}'.format(id_prefix) + '%04d' % (idx + 1))
        for string in strings:
            mol.strings[string] = extra_strings[string][idx]
        if targets is not None:
            mol.target = targets[idx]
        mol.inputs = desc
        mol_rows.append(mol)

    with open(db_name, 'w', encoding='utf-8') as db_file:
        wr = writer(db_file, delimiter=',', lineterminator='\n')
        wr.writerow(type_row)
        wr.writerow(title_row)
        for idx, mol in enumerate(mol_rows):
            # NOTE(review): relies on _Molecule providing a
            # 'Compound Name' entry when the caller supplies none -- confirm.
            row = [
                mol.id, mol.assignment, mol.strings['Compound Name'],
                smiles[idx]
            ]
            for string in strings:
                if string != 'Compound Name':
                    row.append(mol.strings[string])
            row.append(mol.target)
            for key in descriptor_keys:
                row.append(mol.inputs[key])
            wr.writerow(row)
Exemple #11
0
#from zinc_id.get_zincid import get_zincid_from_smile

#zinc_id = get_zincid_from_smile("FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1")
#print(zinc_id)

from padelpy import from_smiles

# Calculate molecular descriptors for the SMILES string below. (This comment
# used to say "propane", but the SMILES passed here is not 'CCC'.)
descriptors = from_smiles(
    'FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1')

#print(type(descriptors))
#print(len(descriptors))
# in addition to descriptors, calculate PubChem fingerprints
#desc_fp = from_smiles('CCC', fingerprints=True)

# only calculate fingerprints
#fingerprints = from_smiles('CCC', fingerprints=True, descriptors=False)

# save descriptors to a CSV file
#_ = from_smiles('FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1', output_csv='descriptors.csv')

from tabula import read_pdf, convert_into

#convert_into("./data/smiles_pdf.pdf", "smiles.csv", output_format="csv", pages='all')
#zinc_id = get_zincid_from_smile("ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=CC=C2)=CC(Cl)=C1")
#print(zinc_id)

#print("arun")
Exemple #12
0
import sys
from padelpy import from_smiles
# Command-line script: takes one SMILES string as its first argument,
# computes its descriptors and writes the 'McGowan_Volume' value multiplied
# by 100 to volume_out.txt in the current directory.
if len(sys.argv) < 2:
    # Exit with a usage message instead of a bare IndexError traceback.
    sys.exit("usage: {} SMILES".format(sys.argv[0]))

smi = sys.argv[1]
descriptors = from_smiles(smi, timeout=30)
vol = descriptors['McGowan_Volume']
with open("volume_out.txt", "w") as f:
    # NOTE(review): the reason for the x100 scaling is not visible here;
    # presumably a unit conversion -- confirm with the consumer of this file.
    f.write(str(float(vol) * 100))