Esempio n. 1
0
def calculate_descriptors(drugs_file):
    """Run PaDEL on the drug SMILES and write raw and scaled descriptor CSVs.

    The raw PaDEL output is re-sorted by the integer embedded in each
    molecule's ``Name`` and written back to ``./Data/Clean/descriptors.csv``.
    A cleaned copy (placeholders zeroed, huge values capped, GESD outlier
    filter applied, all-zero columns dropped, every column scaled) is written
    to ``./Data/Clean/descriptors_scaled.csv``.

    Args:
        drugs_file: forwarded unchanged to ``find_smiles``; presumably that
            helper produces the ``smiles.smi`` file read below (verify
            against its definition).
    """
    raw_csv = './Data/Clean/descriptors.csv'
    scaled_csv = './Data/Clean/descriptors_scaled.csv'
    # Upper bound applied to every value before the numeric post-processing.
    ceiling = 10E100

    def to_float(column):
        # Convert one column to a float ndarray.
        return np.array(column).astype(float)

    print("Calculating descriptors..")
    find_smiles(drugs_file)

    # Compute 2D and 3D descriptors (no fingerprints) for the SMILES file.
    padeldescriptor(mol_dir='smiles.smi',
                    d_file=raw_csv,
                    convert3d=True,
                    retain3d=True,
                    d_2d=True,
                    d_3d=True,
                    fingerprints=False)

    # PaDEL may emit rows out of order; the third "_"-separated token of
    # Name is parsed as an integer and used as the sort key.
    table = pd.read_csv(raw_csv)
    table['Index'] = [int(parts[2]) for parts in table.Name.str.split("_")]
    table = table.set_index('Index', drop=True).sort_index()

    # Persist the re-ordered rows (the helper index itself is not written).
    table.to_csv(raw_csv, index=False)

    # Everything except the first column is treated as a descriptor value.
    feature_cols = list(table.columns)[1:]
    # Missing values and infinities become 0 (may need revisiting if the
    # model does not behave).
    placeholders = {
        np.nan: 0,
        '': 0,
        'Infinity': 0,
        '-Infinity': 0
    }
    table[feature_cols] = table[feature_cols].replace(placeholders)

    # NOTE(review): the float conversion runs over every column, including
    # the first one that was skipped above (presumably the textual Name) -
    # confirm this does not raise on real PaDEL output.
    table = table.apply(to_float)

    # Enormous values cause trouble in later numpy calculations, so cap them.
    table[table > ceiling] = ceiling

    # Outlier handling via the generalized Extreme Studentized Deviate test.
    table = table.apply(scikit_posthocs.outliers_gesd)

    # Keep only the columns that contain at least one non-zero entry.
    has_nonzero = (table != 0).any(axis=0)
    table = table.loc[:, has_nonzero]

    # Scale every remaining column and store the result.
    table = table.apply(scale_array)
    table.to_csv(scaled_csv, index=False)
Esempio n. 2
0
def krfp(smi):
    """Compute the Klekota-Roth fingerprint of one SMILES string with padelpy.

    PaDEL needs a ``descriptors.xml`` with KlekotaRothFingerprinter enabled in
    the running directory. To avoid copying and deleting that file for every
    molecule, an existing file is accepted when its MD5 matches the expected
    digest; a missing file is copied from this module's directory.

    Args:
        smi: SMILES string written verbatim to the temporary input file.

    Returns:
        1-D integer array built from every column of the PaDEL output except
        the first one.

    Raises:
        RuntimeError: ``descriptors.xml`` exists in the running directory but
            its content does not match the expected digest.
    """
    expected_md5 = 'f6145f57ff346599b907b044316c4e71'
    config_name = 'descriptors.xml'
    config_path = os.path.join(os.getcwd(), config_name)

    try:
        with open(config_path, 'r') as handle:
            config_text = handle.read()
    except FileNotFoundError:
        # No config in the running directory yet: copy the one shipped next
        # to this source file.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        shutil.copyfile(os.path.join(module_dir, config_name), config_path)
    else:
        # The file is there; refuse to continue with unexpected content.
        if hashlib.md5(config_text.encode('utf-8')).hexdigest() != expected_md5:
            raise RuntimeError("The descriptors.xml was found in the running directory but its content doesn't match the prototype content. Aborting.")

    # descriptors.xml is in place, so the representation can be calculated.
    # On prometheus SCRATCH is used; elsewhere the default temp location.
    scratch_root = os.getenv('SCRATCH', None)
    with tempfile.TemporaryDirectory(dir=scratch_root) as workdir:
        smi_path = os.path.join(workdir, "molecules.smi")
        csv_path = os.path.join(workdir, "out.csv")
        with open(smi_path, 'w') as smi_out:
            smi_out.write(smi)
        padeldescriptor(mol_dir=smi_path, d_file=csv_path, fingerprints=True, retainorder=True)
        table = pd.read_csv(csv_path)
        # Drop the first column and flatten the rest into integers.
        bits = table.values[:, 1:].reshape((-1)).astype(int)
    return bits
Esempio n. 3
0
def from_smiles(smiles,
                output_csv: str = None,
                descriptors: bool = True,
                fingerprints: bool = False,
                timeout: int = 60) -> OrderedDict:
    ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints

    The SMILES are written to a timestamp-named `.smi` file in the current
    directory, PaDEL-Descriptor is run on it (up to three attempts), and the
    resulting CSV is parsed. The `.smi` file, and the CSV when `output_csv`
    was not supplied, are removed on every exit path.

    Args:
        smiles (str, list): SMILES string for a given molecule, or a list of
            SMILES strings
        output_csv (str): if supplied, saves descriptors to this CSV file
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list or dict: if a list of SMILES strings is provided, returns a
            list of dicts, else a single dict; each dict contains labels and
            values for each descriptor generated for the molecule (the
            `Name` column is removed)

    Raises:
        RuntimeError: `smiles` is neither a string nor a list, PaDEL failed
            on all three attempts, or PaDEL produced fewer rows than
            molecules supplied
        TypeError: `smiles` is a list containing non-string items
    '''

    # Validate and render the input BEFORE creating any file, so that a bad
    # argument does not leave an empty .smi file behind.
    if isinstance(smiles, str):
        smi_text = smiles
    elif isinstance(smiles, list):
        smi_text = '\n'.join(smiles)
    else:
        raise RuntimeError('Unknown input format for `smiles`: {}'.format(
            type(smiles)))

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
    smi_path = '{}.smi'.format(timestamp)

    # Without a caller-supplied path the CSV is a temporary artefact.
    save_csv = output_csv is not None
    if not save_csv:
        output_csv = '{}.csv'.format(timestamp)

    try:
        with open(smi_path, 'w') as smi_file:
            smi_file.write(smi_text)

        for attempt in range(3):
            try:
                padeldescriptor(mol_dir=smi_path,
                                d_file=output_csv,
                                convert3d=True,
                                retain3d=True,
                                d_2d=descriptors,
                                d_3d=descriptors,
                                fingerprints=fingerprints,
                                sp_timeout=timeout)
                break
            except RuntimeError as exception:
                if attempt == 2:
                    if not save_csv:
                        # Short pause before the temporary CSV is removed in
                        # the `finally` clause (kept from the original
                        # failure path; presumably lets the PaDEL process
                        # release the file).
                        sleep(0.5)
                    raise RuntimeError(exception) from exception

        with open(output_csv, 'r', encoding='utf-8') as desc_file:
            rows = list(DictReader(desc_file))
    finally:
        # Best-effort cleanup that runs on success and on any failure.
        try:
            remove(smi_path)
        except FileNotFoundError:
            pass
        if not save_csv:
            try:
                remove(output_csv)
            except FileNotFoundError as e:
                warnings.warn(str(e), RuntimeWarning)

    if isinstance(smiles, list) and len(rows) != len(smiles):
        raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' +
                           ' Ensure the input structures are correct.')
    elif isinstance(smiles, str) and len(rows) == 0:
        raise RuntimeError('PaDEL-Descriptor failed on {}.'.format(smiles) +
                           ' Ensure input structure is correct.')

    for idx, row in enumerate(rows):
        if len(row) == 0:
            # Report the whole string for single-molecule input instead of
            # indexing into its characters.
            failed = smiles if isinstance(smiles, str) else smiles[idx]
            raise RuntimeError(
                'PaDEL-Descriptor failed on {}.'.format(failed) +
                ' Ensure input structure is correct.')

    # `Name` is PaDEL's molecule label, not a descriptor value.
    for row in rows:
        del row['Name']

    if isinstance(smiles, str):
        return rows[0]
    return rows
Esempio n. 4
0
def from_mdl(mdl_file: str,
             output_csv: str = None,
             descriptors: bool = True,
             fingerprints: bool = False,
             timeout: int = 60) -> list:
    ''' from_mdl: converts MDL file into QSPR descriptors/fingerprints;
    multiple molecules may be represented in the MDL file

    PaDEL-Descriptor is run on the file (up to three attempts) and the
    resulting CSV is parsed. When `output_csv` is not supplied, a
    timestamp-named CSV in the current directory is used and removed on
    every exit path.

    Args:
        mdl_file (str): path to MDL file
        output_csv (str): if supplied, saves descriptors/fingerprints here
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list: list of dicts, where each dict corresponds sequentially to a
            compound in the supplied MDL file (the `Name` column is removed)

    Raises:
        ValueError: `mdl_file` does not end in `.mdl` (case-insensitive)
        RuntimeError: PaDEL failed on all three attempts, or produced no rows
    '''

    is_mdl = compile(r'.*\.mdl$', IGNORECASE)
    if is_mdl.match(mdl_file) is None:
        raise ValueError(
            'MDL file must have a `.mdl` extension: {}'.format(mdl_file))

    # Without a caller-supplied path the CSV is a temporary artefact.
    save_csv = output_csv is not None
    if not save_csv:
        output_csv = '{}.csv'.format(
            datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3])

    try:
        for attempt in range(3):
            try:
                padeldescriptor(mol_dir=mdl_file,
                                d_file=output_csv,
                                convert3d=True,
                                retain3d=True,
                                retainorder=True,
                                d_2d=descriptors,
                                d_3d=descriptors,
                                fingerprints=fingerprints,
                                sp_timeout=timeout)
                break
            except RuntimeError as exception:
                if attempt == 2:
                    if not save_csv:
                        # Short pause before the temporary CSV is removed in
                        # the `finally` clause (kept from the original
                        # failure path; presumably lets the PaDEL process
                        # release the file).
                        sleep(0.5)
                    raise RuntimeError(exception) from exception

        with open(output_csv, 'r', encoding='utf-8') as desc_file:
            rows = list(DictReader(desc_file))
    finally:
        # Remove the temporary CSV on success and on any failure.
        if not save_csv:
            try:
                remove(output_csv)
            except FileNotFoundError as e:
                warnings.warn(str(e), RuntimeWarning)

    if len(rows) == 0:
        raise RuntimeError('PaDEL-Descriptor returned no calculated values.' +
                           ' Ensure the input structure is correct.')

    # `Name` is PaDEL's molecule label, not a descriptor value.
    for row in rows:
        del row['Name']
    return rows
def from_smiles(smiles: str,
                output_csv: str = None,
                descriptors: bool = True,
                fingerprints: bool = False,
                timeout: int = 12,
                java_path: str = None) -> OrderedDict:
    ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints

    The SMILES is written to a timestamp-named `.smi` file in the current
    directory, PaDEL-Descriptor is run on it (up to three attempts), and the
    first row of the resulting CSV is returned. The `.smi` file, and the CSV
    when `output_csv` was not supplied, are removed on every exit path.

    Args:
        smiles (str): SMILES string for a given molecule
        output_csv (str): if supplied, saves descriptors to this CSV file
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion
        java_path(str): custom java path, if None, system java path

    Returns:
        dict: descriptors/fingerprint labels and values (the `Name` column
            is removed)

    Raises:
        RuntimeError: PaDEL failed on all three attempts, or produced no
            rows for the supplied molecule
        TypeError: `smiles` is not a string
    '''

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
    smi_path = '{}.smi'.format(timestamp)

    # Without a caller-supplied path the CSV is a temporary artefact.
    save_csv = output_csv is not None
    if not save_csv:
        output_csv = '{}.csv'.format(timestamp)

    try:
        with open(smi_path, 'w') as smi_file:
            smi_file.write(smiles)

        for attempt in range(3):
            try:
                padeldescriptor(mol_dir=smi_path,
                                d_file=output_csv,
                                convert3d=True,
                                retain3d=True,
                                d_2d=descriptors,
                                d_3d=descriptors,
                                fingerprints=fingerprints,
                                sp_timeout=timeout,
                                java_path=java_path)
                break
            except RuntimeError as exception:
                if attempt == 2:
                    if not save_csv:
                        # Short pause before the temporary CSV is removed in
                        # the `finally` clause (kept from the original
                        # failure path; presumably lets the PaDEL process
                        # release the file).
                        sleep(0.5)
                    raise RuntimeError(exception) from exception

        with open(output_csv, 'r', encoding='utf-8') as desc_file:
            rows = list(DictReader(desc_file))
    finally:
        # Best-effort cleanup. A missing file is ignored so that it cannot
        # mask the error that is actually being reported.
        try:
            remove(smi_path)
        except FileNotFoundError:
            pass
        if not save_csv:
            try:
                remove(output_csv)
            except FileNotFoundError:
                pass

    # An empty CSV means PaDEL could not process the molecule; report that
    # explicitly instead of failing with an IndexError below.
    if not rows:
        raise RuntimeError('PaDEL-Descriptor failed on {}.'.format(smiles) +
                           ' Ensure input structure is correct.')

    # `Name` is PaDEL's molecule label, not a descriptor value.
    del rows[0]['Name']
    return rows[0]
Esempio n. 6
0
    "SMIES are extracted in list in mols_test and activity in an array dataY_test"
)

# Report the dimensions of the test-set activity array.
print('dataY_test Shape: ' + str(np.shape(dataY_test)))

# In[6]:

# Write every entry of smiles_test to test.smi, one per line, as input for
# PaDEL.
with open('test.smi', 'w') as filehandle:
    for listitem in smiles_test:
        filehandle.write('%s\n' % listitem)

# Run PaDEL on test.smi with only the 2D descriptor flag enabled (3D and
# fingerprints off) and write the result to test_2D.csv. retainorder=True is
# presumably what keeps the output rows aligned with smiles_test/dataY_test -
# TODO confirm against the padelpy documentation.
padeldescriptor(mol_dir='test.smi',
                d_2d=True,
                d_3d=False,
                fingerprints=False,
                removesalt=True,
                retainorder=True,
                d_file='test_2D.csv',
                maxruntime=100000,
                threads=1)

# In[3]:

padeldescriptor(mol_dir='train.smi',
                d_2d=True,
                d_3d=False,
                fingerprints=False,
                removesalt=True,
                retainorder=True,
                d_file='train_2D.csv',
                maxruntime=100000,
"""
Created on Mon May 18 17:35:13 2020

@author: twsle
"""


import os
import pandas as pd
import tempfile
import shutil
from openbabel import openbabel

from padelpy import from_mdl
from padelpy import padeldescriptor
# NOTE(review): this call executes at import time and passes only
# `maxruntime`, with no input or output paths. Presumably it is meant to set
# a default maximum run time for later calls - confirm that it actually has
# that effect.
padeldescriptor(maxruntime=50000)

#*********************************************************
#this script will take the optimized structures from
#a Gaussian run and use them to calculate the topological descriptors with PaDEL

        
def get_padel_data(filepath):        
        
    #copy all the files in the path directory to tmpdir
    temp_dir = tempfile.gettempdir()        
    temp_path = os.path.join(temp_dir, 'tempfile')
    
    shutil.copy2(filepath, temp_path)        
    
    obConversion = openbabel.OBConversion()