def calculate_descriptors(drugs_file):
    """Compute, clean, and scale PaDEL molecular descriptors for *drugs_file*.

    Pipeline: extract SMILES, run PaDEL (2-D + 3-D descriptors, no
    fingerprints), re-sort the PaDEL output back into input order, replace
    missing/infinite values with 0, cap huge magnitudes, remove outliers
    (GESD), drop all-zero columns, scale each column, and write the result.

    Args:
        drugs_file: passed straight through to find_smiles(); presumably a
            path to the raw drugs table — TODO confirm against find_smiles().

    Side effects: reads/writes ./Data/Clean/descriptors.csv and writes
    ./Data/Clean/descriptors_scaled.csv; find_smiles() presumably writes
    'smiles.smi' (the mol_dir consumed below) — verify.
    """
    print("Calculating descriptors..")
    # Produce the SMILES input file for PaDEL.
    find_smiles(drugs_file)
    # Find the descriptors from the smiles and store it
    padeldescriptor(mol_dir='smiles.smi', d_file='./Data/Clean/descriptors.csv', convert3d=True, retain3d=True, d_2d=True, d_3d=True, fingerprints=False)
    # The descriptors are sometimes out of order, sort them so they match
    # the drugs_with_smiles order.
    descriptors_df = pd.read_csv('./Data/Clean/descriptors.csv')
    # Assumes PaDEL row names look like '<a>_<b>_<n>' where the third
    # "_"-separated field is the original row index — TODO confirm format.
    descriptors_df['Index'] = [ int(x[2]) for x in descriptors_df.Name.str.split("_") ]
    descriptors_df.set_index('Index', drop=True, inplace=True)
    descriptors_df.sort_index(inplace=True)
    # Write the correct order back to the file
    descriptors_df.to_csv('./Data/Clean/descriptors.csv', index=False)
    # Replace all the missing features and infinities with 0s (might have
    # to change this later if the model doesn't shake out).
    # cols skips the first column (PaDEL's 'Name' label column).
    cols = (list(descriptors_df.columns))[1:]
    # Replace nan and infinity values with 0s
    descriptors_df[cols] = descriptors_df[cols].replace({ np.nan: 0, '': 0, 'Infinity': 0, '-Infinity': 0 })
    # NOTE(review): this apply runs over EVERY column, including the
    # non-numeric 'Name' column excluded from `cols` above — astype(float)
    # will raise unless Name values happen to be numeric. Verify intent.
    descriptors_df = descriptors_df.apply(lambda x: np.array(x).astype(float))
    # Some values are huge which causes issues during numpy calculations;
    # make them not so enormous. (10E100 is 1e101, not 1e100.)
    descriptors_df[descriptors_df > 10E100] = 10E100
    # Get rid of outliers using the Extreme Studentized Deviate test
    descriptors_df = descriptors_df.apply(scikit_posthocs.outliers_gesd)
    # Get rid of columns which have all 0s
    descriptors_df = descriptors_df.loc[:, (descriptors_df != 0).any(axis=0)]
    # Scale every column (scale_array is a project helper — semantics not
    # visible here).
    descriptors_df = descriptors_df.apply(scale_array)
    descriptors_df.to_csv('./Data/Clean/descriptors_scaled.csv', index=False)
def krfp(smi):
    """Calculate Klekota-Roth fingerprint using padelpy."""
    # PaDEL reads its configuration from a descriptors.xml located in the
    # running directory (with KlekotaRothFingerprinter set to true).  To
    # avoid copying and removing that file for every single SMILES, we make
    # sure a correct copy is already in place and reuse it.
    working_dir = os.getcwd()
    descriptors_filename = 'descriptors.xml'
    descriptors_hash = 'f6145f57ff346599b907b044316c4e71'
    local_copy = os.path.join(working_dir, descriptors_filename)
    try:
        with open(local_copy, 'r') as config_file:
            checksum = hashlib.md5()
            checksum.update(config_file.read().encode('utf-8'))
        if checksum.hexdigest() != descriptors_hash:
            # A descriptors.xml is present but holds unexpected content.
            raise RuntimeError("The descriptors.xml was found in the running directory but its content doesn't match the prototype content. Aborting.")
    except FileNotFoundError:
        # No local copy yet: install the prototype that ships next to this
        # module into the running directory.
        source_dir = os.path.dirname(os.path.realpath(__file__))
        shutil.copyfile(os.path.join(source_dir, descriptors_filename), local_copy)
    # descriptors.xml exists and looks good, we can continue with
    # calculating the representation.  On prometheus we use SCRATCH,
    # everywhere else the default location is fine.
    with tempfile.TemporaryDirectory(dir=os.getenv('SCRATCH', None)) as scratch:
        molecules_path = os.path.join(scratch, "molecules.smi")
        with open(molecules_path, 'w') as molecules_file:
            molecules_file.write(smi)
        result_path = os.path.join(scratch, "out.csv")
        padeldescriptor(mol_dir=molecules_path, d_file=result_path, fingerprints=True, retainorder=True)
        fingerprint = pd.read_csv(result_path).values[:, 1:].reshape((-1)).astype(int)
    return fingerprint
def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> OrderedDict:
    ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints

    Args:
        smiles (str, list): SMILES string for a given molecule, or a list of
            SMILES strings
        output_csv (str): if supplied, saves descriptors to this CSV file
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list or OrderedDict: if multiple SMILES strings provided, returns a
        list of OrderedDicts, else single OrderedDict; each OrderedDict
        contains labels and values for each descriptor generated for each
        supplied molecule

    Raises:
        RuntimeError: if `smiles` is neither a str nor a list, if
            PaDEL-Descriptor fails three times in a row, or if it produced
            no/partial output for the supplied molecule(s)
    '''
    # Timestamped file names keep concurrent invocations from clobbering
    # each other's temporary .smi/.csv files.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
    smi_path = '{}.smi'.format(timestamp)
    with open(smi_path, 'w') as smi_file:
        if isinstance(smiles, str):
            smi_file.write(smiles)
        elif isinstance(smiles, list):
            smi_file.write('\n'.join(smiles))
        else:
            raise RuntimeError('Unknown input format for `smiles`: {}'.format(
                type(smiles)))
    save_csv = True
    if output_csv is None:
        save_csv = False
        output_csv = '{}.csv'.format(timestamp)
    # PaDEL occasionally fails transiently; retry up to three times before
    # giving up and cleaning up the temporary files.
    for attempt in range(3):
        try:
            padeldescriptor(mol_dir=smi_path, d_file=output_csv, convert3d=True, retain3d=True, d_2d=descriptors, d_3d=descriptors, fingerprints=fingerprints, sp_timeout=timeout)
            break
        except RuntimeError as exception:
            if attempt == 2:
                remove(smi_path)
                if not save_csv:
                    sleep(0.5)  # give PaDEL a moment to release the file
                    try:
                        # Output may never have been created; don't let
                        # cleanup mask the original PaDEL failure.
                        remove(output_csv)
                    except FileNotFoundError as e:
                        warnings.warn(str(e), RuntimeWarning)
                # Chain the cause so the PaDEL traceback is preserved.
                raise RuntimeError(exception) from exception
    with open(output_csv, 'r', encoding='utf-8') as desc_file:
        rows = list(DictReader(desc_file))
    remove(smi_path)
    if not save_csv:
        remove(output_csv)
    if isinstance(smiles, list) and len(rows) != len(smiles):
        raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' + ' Ensure the input structures are correct.')
    elif isinstance(smiles, str) and len(rows) == 0:
        raise RuntimeError('PaDEL-Descriptor failed on {}.'.format(smiles) + ' Ensure input structure is correct.')
    for idx, row in enumerate(rows):
        if len(row) == 0:
            raise RuntimeError(
                'PaDEL-Descriptor failed on {}.'.format(smiles[idx]) + ' Ensure input structure is correct.')
    # 'Name' is PaDEL's internal row label, not a descriptor; drop it.
    for row in rows:
        del row['Name']
    if isinstance(smiles, str):
        return rows[0]
    return rows
def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> list:
    ''' from_mdl: converts MDL file into QSPR descriptors/fingerprints;
    multiple molecules may be represented in the MDL file

    Args:
        mdl_file (str): path to MDL file
        output_csv (str): if supplied, saves descriptors/fingerprints here
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list: list of dicts, where each dict corresponds sequentially to a
        compound in the supplied MDL file
    '''
    # Reject anything that does not end in .mdl (case-insensitive) up front.
    if compile(r'.*\.mdl$', IGNORECASE).match(mdl_file) is None:
        raise ValueError(
            'MDL file must have a `.mdl` extension: {}'.format(mdl_file))
    save_csv = output_csv is not None
    if not save_csv:
        # No destination supplied: write to a throwaway timestamped CSV.
        output_csv = '{}.csv'.format(
            datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3])
    # Give PaDEL three chances before propagating its failure.
    for attempt in range(3):
        try:
            padeldescriptor(mol_dir=mdl_file, d_file=output_csv, convert3d=True, retain3d=True, retainorder=True, d_2d=descriptors, d_3d=descriptors, fingerprints=fingerprints, sp_timeout=timeout)
            break
        except RuntimeError as exception:
            if attempt < 2:
                continue
            if not save_csv:
                sleep(0.5)
                try:
                    remove(output_csv)
                except FileNotFoundError as e:
                    warnings.warn(e, RuntimeWarning)
            raise RuntimeError(exception)
    with open(output_csv, 'r', encoding='utf-8') as results_file:
        rows = [record for record in DictReader(results_file)]
    if not save_csv:
        remove(output_csv)
    if len(rows) == 0:
        raise RuntimeError('PaDEL-Descriptor returned no calculated values.' + ' Ensure the input structure is correct.')
    for record in rows:
        del record['Name']
    return rows
def from_smiles(smiles: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 12, java_path: str = None) -> OrderedDict:
    ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints

    Args:
        smiles (str): SMILES string for a given molecule
        output_csv (str): if supplied, saves descriptors to this CSV file
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion
        java_path (str): custom java path, if None, system java path

    Returns:
        OrderedDict: descriptors/fingerprint labels and values

    Raises:
        RuntimeError: if PaDEL-Descriptor fails three times in a row, or if
            it returned no calculated values for the supplied molecule
    '''
    # Timestamped file names keep concurrent invocations from clobbering
    # each other's temporary .smi/.csv files.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
    smi_path = '{}.smi'.format(timestamp)
    with open(smi_path, 'w') as smi_file:
        smi_file.write(smiles)
    save_csv = True
    if output_csv is None:
        save_csv = False
        output_csv = '{}.csv'.format(timestamp)
    # PaDEL occasionally fails transiently; retry up to three times before
    # giving up and cleaning up the temporary files.
    for attempt in range(3):
        try:
            padeldescriptor(mol_dir=smi_path, d_file=output_csv, convert3d=True, retain3d=True, d_2d=descriptors, d_3d=descriptors, fingerprints=fingerprints, sp_timeout=timeout, java_path=java_path)
            break
        except RuntimeError as exception:
            if attempt == 2:
                remove(smi_path)
                if not save_csv:
                    sleep(0.5)  # give PaDEL a moment to release the file
                    try:
                        # The output may never have been created; don't let
                        # cleanup mask the original PaDEL failure.
                        remove(output_csv)
                    except FileNotFoundError as e:
                        warnings.warn(str(e), RuntimeWarning)
                # Chain the cause so the PaDEL traceback is preserved.
                raise RuntimeError(exception) from exception
    with open(output_csv, 'r', encoding='utf-8') as desc_file:
        rows = list(DictReader(desc_file))
    remove(smi_path)
    if not save_csv:
        remove(output_csv)
    # Guard against an empty result so the caller gets a descriptive error
    # instead of a bare IndexError from rows[0].
    if len(rows) == 0:
        raise RuntimeError('PaDEL-Descriptor failed on {}.'.format(smiles) + ' Ensure input structure is correct.')
    # 'Name' is PaDEL's internal row label, not a descriptor; drop it.
    del rows[0]['Name']
    return rows[0]
"SMIES are extracted in list in mols_test and activity in an array dataY_test" ) print('dataY_test Shape: ' + str(np.shape(dataY_test))) # In[6]: with open('test.smi', 'w') as filehandle: for listitem in smiles_test: filehandle.write('%s\n' % listitem) padeldescriptor(mol_dir='test.smi', d_2d=True, d_3d=False, fingerprints=False, removesalt=True, retainorder=True, d_file='test_2D.csv', maxruntime=100000, threads=1) # In[3]: padeldescriptor(mol_dir='train.smi', d_2d=True, d_3d=False, fingerprints=False, removesalt=True, retainorder=True, d_file='train_2D.csv', maxruntime=100000,
""" Created on Mon May 18 17:35:13 2020 @author: twsle """ import os import pandas as pd import tempfile import shutil from openbabel import openbabel from padelpy import from_mdl from padelpy import padeldescriptor padeldescriptor(maxruntime=50000) #********************************************************* #this script will take the optimized structures from #a gaussian run and use them to calculat the topological descriptors with PADEL def get_padel_data(filepath): #copy all the files in the path directory to tmpdir temp_dir = tempfile.gettempdir() temp_path = os.path.join(temp_dir, 'tempfile') shutil.copy2(filepath, temp_path) obConversion = openbabel.OBConversion()