def getChemicalDescriptors(self, data_file, destpath, filename):
    """Compute descriptors for every SMILES in a CSV file and save them.

    Reads ``data_file`` with pandas, calls ``from_smiles`` on each value of
    its ``Smiles`` column, and writes a CSV whose columns are
    ``Activation Status``, the descriptor columns, then ``Ligand``.

    Args:
        data_file: CSV file to read; it must contain the columns ``Smiles``,
            ``Activation Status`` and ``Ligand``.
        destpath: where to save; it is concatenated directly with
            ``filename`` (no path separator is inserted).
        filename: name of the file to save, without the ``.csv`` extension.

    Raises:
        RuntimeError: if ``from_smiles`` fails again on the retry.
    """
    test1 = pd.read_csv(data_file)

    # Collect one descriptor mapping per molecule and build the frame once,
    # instead of growing it with a concat per row (which also left the
    # result undefined when the CSV had a single row).
    rows = []
    for smi in test1["Smiles"]:
        try:
            descriptors = from_smiles(smi)
        except RuntimeError:
            # Second attempt with an explicit 30 second timeout.
            descriptors = from_smiles(smi, timeout=30)
        rows.append(descriptors)

    # Row labels are 0..n-1, matching the index produced by read_csv, so the
    # column-wise concat below lines the rows up with the input columns.
    ff = pd.DataFrame(rows)
    ff = pd.concat([test1['Activation Status'], ff, test1['Ligand']], axis=1)
    ff.to_csv(destpath + filename + ".csv", index=False)
def use(self, smiles: list, backend: str = 'padel'):
    ''' use: predicts values for the supplied molecules with the trained
    project; the result is the mean (axis 0) of the outputs of every model
    in `self._models`

    Args:
        smiles (list): list of SMILES strings to predict for
        backend (str): backend software to use for QSPR generation; `padel`
            or `alvadesc`; default = `padel`; alvadesc requires valid license

    Returns:
        numpy.array: predicted values

    Raises:
        ValueError: if `backend` is neither `padel` nor `alvadesc`
    '''

    if backend != 'alvadesc' and backend != 'padel':
        raise ValueError('Unknown backend software: {}'.format(backend))

    if backend == 'padel':
        mols = [from_smiles(s) for s in smiles]
    else:
        mols = [smiles_to_descriptors(s) for s in smiles]
        # alvaDesc values equal to 'na' are replaced by 0 before conversion
        for descriptors in mols:
            for name in list(descriptors.keys()):
                if descriptors[name] == 'na':
                    descriptors[name] = 0

    predictions = []
    for model in self._models:
        # one row per molecule, columns ordered as the project's input names
        features = asarray([
            [float(mol[name]) for name in self._df._input_names]
            for mol in mols
        ])
        predictions.append(model.use(features))
    return mean(predictions, axis=0)
def Pubchem_FP(smi, **kwargs):
    """Return the values computed by ``from_smiles`` for ``smi`` as a list.

    Args:
        smi: a SMILES string, or any other object, which is first converted
            with ``MolToSmiles``.
        **kwargs: forwarded unchanged to ``from_smiles``.
            NOTE(review): the 881-value check below presumably requires the
            caller to request fingerprints only (e.g. ``fingerprints=True,
            descriptors=False``) -- confirm against the callers.

    Returns:
        list: the 881 values of the mapping returned by ``from_smiles``, in
        the mapping's iteration order.

    Raises:
        AssertionError: if the result does not contain exactly 881 values.
    """
    if not isinstance(smi, str):
        smi = MolToSmiles(smi)
    fp = from_smiles(smi, **kwargs)
    lst_as_vect = list(fp.values())
    # Raised explicitly (rather than via ``assert``) so the length check is
    # still enforced when Python runs with -O.
    if len(lst_as_vect) != 881:
        raise AssertionError(
            'expected 881 fingerprint values, got {}'.format(len(lst_as_vect)))
    return lst_as_vect
def generate_padel_features_serially(self, java_path):
    """Compute descriptors one datapoint at a time for the pipeline data.

    Calls ``from_smiles`` (60 second timeout) on every value of the
    ``SMILES`` column of ``self.ml_pipeline.data``. A datapoint whose
    calculation raises ``RuntimeError`` is logged and skipped. The
    descriptor rows are then joined column-wise, on the row label, with the
    ``CNAME`` and ``Activation Status`` columns, so a skipped datapoint
    still appears in the result with missing descriptor values.

    Args:
        java_path: forwarded to ``from_smiles`` as ``java_path``.

    Returns:
        pandas.DataFrame: descriptor columns followed by ``CNAME`` and
        ``Activation Status``; also stored in ``self.ml_pipeline.data``.
    """
    self.jlogger.info("Inside generate_padel_features_serial")
    test1 = self.ml_pipeline.data

    # Collect the successful rows together with their row labels and build
    # the frame once. The previous incremental concat referenced frames that
    # were never created when datapoint 0 or 1 was skipped, or when there
    # were fewer than two datapoints.
    rows = []
    labels = []
    for i in range(len(test1)):
        self.jlogger.debug(
            "Generating Padel Features for datapoint {}".format(i))
        try:
            temp = test1["SMILES"][i]
            descriptors = from_smiles(temp, timeout=60, java_path=java_path)
        except RuntimeError:
            self.jlogger.error(
                "Padel feature generation failed with timeout of 60 secs, datapoint {}"
                .format(i))
            continue
        rows.append(descriptors)
        labels.append(i)

    ff = pd.DataFrame(rows, index=labels)
    ff = pd.concat([ff, test1[['CNAME', 'Activation Status']]], axis=1)
    self.ml_pipeline.data = ff
    return ff
def lower_heating_value(smiles: list) -> tuple:
    """Estimate a heating value, and its error, for each molecule.

    For every molecule the C, H, O and S atom counts (descriptors ``nC``,
    ``nH``, ``nO``, ``nS``) are weighted by atomic mass and turned into
    mass fractions, which are passed to ``_dulong`` in that order.

    Args:
        smiles (list): SMILES strings, passed to ``from_smiles`` as given.

    Returns:
        tuple: ``(lhv, errors)``, two lists aligned with the descriptors
        returned by ``from_smiles``; each error is 3.8% of its value.
    """
    atomic_masses = (
        ('nC', 12.011),
        ('nH', 1.008),
        ('nO', 15.999),
        ('nS', 32.06),
    )

    descriptors = from_smiles(smiles)
    element_masses = [
        [int(entry[count]) * mass for count, mass in atomic_masses]
        for entry in descriptors
    ]
    molecule_totals = [sum(parts) for parts in element_masses]
    fractions = [
        [part / total for part in parts]
        for parts, total in zip(element_masses, molecule_totals)
    ]

    lhv = [_dulong(c, h, o, s) for c, h, o, s in fractions]
    # error assumed at 3.8%
    errors = [0.038 * value for value in lhv]
    return (lhv, errors)
def _qspr_from_padel(smiles: List[str],
                     timeout: int = None) -> Tuple[List[List[float]], List[str]]:
    """
    Computes descriptors for a list of SMILES strings with `from_smiles` and
    converts them to floats; empty-string values become 0.0

    Args:
        smiles (list[str]): list of SMILES strings
        timeout (int, optional): timeout for PaDEL-Descriptor process call; if
            None, uses max(15, len(smiles)) seconds; default = None

    Returns:
        Tuple[List[List[float]], List[str]]: (descriptors w/ shape
            (n_compounds, n_desc), descriptor names)
    """

    # Honour a caller-supplied timeout; previously the argument was ignored
    # and max(15, len(smiles)) was always passed.
    if timeout is None:
        timeout = max(15, len(smiles))
    desc = from_smiles(smiles, timeout=timeout)

    # Column order is taken from the first compound's descriptor mapping.
    keys = list(desc[0].keys())
    desc = [[0.0 if d[k] == '' else float(d[k]) for k in keys] for d in desc]
    return (desc, keys)
def test_from_smiles(self):
    """A single SMILES ('CCC') yields one mapping of 1875 descriptors."""
    result = from_smiles('CCC')

    n_descriptors = len(result)
    self.assertEqual(n_descriptors, 1875)

    molecular_weight = float(result['MW'])
    self.assertAlmostEqual(molecular_weight, 44.0626, places=4)

    carbon_count = int(result['nC'])
    self.assertEqual(carbon_count, 3)
def test_multiple_smiles(self):
    """A list of two SMILES yields two results of 1875 descriptors each."""
    results = from_smiles(['CCC', 'CCCC'])
    self.assertEqual(len(results), 2)

    first = results[0]
    self.assertEqual(len(first), 1875)
nms = [x[0] for x in Descriptors._descList] print('\n') calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms) f = open('/scratch/woon/b3lyp_2017/datasmile2.txt') for i, line in enumerate(f): mydes = [] mylist = [] line = line.split(',') number = str(line[0]) print(number) line = line[1] m = Chem.MolFromSmiles(line) try: time.sleep(1) des = from_smiles(line, fingerprints=True, timeout=180) des = str(des).split(',') for ii in range(len(des)): b = des[ii].split(',') b = des[ii].strip('[').strip(']') b = re.sub('[^.,a-zA-Z0-9 \n\.]', '', b) b = b.replace('[', ' ') b = b.replace(']', ' ') b = b.strip() b = b.split(' ') mylist.append(b[0]) try: b = b[1] except: b = '' if bool(b) == True:
def _write_mdl(mol: str) -> str:
    ''' _write_mdl: converts a SMILES string to 3D MDL format with pybel and
    writes it to a timestamp-named `.mdl` file in the working directory

    Args:
        mol (str): SMILES string

    Returns:
        str: name of the file that was written

    Raises:
        ImportError: if pybel is not installed
    '''

    if pybel is None:
        raise ImportError(
            'pybel (Python Open Babel wrapper) not installed, '
            'cannot convert SMILES to MDL')
    mdl = pybel.readstring('smi', mol)
    mdl.make3D()
    curr_time = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
    mdl_path = '{}.mdl'.format(curr_time)
    mdl.write('mdl', mdl_path)
    return mdl_path


def create_db(smiles: list, db_name: str, targets: list = None,
              id_prefix: str = '', extra_strings: dict = None,
              backend: str = 'padel', convert_mdl: bool = False):
    ''' create_db: creates an ECNet-formatted database from SMILES strings
    using either PaDEL-Descriptor or alvaDesc software; using alvaDesc
    requires a valid installation/license of alvaDesc

    With the PaDEL backend, a molecule whose descriptor calculation raises
    RuntimeError is omitted with a RuntimeWarning, and its entry is deleted
    in place from `smiles`, `targets` and every list in `extra_strings`. If
    no descriptors were calculated at all, only the two header rows are
    written.

    Args:
        smiles (list): list of SMILES strings
        db_name (str): name/path of database being created
        targets (list): target (experimental) values, align with SMILES
            strings; if None, the target of each molecule is left at the
            `_Molecule` default
        id_prefix (str): prefix of molecule DATAID, if desired
        extra_strings (dict): extra STRING columns, label = name, value = list
            with length equal to number of SMILES strings; None means no
            extra columns
        backend (str): software used to calculate QSPR descriptors, 'padel' or
            'alvadesc'
        convert_mdl (bool): if `True`, converts SMILES strings to MDL 3D
            format before calculating descriptors (PaDEL only)

    Raises:
        ValueError: if `targets` or an `extra_strings` value differs in
            length from `smiles`, or if `backend` is unknown
        ImportError: if `convert_mdl` is `True` and pybel is not installed
    '''

    if extra_strings is None:
        extra_strings = {}

    if targets is not None and len(targets) != len(smiles):
        raise ValueError('Must supply same number of targets as SMILES '
                         'strings: {}, {}'.format(len(targets), len(smiles)))

    for string, values in extra_strings.items():
        if len(values) != len(smiles):
            # The message has three fields; the column name was previously
            # missing from the arguments, which raised IndexError instead.
            raise ValueError('Extra string values for {} not equal in length '
                             'to supplied SMILES: {}, {}'.format(
                                 string, len(values), len(smiles)))

    mols = []
    if backend == 'alvadesc':
        for mol in smiles:
            mols.append(smiles_to_descriptors(mol))

    elif backend == 'padel':
        # Indices of failed molecules are recorded and removed after the
        # loop; deleting while enumerating skipped the following molecule.
        failed = []
        for idx, mol in enumerate(smiles):
            if convert_mdl is True:
                mdl_path = _write_mdl(mol)
                try:
                    mols.append(from_mdl(mdl_path)[0])
                except RuntimeError:
                    warn(
                        'Could not calculate descriptors for {}, omitting'.
                        format(mol), RuntimeWarning)
                    failed.append(idx)
                finally:
                    # always clean up the temporary MDL file
                    remove(mdl_path)
            else:
                try:
                    mols.append(from_smiles(mol))
                except RuntimeError:
                    warn(
                        'Could not calculate descriptors for {}, omitting'.
                        format(mol), RuntimeWarning)
                    failed.append(idx)

        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(failed):
            del smiles[idx]
            if targets is not None:
                del targets[idx]
            for values in extra_strings.values():
                del values[idx]

    else:
        raise ValueError('Unknown backend software: {}'.format(backend))

    strings = list(extra_strings.keys())
    # 'Compound Name' always has its own fixed column, so it is not repeated
    extra_columns = [string for string in strings if string != 'Compound Name']
    descriptor_keys = list(mols[0].keys()) if mols else []

    type_row = ['DATAID', 'ASSIGNMENT', 'STRING', 'STRING']
    title_row = ['DATAID', 'ASSIGNMENT', 'Compound Name', 'SMILES']
    for string in extra_columns:
        type_row.append('STRING')
        title_row.append(string)
    type_row.append('TARGET')
    title_row.append('TARGET')
    for key in descriptor_keys:
        type_row.append('INPUT')
        title_row.append(key)

    mol_rows = []
    for idx, desc in enumerate(mols):
        # missing descriptor values ('na' or '') are stored as 0
        for key in descriptor_keys:
            if desc[key] == 'na' or desc[key] == '':
                desc[key] = 0
        mol = _Molecule('{}'.format(id_prefix) + '%04d' % (idx + 1))
        for string in strings:
            mol.strings[string] = extra_strings[string][idx]
        if targets is not None:
            mol.target = targets[idx]
        mol.inputs = desc
        mol_rows.append(mol)

    with open(db_name, 'w', encoding='utf-8') as db_file:
        wr = writer(db_file, delimiter=',', lineterminator='\n')
        wr.writerow(type_row)
        wr.writerow(title_row)
        for idx, mol in enumerate(mol_rows):
            row = [
                mol.id,
                mol.assignment,
                mol.strings['Compound Name'],
                smiles[idx],
            ]
            row.extend(mol.strings[string] for string in extra_columns)
            row.append(mol.target)
            row.extend(mol.inputs[key] for key in descriptor_keys)
            wr.writerow(row)
#from zinc_id.get_zincid import get_zincid_from_smile #zinc_id = get_zincid_from_smile("FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1") #print(zinc_id) from padelpy import from_smiles #calculate molecular descriptors for propane descriptors = from_smiles( 'FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1') #print(type(descriptors)) #print(len(descriptors)) # in addition to descriptors, calculate PubChem fingerprints #desc_fp = from_smiles('CCC', fingerprints=True) # only calculate fingerprints #fingerprints = from_smiles('CCC', fingerprints=True, descriptors=False) # save descriptors to a CSV file #_ = from_smiles('FC(F)(F)C1=CC=C(NC(=O)CSC2=NC(=CC=N2)C2=CC(=NO2)C2=CC=C(Cl)C=C2Cl)C=C1', output_csv='descriptors.csv') from tabula import read_pdf, convert_into #convert_into("./data/smiles_pdf.pdf", "smiles.csv", output_format="csv", pages='all') #zinc_id = get_zincid_from_smile("ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=CC=C2)=CC(Cl)=C1") #print(zinc_id) #print("arun")
# Command-line script: takes a SMILES string as its first argument, computes
# its descriptors with padelpy (30 second timeout) and writes the
# 'McGowan_Volume' descriptor multiplied by 100 to "volume_out.txt" in the
# current working directory.
import sys

from padelpy import from_smiles

# Fail with a usage message instead of a bare IndexError when the SMILES
# argument is missing. Arguments after the first are ignored, as before.
if len(sys.argv) < 2:
    sys.exit("usage: {} SMILES".format(sys.argv[0]))

smi = sys.argv[1]
descriptors = from_smiles(smi, timeout=30)
vol = descriptors['McGowan_Volume']

with open("volume_out.txt", "w") as f:
    f.write(str(float(vol) * 100))