def use(self, smiles: list, backend: str = 'padel'):
    ''' use: uses the trained project to predict values for supplied
    molecules

    Args:
        smiles (list): list of SMILES strings to predict for
        backend (str): backend software to use for QSPR generation; `padel`
            or `alvadesc`; default = `padel`; alvadesc requires valid
            license

    Returns:
        numpy.array: predicted values, averaged across the project's models
            (mean over axis 0 of the per-model outputs)

    Raises:
        ValueError: if `backend` is neither `padel` nor `alvadesc`
    '''

    if backend == 'alvadesc':
        mols = [smiles_to_descriptors(s) for s in smiles]
    elif backend == 'padel':
        mols = [from_smiles(s) for s in smiles]
    else:
        raise ValueError('Unknown backend software: {}'.format(backend))

    # Missing descriptor values ('na' or an empty string) are treated as 0
    # for either backend instead of failing the float conversion
    input_names = self._df._input_names
    inputs = asarray([
        [
            0.0 if mol[name] in ('na', '') else float(mol[name])
            for name in input_names
        ]
        for mol in mols
    ])

    # Every model receives the same descriptor matrix, so it is built once
    # rather than once per model
    return mean([model.use(inputs) for model in self._models], axis=0)
def _qspr_from_alvadesc(
    smiles: List[str]
) -> Tuple[List[List[float]], List[str]]:
    """ Generates QSPR descriptors for the supplied SMILES strings using
    `smiles_to_descriptors`, converting every value to float

    Args:
        smiles (list[str]): list of SMILES strings

    Returns:
        Tuple[List[List[float]], List[str]]: (descriptors w/ shape
            (n_compounds, n_desc), descriptor names); `([], [])` when no
            SMILES strings are supplied
    """

    # Nothing to compute; also avoids indexing the first result below when
    # there is none
    if len(smiles) == 0:
        return ([], [])

    records = smiles_to_descriptors(smiles)
    # Descriptor names (and their order) are taken from the first compound
    # and applied to all compounds
    names = list(records[0].keys())
    # 'na' marks a value that is not available; it is stored as 0.0
    values = [
        [0.0 if rec[name] == 'na' else float(rec[name]) for name in names]
        for rec in records
    ]
    return (values, names)
def create_db(smiles: list, db_name: str, targets: list = None,
              id_prefix: str = '', extra_strings: dict = None,
              backend: str = 'padel', convert_mdl: bool = False):
    ''' create_db: creates an ECNet-formatted database from SMILES strings
    using either PaDEL-Descriptor or alvaDesc software; using alvaDesc
    requires a valid installation/license of alvaDesc

    With the PaDEL backend, a molecule whose descriptor calculation raises
    RuntimeError is omitted (a RuntimeWarning is issued) together with its
    target and extra string values; the supplied lists are not modified

    Args:
        smiles (list): list of SMILES strings
        db_name (str): name/path of database being created
        targets (list): target (experimental) values, align with SMILES
            strings; if None, each TARGET keeps the default value provided
            by `_Molecule`
        id_prefix (str): prefix of molecule DATAID, if desired
        extra_strings (dict): extra STRING columns, label = name, value = list
            with length equal to number of SMILES strings; a
            `Compound Name` entry fills the `Compound Name` column
        backend (str): software used to calculate QSPR descriptors, 'padel' or
            'alvadesc'
        convert_mdl (bool): if `True`, converts SMILES strings to MDL 3D
            format before calculating descriptors (PaDEL only)

    Raises:
        ValueError: if `targets` or any `extra_strings` value differs in
            length from `smiles`, or if `backend` is unknown
        ImportError: if `convert_mdl` is `True` but pybel is not installed
        IndexError: if descriptors were not obtained for any molecule
    '''

    if extra_strings is None:
        extra_strings = {}

    if targets is not None and len(targets) != len(smiles):
        raise ValueError('Must supply same number of targets as SMILES '
                         'strings: {}, {}'.format(len(targets), len(smiles)))

    for string, values in extra_strings.items():
        if len(values) != len(smiles):
            # The column name is the first of the three format arguments
            raise ValueError('Extra string values for {} not equal in length '
                             'to supplied SMILES: {}, {}'.format(
                                 string, len(values), len(smiles)))

    if backend not in ('alvadesc', 'padel'):
        raise ValueError('Unknown backend software: {}'.format(backend))

    mols = []
    # Positions (in the supplied lists) of molecules that produced
    # descriptors; used afterwards to keep every column aligned without
    # deleting from a list while iterating over it
    kept = []
    for idx, mol in enumerate(smiles):
        if backend == 'alvadesc':
            desc = smiles_to_descriptors(mol)
        elif convert_mdl is True:
            if pybel is None:
                raise ImportError(
                    'pybel (Python Open Babel wrapper) not installed, '
                    'cannot convert SMILES to MDL')
            mdl = pybel.readstring('smi', mol)
            mdl.make3D()
            curr_time = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
            mdl_path = '{}.mdl'.format(curr_time)
            mdl.write('mdl', mdl_path)
            try:
                desc = from_mdl(mdl_path)[0]
            except RuntimeError:
                warn('Could not calculate descriptors for {}, omitting'.format(
                    mol), RuntimeWarning)
                continue
            finally:
                # The temporary MDL file is removed on success and on any
                # failure
                remove(mdl_path)
        else:
            try:
                desc = from_smiles(mol)
            except RuntimeError:
                warn('Could not calculate descriptors for {}, omitting'.format(
                    mol), RuntimeWarning)
                continue
        mols.append(desc)
        kept.append(idx)

    # Restrict every per-molecule column to the molecules that were kept;
    # these are new lists, the caller's objects are left untouched
    smiles = [smiles[i] for i in kept]
    if targets is not None:
        targets = [targets[i] for i in kept]
    extra_strings = {
        string: [values[i] for i in kept]
        for string, values in extra_strings.items()
    }

    strings = list(extra_strings.keys())
    # 'Compound Name' already has a fixed column, so it is not repeated
    # among the extra STRING columns
    extra_names = [string for string in strings if string != 'Compound Name']
    descriptor_keys = list(mols[0].keys())

    type_row = ['DATAID', 'ASSIGNMENT', 'STRING', 'STRING']
    type_row.extend('STRING' for _ in extra_names)
    type_row.append('TARGET')
    type_row.extend('INPUT' for _ in descriptor_keys)

    title_row = ['DATAID', 'ASSIGNMENT', 'Compound Name', 'SMILES']
    title_row.extend(extra_names)
    title_row.append('TARGET')
    title_row.extend(descriptor_keys)

    mol_rows = []
    for idx, desc in enumerate(mols):
        # Missing descriptor values are written as 0
        for key in descriptor_keys:
            if desc[key] == 'na' or desc[key] == '':
                desc[key] = 0
        mol = _Molecule('{}{:04d}'.format(id_prefix, idx + 1))
        for string in strings:
            mol.strings[string] = extra_strings[string][idx]
        if targets is not None:
            mol.target = targets[idx]
        mol.inputs = desc
        mol_rows.append(mol)

    # The context manager closes the file; no explicit close() is needed
    with open(db_name, 'w', encoding='utf-8') as db_file:
        wr = writer(db_file, delimiter=',', lineterminator='\n')
        wr.writerow(type_row)
        wr.writerow(title_row)
        for mol, smi in zip(mol_rows, smiles):
            row = [mol.id, mol.assignment, mol.strings['Compound Name'], smi]
            row.extend(mol.strings[string] for string in extra_names)
            row.append(mol.target)
            row.extend(mol.inputs[key] for key in descriptor_keys)
            wr.writerow(row)
from alvadescpy import smiles_to_descriptors

if __name__ == '__main__':

    # One SMILES string, restricted to the two named descriptors
    subset = smiles_to_descriptors(
        'CCCOC',
        descriptors=['MW', 'AMW'],
        labels=True,
    )
    print(subset)

    # A list of SMILES strings, passing 'ALL' as the descriptor selection;
    # report the number of results and the size of the first result
    res = smiles_to_descriptors(
        ['CCCC', 'CCOCC', 'CCCCC'],
        descriptors='ALL',
        labels=True,
    )
    n_results = len(res)
    print(n_results)
    first_size = len(res[0])
    print(first_size)
from alvadescpy import smiles_to_descriptors

if __name__ == '__main__':

    # One SMILES string, restricted to the two named descriptors; the
    # result is printed as returned
    result = smiles_to_descriptors(
        'CCC',
        descriptors=['MW', 'AMW'],
        labels=True,
    )
    print(result)