def determine_descriptors_from_mordred(smile_series):
    """Compute 2D Mordred descriptors for every unique SMILES in a series.

    Parameters
    ----------
    smile_series : pandas.Series
        SMILES strings. Duplicates are dropped before any work is done.

    Returns
    -------
    tuple
        ``(descriptor_dataframe, True)`` when every entry could be parsed;
        the dataframe is indexed by the de-duplicated SMILES.
        ``(bad_smiles, False)`` otherwise, where ``bad_smiles`` lists the
        entries that are not strings or that RDKit could not parse.
    """
    # Duplicates only cost time; callers can merge by SMILES afterwards.
    smile_series = smile_series.drop_duplicates()

    molecule_objects = []
    bad_smiles = []
    for smile in smile_series:
        if not isinstance(smile, str):
            bad_smiles.append(smile)
            continue
        # A SMILES parse error does not raise here, so the returned object
        # has to be checked instead.
        to_check = Chem.MolFromSmiles(smile)
        if to_check:
            molecule_objects.append(to_check)
        else:
            bad_smiles.append(smile)

    if bad_smiles:
        return bad_smiles, False

    # Build the calculator only once the input is known to be usable.
    calc = Calculator(descriptors, ignore_3D=True)
    descriptor_dataframe = calc.pandas(molecule_objects)
    # Attach the SMILES as the index so rows can be merged back by SMILES.
    descriptor_dataframe = descriptor_dataframe.set_index(smile_series)
    return descriptor_dataframe, True
def AutoCorrMordred(mol):
    """Return a vector of Mordred autocorrelation values for ``mol``.

    The descriptor family is chosen from the name ``metric`` ('MATS', 'ATSC',
    'AATS' or 'AATSC'; anything else selects ATS) and the lag range from the
    name ``maxBonds``. Neither name is defined inside this function; both
    must be provided by the surrounding scope.

    Returns
    -------
    list
        Values of the selected ``ATS{lag}{property}`` entries, ordered by
        descriptor name. Names that were expected but not produced are
        printed.
    """
    from mordred import Calculator, Autocorrelation

    calc = Calculator()
    if metric == 'MATS':
        descriptor = Autocorrelation.MATS
    elif metric == 'ATSC':
        descriptor = Autocorrelation.ATSC
    elif metric == 'AATS':
        descriptor = Autocorrelation.AATS
    elif metric == 'AATSC':
        descriptor = Autocorrelation.AATSC
    else:
        descriptor = Autocorrelation.ATS
    calc.register(descriptor)
    res = calc(mol).fill_missing()

    # Property codes: Z = atomic number, pe = Pauling electronegativity,
    # p = polarizability, v = vdW volume, d = n sigma electrons,
    # dv = n valence.
    props = ['Z', 'pe', 'p', 'v', 'd', 'dv']
    # NOTE(review): the expected names always use the literal 'ATS' prefix,
    # whatever ``metric`` selected above. For the other families this filter
    # presumably matches nothing; confirm the intended behavior.
    keys = [
        'ATS{d}{p}'.format(d=d, p=p)
        for d in range(maxBonds + 1)
        for p in props
    ]
    wanted = set(keys)
    res = {k: v for k, v in res.asdict().items() if k in wanted}

    # Report every expected descriptor the calculator did not return.
    for key in keys:
        if key not in res:
            print(key)

    vector = [value for _, value in sorted(res.items())]
    return vector
def mordred_descriptors(mol):
    """
    Function to get chemical descriptors from Mordred

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol or list of rdkit.Chem.rdchem.Mol
        a single mol object from rdkit, or a list of them

    Returns
    -------
    dict
        for a single molecule, a dictionary mapping the chemical descriptor
        name to the chemical descriptor value; for a list, a dictionary
        mapping each position in the list to such a dictionary
    """
    calc = Calculator(descriptors, ignore_3D=True)
    if isinstance(mol, list):
        # Transposing puts one molecule per column, so to_dict() is keyed by
        # the molecule's position first and the descriptor name second.
        return calc.pandas(mol, nproc=1).T.to_dict()
    # A single molecule is wrapped in a list and its only column unwrapped.
    return calc.pandas([mol], nproc=1).T.to_dict()[0]
def get_md(smi_path, data_path='./'):
    """Compute Mordred descriptors for every line of a SMILES file.

    Parameters
    ----------
    smi_path : str or pathlib.Path
        Text file with one SMILES per line.
    data_path : str or pathlib.Path, optional
        Directory that receives ``<stem>_md.csv``, where ``<stem>`` is the
        input file name without its suffix.

    Returns
    -------
    pandas.DataFrame
        The descriptor columns followed by ``cindex`` (``<stem>_<line no>``)
        and ``smiles``. The same table is written to the CSV file.
    """
    if isinstance(smi_path, str):
        smi_path = Path(smi_path)

    # The file is only read, so it is opened read-only and parsed once.
    with open(str(smi_path), 'r') as f:
        lines = f.readlines()
    smiles_table = pd.DataFrame({
        'cindex': [smi_path.stem + '_' + str(idx) for idx in range(len(lines))],
        'smiles': [content.strip('\n') for content in lines],
    })

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_table['smiles']]
    calc = Calculator(descriptors)
    md = calc.pandas(mols)

    data = pd.concat([md, smiles_table], axis=1)
    data.to_csv(str(data_path) + '/' + smi_path.stem + '_md.csv')
    return data
def smiles_to_mordred(smiles, features=None):
    """Embed molecules in 3D and compute their Mordred descriptors.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings. Entries whose preparation raises are reported on
        stdout and left out of the result.
    features : optional
        Column selection applied to the descriptor table; all columns are
        kept when it is None.

    Returns
    -------
    pandas.DataFrame
        One row per successfully prepared molecule, indexed by its SMILES.
    """
    # Materialize once so the input is paired with its molecule by position,
    # whatever kind of iterable (list, Series, generator) was passed in.
    smiles = list(smiles)

    # create descriptor calculator with all descriptors
    calc = Calculator(all_descriptors)
    print("Convering SMILES string to Mol format...")
    mols_raw = [Chem.MolFromSmiles(smi) for smi in smiles]
    print("Computing 3D coordinates...")
    s = SaltRemover.SaltRemover()
    mols = {}
    n = len(mols_raw)
    p = ProgressBar(n)
    for i, (smi, mol) in enumerate(zip(smiles, mols_raw)):
        p.animate(i, status="Embedding %s" % smi)
        try:
            mol = s.StripMol(mol, dontRemoveEverything=True)
            mol = Chem.AddHs(mol)
            AllChem.Compute2DCoords(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.UFFOptimizeMolecule(mol)  # Is this deterministic?
        except Exception:
            # Best effort: a molecule that cannot be prepared is skipped.
            print("Exception for %s" % smi)
        else:
            mols[smi] = mol
    p.animate(n, status="Finished embedding all molecules")

    print("\nComputing Mordred features...")
    df = calc.pandas(list(mols.values()))
    if features is not None:
        df = df[features]  # Retain only the specified features
    mordred = pd.DataFrame(df.values, index=list(mols.keys()), columns=df.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
def sms_bandgap(sms):
    """Predict the band gap of a molecule given as a SMILES string.

    Parameters
    ----------
    sms : str
        SMILES string of the chemical to predict.

    Returns
    -------
    pandas.DataFrame
        One-row table with the columns ``substance`` (the input SMILES) and
        ``bandgap`` (the first value returned by the model's ``predict``).

    Raises
    ------
    TypeError
        If the SMILES string cannot be parsed.
    """
    bandgap = pd.DataFrame(columns=['substance', 'bandgap'])
    bandgap.loc[0, 'substance'] = sms
    freeze_support()

    # transform smiles string to molecular structure (parsed once, reused)
    mol = Chem.MolFromSmiles(sms)
    if mol is None:
        raise TypeError('Invalid Smiles String')

    calc = Calculator(descriptors)
    raw_data = calc.pandas([mol])  # calculate descriptors

    # extract the five descriptors the model is fed with
    feature_names = ('AXp-0d', 'AXp-1d', 'AXp-2d', 'ETA_eta_L', 'ETA_epsilon_3')
    new = {name: raw_data[name].values for name in feature_names}
    new_data = pd.DataFrame(index=[1], data=new)

    regressor2 = load_model()
    bandgap.loc[0, 'bandgap'] = regressor2.predict(new_data)[0]  # calculate bandgap
    return bandgap
def calc_mordred_desc(mols: list):
    """Return the 2D Mordred descriptor table for ``mols``.

    The raw table produced by the calculator is passed through
    ``_convert_error_columns`` before it is returned.
    """
    from mordred import Calculator, descriptors

    raw_table = Calculator(descriptors, ignore_3D=True).pandas(mols)
    return _convert_error_columns(raw_table)
def calculate_molecular_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Append 2D Mordred descriptor columns to the rows with a valid SMILES.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``SMILES`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows reported by ``get_invalid_smiles_indices``,
        joined with one descriptor row per remaining molecule.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in df.SMILES]

    # Positions (not labels) of the molecules that failed to parse.
    invalid_positions = set(get_invalid_smiles_indices(mols))
    valid_positions = [
        position for position in range(len(mols))
        if position not in invalid_positions
    ]
    mols_without_invalid = [mols[position] for position in valid_positions]
    descriptor_df = calc.pandas(mols_without_invalid)

    valid_df = df.iloc[valid_positions]
    # The descriptor table comes back with a fresh 0..k-1 index. join()
    # aligns on labels, so it must carry the labels of the surviving rows;
    # otherwise rows after a dropped one get the wrong descriptors or NaN.
    descriptor_df.index = valid_df.index
    return valid_df.join(descriptor_df)
def transform(self):
    """Compute 2D Mordred descriptors for ``self.structures``.

    Stores the descriptor table in ``self.df``, its column labels in
    ``self.columns``, its values in ``self.features`` and each structure's
    ``_Name`` property in ``self.mol_names``.

    Returns
    -------
    The value array also stored in ``self.features``.
    """
    super().transform()
    # Reset the names first so they are empty if the calculation fails.
    self.mol_names = []

    descriptor_table = Calculator(descriptors, ignore_3D=True).pandas(self.structures)
    self.df = descriptor_table
    self.columns = self.df.columns
    self.features = self.df.values

    names = []
    for structure in self.structures:
        names.append(structure.GetProp("_Name"))
    self.mol_names = names
    return self.features
def mol_to_mordred(mols, features=None):
    """Compute Mordred descriptors for a mapping of molecules.

    Parameters
    ----------
    mols : mapping
        Keys become the row index of the result; values are the molecules
        handed to the calculator.
    features : optional
        Column selection applied to the descriptor table; all columns are
        kept when it is None.

    Returns
    -------
    pandas.DataFrame
        Descriptor values with one row per key of ``mols``.
    """
    calculator = Calculator(all_descriptors)
    print("\nComputing Mordred features...")

    # fill_missing(): use NaN instead of Missing objects
    table = calculator.pandas(mols.values()).fill_missing()
    if features is not None:
        # Retain only the specified features
        table = table[features]

    mordred = pd.DataFrame(table.values, index=mols.keys(), columns=table.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
def __init__(self, rank=None, args=None):
    """Store the worker's rank and arguments and build its calculator.

    Parameters
    ----------
    rank : optional
        Stored as ``self.rank``; must not be None.
    args : optional
        Stored as ``self.args``; must not be None.

    Raises
    ------
    ValueError
        If ``rank`` or ``args`` is None.
    """
    # Validate before doing any other work so bad arguments fail at once.
    if rank is None:
        raise ValueError('rank is not set properly')
    if args is None:
        raise ValueError('args is not set properly')

    self.rank = rank
    self.args = args
    self.calc = Calculator(descriptors, ignore_3D=True)
def get_MD(self, ignore_3D=True):
    """Compute Mordred descriptors ONLY for the non-error molecules.

    The result is stored in ``self._MD`` as a float64 table; nothing is
    returned.

    Parameters
    ----------
    ignore_3D : bool, optional
        Forwarded to the Mordred ``Calculator``.
    """
    calculator = Calculator(descriptors, ignore_3D=ignore_3D)

    # Positions at which self._error_mask is non-zero, with length-1 axes
    # squeezed away.
    error_positions = np.argwhere(self._error_mask)
    error_positions = np.squeeze(error_positions)

    # Per the original note: an entry is taken when its index is not among
    # the error positions.
    valid_mols = list_where(self._mol_lst, error_positions, False)

    self._MD = calculator.pandas(valid_mols).astype("float64")
def __init__(self, dict_mode=True, auto_correct=True, ignore_3D=True):
    """Set up the Mordred calculator and the list of descriptor names.

    Parameters
    ----------
    dict_mode : bool, optional
        Stored as ``self.dict_mode``.
    auto_correct : bool, optional
        Stored as ``self.auto_correct``.
    ignore_3D : bool, optional
        Forwarded to the Mordred ``Calculator``.
    """
    from mordred import Calculator, descriptors

    super(RDKitDescriptors, self).__init__()
    self.dict_mode = dict_mode
    self.calculator = Calculator(descriptors, ignore_3D=ignore_3D)
    self.auto_correct = auto_correct

    # Run the calculator once on the molecule built from "C" just to learn
    # the descriptor column names, then prefix each one.
    probe_table = self.calculator.pandas([mol_from_smiles("C")])
    self.desc_list = [
        "Mordred_desc_" + column_name for column_name in list(probe_table.columns)
    ]
def test_parallel():
    """Yield checks that ``calc.map`` gives the same values with and without ``nproc=1``.

    Missing values are compared by the class of their error; every other
    value is compared to 7 decimal places.
    """
    calc = Calculator(descriptors)
    mols = list(Chem.SDMolSupplier(data_file, removeHs=False))

    serial_rows = calc.map(mols, nproc=1, quiet=True)
    parallel_rows = calc.map(mols, quiet=True)
    for serial, parallel in zip(serial_rows, parallel_rows):
        for d, s, p in zip(calc.descriptors, serial, parallel):
            if not isinstance(s, MissingValueBase):
                msg = "{} (serial: {}, parallel: {})".format(str(d), s, p)
                yield assert_almost_equal, s, p, 7, msg
            else:
                yield eq_, s.error.__class__, p.error.__class__
def test_by_references():
    """Yield one check per reference value stored in the YAML fixtures.

    Descriptors are computed for every molecule of ``structures.sdf`` and
    compared with the values listed in the ``*.yaml`` files of ``data_dir``.
    Entries marked ``"skip"`` are ignored. When a fixture defines ``digit``
    the comparison is approximate and a NaN reference requires a missing
    value; otherwise the comparison is exact.
    """
    # APol/BPol are replaced by instances constructed with True.
    calc = Calculator(
        d
        for d in descriptors.all
        if d.__class__ not in [Polarizability.APol, Polarizability.BPol]
    )
    calc.register([Polarizability.APol(True), Polarizability.BPol(True)])

    actuals = {}
    for mol in Chem.SDMolSupplier(
        os.path.join(data_dir, "structures.sdf"), removeHs=False
    ):
        actuals[mol.GetProp("_Name")] = {
            str(d): v for d, v in zip(calc.descriptors, calc(mol))
        }

    yaml_paths = glob(os.path.join(data_dir, "*.yaml")) + glob(
        os.path.join(data_dir, "**/*.yaml")
    )
    for path in yaml_paths:
        # Load inside a context manager so the handle is closed before any
        # check is yielded.
        with open(path) as reference_file:
            tests = yaml.load(reference_file, Loader=Loader)

        for test in tests:
            dnames = test["names"]
            if not isinstance(dnames, list):
                dnames = [dnames]

            desireds = (
                (mname, zip(dnames, values if isinstance(values, list) else [values]))
                for mname, values in test["results"].items()
            )

            digit = test.get("digit")
            if digit is None:
                assert_f = eq_
            else:
                # Bind ``digit`` as a default so the check keeps this
                # fixture's precision even if it runs after the generator
                # has moved on to another fixture.
                def assert_f(a, d, m, digit=digit):
                    if np.isnan(d):
                        assert isinstance(a, MissingValueBase)
                        return

                    assert_almost_equal(a, d, digit, m)

            for mname, descs in desireds:
                for dname, desired in descs:
                    if desired != "skip":
                        yield (
                            assert_f,
                            actuals[mname][dname],
                            desired,
                            "{} of {}".format(dname, mname),
                        )
def write_mordred_descriptors(smiles, csv, data):
    """Write 2D Mordred descriptors for ``data`` to ``<csv>.gz``.

    Nothing happens unless ``smiles`` is an existing file and ``<csv>.gz``
    does not exist yet. The molecules come from ``data['SMILES']`` and the
    ``CID`` column of the output from ``data['CID']``.
    """
    if not os.path.isfile(smiles) or os.path.isfile(f'{csv}.gz'):
        return

    from rdkit import Chem
    from mordred import Calculator, descriptors

    calculator = Calculator(descriptors, ignore_3D=True)

    # Get molecules from SMILES
    molecules = [Chem.MolFromSmiles(smi) for smi in data['SMILES']]

    # Placeholder message shown while the calculation runs.
    msg = st.text('Sit back! This may take a while...')
    table = calculator.pandas(molecules, quiet=False, nproc=1)
    table.insert(0, column='CID', value=data['CID'].tolist())
    table.to_csv(f'{csv}.gz', index=False, compression='gzip')
    msg.text('')
def mols_to_mordred_csv(mols, path):
    """Compute Mordred descriptors for ``mols`` and write them to a CSV file.

    Inputs with more than 3000 molecules are processed in chunks of 3000.
    The file at ``path`` is always written from scratch and contains a
    single header row followed by one row per molecule.

    Parameters
    ----------
    mols : sequence
        Molecules; must support ``len()`` and, above 3000 entries, slicing.
    path : str or path-like
        Destination CSV file.
    """
    from mordred import Calculator, descriptors

    calc = Calculator(descriptors)
    chunk_size = 3000

    if len(mols) <= chunk_size:
        calc.pandas(mols).to_csv(path, index=False)
        return

    for start in range(0, len(mols), chunk_size):
        is_first = start == 0
        df = calc.pandas(mols[start:start + chunk_size])
        # Only the first chunk creates the file and writes the header; later
        # chunks append rows only, so the CSV has one header and no empty
        # trailing chunk.
        df.to_csv(
            path,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )
def calculate_descriptors_from_smiles(excel_filename):
    """Compute selected Mordred descriptors for the molecules of a workbook.

    Reads sheet ``Sheet1`` of ``excel_filename``, builds molecules from its
    ``Updated SMILES`` column, keeps the ``USEFUL_DESCRIPTORS`` columns of
    the 2D descriptor table, adds the ``Name`` column (as ``name``) and the
    ``Updated SMILES`` column, and writes the result to
    ``mordred_descriptors.xlsx`` in the current directory.

    Parameters
    ----------
    excel_filename : str or path-like
        Workbook to read.
    """
    calc = Calculator(descriptors, ignore_3D=True)

    # Context manager so the workbook handle is closed after parsing.
    with ExcelFile(excel_filename) as excel:
        excel_data_frame = excel.parse("Sheet1", 0)

    mols = [Chem.MolFromSmiles(smi) for smi in excel_data_frame["Updated SMILES"]]
    df = calc.pandas(mols)

    # Work on an explicit copy so the column assignments below act on an
    # independent table rather than on a selection of the full one.
    df = df[USEFUL_DESCRIPTORS].copy()
    df["name"] = excel_data_frame["Name"]
    df["Updated SMILES"] = excel_data_frame["Updated SMILES"]
    # Disabled: copying the bioavailability column as "BA".
    # df["BA"] = excel_data_frame["Biološka uporabnost (%)"]
    df.to_excel("mordred_descriptors.xlsx")
def computeLigMolProps(
        transfrm_path="transformations/",
        working_dir="features/MOLPROPS/",
        target_columns=None,
        verbose=False):
    """
    Compute molecular properties for the molecules found under
    transfrm_path and pickle the resulting table.

    --args
    transfrm_path (str): path prefix of the ligand files; every path
    matching transfrm_path + "*" is loaded
    working_dir (str): prefix under which "molprops.pickle" is written
    target_columns (optional): columns to keep; when None, only the float64
    and int64 columns are kept
    verbose (bool): whether or not to print the resulting table to stdout

    --returns
    molprops_set (pandas dataframe): set of molecules with molecular
    properties, indexed by molecule name
    """
    mol_paths = glob.glob(transfrm_path + "*")

    # One RDKit mol object and one index name (the path without the prefix,
    # cut at the first ".") per file.
    mols_rdkit = []
    mols_names = []
    for mol_path in mol_paths:
        mols_rdkit.append(retrieveMoleculePDB(mol_path))
        mols_names.append(mol_path.replace(transfrm_path, "").split(".")[0])

    # All descriptors available in mordred, 3D ones included.
    calc = Calculator(descriptors, ignore_3D=False)
    print("Computing molecular properties:")
    molprops_set = calc.pandas(mols_rdkit)

    if target_columns is None:
        # Training set: drop columns with bools or strings, which are not
        # fit for the subtraction protocol.
        molprops_set = molprops_set.select_dtypes(include=["float64", "int64"])
    else:
        # Test set: keep exactly the columns of the training dataset.
        molprops_set = molprops_set[target_columns]

    molprops_set.index = mols_names

    # pickle dataframe to specified directory:
    molprops_set.to_pickle(working_dir + "molprops.pickle")

    if verbose:
        print(molprops_set)

    return molprops_set
def generate_features_using_mordered(self):
    """Replace the pipeline data with its Mordred descriptor table.

    Runs only when ``self.ml_pipeline.config.fg_mordered_flg`` is truthy.
    The descriptor table is built from the ``SMILES`` column and extended
    with the ``CNAME`` and ``Activation Status`` columns of the pipeline
    data.

    Returns
    -------
    The new table (also stored in ``self.ml_pipeline.data``), or None when
    the flag is not set.
    """
    if not self.ml_pipeline.config.fg_mordered_flg:
        return None

    self.jlogger.info("Inside generate_features_using_mordered method")
    data = self.ml_pipeline.data

    calculator = Calculator(descriptors)
    molecules = [Chem.MolFromSmiles(smi) for smi in data["SMILES"]]
    feature_table = calculator.pandas(molecules)  # all features

    # Carry the identifying and label columns over by position.
    feature_table["CNAME"] = data["CNAME"].values
    feature_table["Activation Status"] = data["Activation Status"].values

    mordred_df = feature_table.copy()
    self.ml_pipeline.data = mordred_df
    return mordred_df
def calculation(mol_list, smiles_list, index_list, descriptor_type):
    """Compute a numeric Mordred descriptor table for ``mol_list``.

    Parameters
    ----------
    mol_list : list
        Molecules handed to the calculator.
    smiles_list : list
        Stored in the ``SMILES`` column of the result.
    index_list : list
        Used as the index of the result.
    descriptor_type : str
        ``'2D'`` builds the calculator with ``ignore_3D=True``; ``'3D'``
        builds it without that argument.

    Returns
    -------
    pandas.DataFrame
        Float descriptor columns plus ``SMILES``, indexed by ``index_list``.
        Cells that are not plain numbers are NaN.

    Raises
    ------
    ValueError
        If ``descriptor_type`` is neither ``'2D'`` nor ``'3D'``.
    """
    if descriptor_type == '2D':
        calc = Calculator(descriptors, ignore_3D=True)
    elif descriptor_type == '3D':
        calc = Calculator(descriptors)
    else:
        raise ValueError(
            "descriptor_type must be '2D' or '3D', got {!r}".format(descriptor_type)
        )

    df = calc.pandas(mol_list)
    df = df.astype(str)

    # Keep only cells that read as a decimal number. Exponent notation such
    # as '1e-05' counts as a number, while error messages, 'nan', 'inf' and
    # booleans do not and become NaN.
    numeric_pattern = r'^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$'
    is_numeric = df.apply(lambda d: d.str.match(numeric_pattern, na=False))
    df = df[is_numeric]
    df = df.astype(float)

    # reset index
    df['SMILES'] = smiles_list
    df['index'] = index_list
    df = df.set_index('index')
    return df
def mordred_fe(data, cwd, debug=False):
    """Return the 2D Mordred descriptor table for ``data``, with a pickle cache.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``SMILES`` column. It is not modified.
    cwd : pathlib.Path
        Base directory; the cache lives at ``cwd / "../features/mordred_fe.pkl"``.
    debug : bool, optional
        When true, the cache is ignored and the descriptors are recomputed.

    Returns
    -------
    pandas.DataFrame
        The cached table if the cache file exists and ``debug`` is false,
        otherwise a freshly computed one. A fresh table is pickled to the
        cache path unless ``cwd`` equals ``Path("")``.
    """
    filepath = cwd / "../features/mordred_fe.pkl"
    if os.path.isfile(filepath) and not debug:
        return pd.read_pickle(filepath)

    # Build the molecules in a local list instead of overwriting the
    # caller's SMILES column with Mol objects.
    mols = [Chem.MolFromSmiles(smiles) for smiles in data["SMILES"]]
    calc = Calculator(descriptors, ignore_3D=True)
    new_data = calc.pandas(mols)
    if cwd != Path(""):
        new_data.to_pickle(filepath)
    return new_data
def FilterItLogS(mol):
    """Return the Filter-it™ LogS descriptor of ``mol``.

    Fragment-based solubility value, based on a simple fragment-based
    method:
    http://silicos-it.be.s3-website-eu-west-1.amazonaws.com/software/filter-it/1.0.2/filter-it.html#installation

    Returns None if the result holds no ``FilterItLogS`` entry.
    """
    logs_calculator = Calculator(descriptors.LogS)
    values_by_name = logs_calculator(mol).asdict()
    return values_by_name.get('FilterItLogS')
def compute_descript(smile, walltime=1):
    """Compute the 2D Mordred descriptors of one SMILES as a pickled array.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    walltime : optional
        Not used inside this function.

    Returns
    -------
    bytes
        Pickle of a flat float32 array of descriptor values, or the pickle
        of None when the SMILES cannot be parsed.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle

    # This object doesn't need to be created every time; it could probably
    # be made global.
    calc = Calculator(descriptors, ignore_3D=True)

    # read smiles
    mol = Chem.MolFromSmiles(smile)
    if mol is not None:
        descs = calc(mol)
        # could run in FP16, something to think about
        payload = np.array(descs).flatten().astype(np.float32)
    else:
        print("Error processing mol")
        payload = None

    # The result is pickled here to avoid a bug in the serialization
    # routines (per the original note, related to Parsl).
    return pickle.dumps(payload)
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """
    Calculate Mordred descriptors.

    The Mordred calculator is created on the first call and reused
    afterwards.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      1D array of Mordred descriptors for `mol`.
      If ignore_3D is True, the length is 1613.
      If ignore_3D is False, the length is 1826.

    Raises
    ------
    ImportError
      If Mordred is not installed.
    """
    if self.calc is None:
        try:
            from mordred import Calculator, descriptors, is_missing
            self.is_missing = is_missing
            self.calc = Calculator(descriptors, ignore_3D=self.ignore_3D)
            self.descriptors = list(descriptors.__all__)
        except ModuleNotFoundError:
            raise ImportError("This class requires Mordred to be installed.")

    # convert errors (missing values and strings) to zero
    cleaned = []
    for value in self.calc(mol):
        unusable = self.is_missing(value) or isinstance(value, str)
        cleaned.append(0.0 if unusable else value)
    return np.asarray(cleaned)
class Worker():
    """Computes 2D Mordred descriptor tables for batches of SMILES."""

    def __init__(self, rank=None, args=None):
        """Store the worker's rank and arguments and build its calculator.

        Parameters
        ----------
        rank : optional
            Stored as ``self.rank``; must not be None.
        args : optional
            Stored as ``self.args``; ``do`` reads its ``verbose`` and
            ``echo`` attributes. Must not be None.

        Raises
        ------
        ValueError
            If ``rank`` or ``args`` is None.
        """
        # Validate before doing any other work so bad arguments fail at once.
        if rank is None:
            raise ValueError('rank is not set properly')
        if args is None:
            raise ValueError('args is not set properly')

        self.rank = rank
        self.args = args
        self.calc = Calculator(descriptors, ignore_3D=True)

    def do(self, data):
        """Turn a batch of SMILES into a table.

        Parameters
        ----------
        data : list of str
            SMILES strings.

        Returns
        -------
        pandas.DataFrame
            With ``args.echo`` set, only a ``SMILE`` column holding the
            input. Otherwise the descriptor table with missing values
            filled and the input inserted as the first column ``SMILE``.
        """
        if self.args.verbose:
            print('rank {} received data: {} rows'.format(
                self.rank, len(data)))

        if self.args.echo:
            # Echo mode skips the calculation and returns the input as-is.
            df = pd.DataFrame(data, columns=['SMILE'])
        else:
            mols = [Chem.MolFromSmiles(smi) for smi in data]
            df = self.calc.pandas(mols, quiet=True)
            df.fill_missing(inplace=True)
            df.insert(0, 'SMILE', data)

        if self.args.verbose:
            print('rank {} generated data: {}'.format(self.rank, len(df)))

        return df
def test_VEA():
    """Yield one approximate-equality check per reference value in ``data``.

    Each line of ``data`` holds a SMILES followed by one reference field per
    name in ``descs``. Fields that ``parse_reference`` maps to a desired
    value of None are skipped; for the others the computed value must not
    be missing.
    """
    calc = Calculator([AdjacencyMatrix, DistanceMatrix])

    for line in data:
        fields = line.strip().split()
        smi = fields[0]
        mol = Chem.MolFromSmiles(smi)

        desireds = dict(zip(descs, map(parse_reference, fields[1:])))
        actuals = {str(k): v for k, v in zip(calc.descriptors, calc(mol))}

        for desc in descs:
            actual = actuals[desc]
            decimal, desired = desireds[desc]

            if desired is not None:
                assert not is_missing(actual), actual

                yield (
                    assert_almost_equal,
                    actual,
                    desired,
                    decimal,
                    "{} of {}".format(desc, smi),
                )
def mordred_descriptors(mols, output, header, use_3d):
    """
    Calculate Mordred descriptors and save as tabular

    Parameters
    ----------
    mols : list
        Molecules; a None entry marks an invalid SMILES/SDMol and produces
        an empty row.
    output : str or file-like
        Destination of the tab-separated table.
    header : bool
        Forwarded to ``to_csv`` as ``header``.
    use_3d : bool
        When false, the calculator is built with ``ignore_3D=True``.
    """
    calc = Calculator(descriptors, ignore_3D=(not use_3d))

    # indices of invalid SMILES/SDMols
    invalid_mols = [position for position, mol in enumerate(mols) if mol is None]
    # replace invalid mols with a placeholder so the row count is preserved
    mols = [Chem.MolFromSmiles('') if n is None else n for n in mols]

    df = calc.pandas(mols, quiet=True)  # calculate descriptors
    for position in invalid_mols:  # blank out the placeholder rows
        df.iloc[position] = np.nan

    df = df.applymap(convert_errors_to_nan)  # remove descriptors which errored
    df = df.round(6)
    df.to_csv(output, na_rep='', sep='\t', index=False, header=header)  # write output
def calculate(SMILEs, filter=None):
    """Compute 2D Mordred descriptors for a collection of SMILES strings.

    Parameters
    ----------
    SMILEs : list of str
        SMILES strings; also used as the row index of the intermediate
        table, so it must be re-iterable.
    filter : optional
        Column selection passed to ``.loc[:, filter]``; ignored when falsy.

    Returns
    -------
    numpy.ndarray
        One row per SMILES. Values that cannot be converted to a number are
        replaced by 0.

    Raises
    ------
    ValueError
        If a SMILES cannot be parsed or its descriptors cannot be computed.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    d = []
    for smi in SMILEs:
        try:
            m = Chem.MolFromSmiles(smi)
            if m is None:
                raise ValueError("SMILES could not be parsed")
            d.append(calc(m))
        except Exception as err:
            # The input SMILEs is invalid. format() keeps the message
            # working when the offending entry is not a string.
            raise ValueError(
                "Bad SMILEs Detected. Please Check: {}".format(smi)
            ) from err

    d_df = pd.DataFrame(
        d,
        index=SMILEs,
        columns=[str(e_d) for e_d in calc.descriptors],
    ).apply(pd.to_numeric, errors='coerce')

    if filter:
        d_df = d_df.loc[:, filter]

    # Not in place: d_df may be a selection of the full table at this point.
    d_df = d_df.fillna(0)
    return d_df.values
def __init__(self, s, e, n, table):
    """Store the parameters and load the data this object works on.

    Parameters
    ----------
    s, e, n
        Stored unchanged as ``self.s``, ``self.e`` and ``self.n``.
    table : str or path-like
        CSV file read into ``self.table``.

    ``self.data`` is loaded from ``data.npy`` in the current directory and
    ``self.calc`` is a 2D Mordred calculator.
    """
    self.s, self.e, self.n = s, e, n

    # Inputs: a fixed array file plus the CSV table named by the caller.
    self.data = np.load("data.npy")
    self.table = pd.read_csv(table)

    self.calc = Calculator(descriptors, ignore_3D=True)
def test_by_references():
    """Yield one check per reference value stored in the YAML fixtures.

    Descriptors are computed for every molecule of ``structures.sdf`` and
    compared with the values listed in the ``*.yaml`` files of ``data_dir``.
    Entries marked ``'skip'`` are ignored. When a fixture defines ``digit``
    the comparison is approximate and a NaN reference requires a missing
    value; otherwise the comparison is exact.
    """
    # APol/BPol are replaced by instances constructed with True.
    calc = Calculator(
        d
        for d in all_descriptors()
        if d.__class__ not in [Polarizability.APol, Polarizability.BPol]
    )
    calc.register([
        Polarizability.APol(True),
        Polarizability.BPol(True),
    ])

    actuals = dict()
    for mol in Chem.SDMolSupplier(os.path.join(data_dir, 'structures.sdf'), removeHs=False):
        actuals[mol.GetProp('_Name')] = {str(d): v for d, v in zip(calc.descriptors, calc(mol))}

    yaml_paths = glob(os.path.join(data_dir, '*.yaml')) + glob(os.path.join(data_dir, '**/*.yaml'))
    for path in yaml_paths:
        # Load inside a context manager so the handle is closed before any
        # check is yielded.
        with open(path) as reference_file:
            tests = yaml.load(reference_file, Loader=Loader)

        for test in tests:
            dnames = test['names']
            if not isinstance(dnames, list):
                dnames = [dnames]

            desireds = (
                (mname, zip(dnames, values if isinstance(values, list) else [values]))
                for mname, values in test['results'].items()
            )

            digit = test.get('digit')
            if digit is None:
                assert_f = eq_
            else:
                # Bind ``digit`` as a default so the check keeps this
                # fixture's precision even if it runs after the generator
                # has moved on to another fixture.
                def assert_f(a, d, m, digit=digit):
                    if np.isnan(d):
                        assert isinstance(a, MissingValueBase)
                        return

                    assert_almost_equal(a, d, digit, m)

            for mname, descs in desireds:
                for dname, desired in descs:
                    if desired != 'skip':
                        yield (
                            assert_f,
                            actuals[mname][dname],
                            desired,
                            '{} of {}'.format(dname, mname)
                        )
def test_parallel():
    """Yield checks that ``calc.map`` gives the same values with and without ``nproc=1``.

    Three molecules are built from SMILES, given explicit hydrogens and
    embedded. Missing values are compared through their pickled form; every
    other value is compared to 7 decimal places.
    """
    calc = Calculator(all_descriptors())

    smiles = [
        'c1ccccc1',
        'C1=CC(=C(C=C1C2=C(C=C3C(=CC(=CC3=[O+]2)O)O)O)O)O',
        'CCCCCC',
    ]
    mols = [Chem.AddHs(Chem.MolFromSmiles(smi)) for smi in smiles]
    for mol in mols:
        Chem.EmbedMolecule(mol)

    serial_rows = calc.map(mols, nproc=1, quiet=True)
    parallel_rows = calc.map(mols, quiet=True)
    for serial, parallel in zip(serial_rows, parallel_rows):
        for d, s, p in zip(calc.descriptors, serial, parallel):
            if not isinstance(s, MissingValueBase):
                yield assert_almost_equal, s, p, 7, str(d)
            else:
                yield eq_, pickle.dumps(s), pickle.dumps(p), str(d)
def construct_mordred_features(table_in):
    """Build a numeric Mordred feature matrix from a table of compounds.

    Parameters
    ----------
    table_in : pandas.DataFrame
        Table with the columns ``Compound`` (names) and ``smiles``.

    Returns
    -------
    pandas.DataFrame
        The numeric descriptor columns (3D descriptors are not ignored),
        indexed by compound name.
    """
    from rdkit import Chem
    from mordred import Calculator, descriptors

    # Create descriptors
    calculator = Calculator(descriptors, ignore_3D=False)

    # Get features
    compound_smiles = list(table_in.smiles)
    compound_names = list(table_in.Compound)
    molecules = [Chem.MolFromSmiles(smi) for smi in compound_smiles]

    # Clean up: keep the numeric columns only and label rows by compound.
    feat_table = calculator.pandas(molecules).select_dtypes(["number"])
    feat_table.index = compound_names
    return feat_table
def test_pickle_calculator():
    """Yield checks that a Calculator survives a pickle round trip.

    Besides all descriptors, the calculator holds descriptors derived from
    the first two through arithmetic operators. The round-tripped copy must
    list equal descriptors and compute the same values for a test molecule;
    missing values are compared by class.
    """
    orig = Calculator(all_descriptors())
    d0, d1 = orig.descriptors[0], orig.descriptors[1]

    derived = [
        d0 + d1,
        d0 - d1,
        d0 * d1,
        d0 // d1,
        d0 % d1,
        d0 ** d1,
        -d0,
        +d1,
        abs(d0),
        math.trunc(d0),
    ]
    orig.register(derived)

    # ceil/floor are registered on Python 3 only.
    if six.PY3:
        orig.register([math.ceil(d0), math.floor(d1)])

    pickled = pickle.loads(pickle.dumps(orig))
    mol = Chem.MolFromSmiles('c1ccccc1C(O)O')

    for a, b in zip(orig.descriptors, pickled.descriptors):
        yield eq_, a, b

    for a, b in zip(orig(mol), pickled(mol)):
        if not isinstance(a, MissingValueBase):
            yield assert_almost_equal, a, b
        else:
            yield eq_, a.__class__, b.__class__
def test_Calculator_descriptors():
    """Yield checks on how registering, deleting and assigning descriptors
    changes the number of descriptors held by a Calculator."""
    calc = Calculator()

    def check(expected_len, msg=None):
        # Current descriptor count, and every entry is a Descriptor instance.
        yield eq_, len(calc), expected_len, msg
        yield ok_, all(isinstance(d, Descriptor) for d in calc.descriptors)

    # register instance
    calc.register(Dummy.Dummy2())
    for case in check(1, 'instance register failed'):
        yield case

    # register class: Dummy1 must be rejected and leave the count unchanged
    yield raises(ValueError)(lambda: calc.register(Dummy.Dummy1))
    for case in check(1):
        yield case

    # Dummy2 again, this time as a class: the count stays at 1
    calc.register(Dummy.Dummy2)
    for case in check(1):
        yield case

    calc.register(Dummy.Dummy3)
    for case in check(2):
        yield case

    calc.register(Dummy.Dummy4)
    for case in check(4):
        yield case

    # register module
    calc.register(Dummy)
    for case in check(7):
        yield case

    # delete
    del calc.descriptors
    for case in check(0):
        yield case

    # set instance
    calc.descriptors = Dummy.Dummy2()
    for case in check(1):
        yield case