def get_fitness(anion, genes, target):
    """
    Score a candidate cation against a target value using the "density"
    model.

    Parameters
    ----------
    anion : RDKit Mol
        the anion paired with the candidate cation
    genes : str
        SMILES string of the candidate cation
    target : float or int
        the desired property value

    Returns
    -------
    fitness, prediction : tuple
        fitness is 1 minus the absolute relative error between the
        prediction and the target; prediction is the exponentiated model
        output
    """
    model_ID = "density"
    cation = Chem.MolFromSmiles(genes)
    qspr_model = genetic.load_data("{}_qspr.h5".format(model_ID),
                                   h5File=True)
    descriptor_table = genetic.load_data("{}_desc.csv".format(model_ID))

    features = []
    with genetic.suppress_rdkit_sanity():
        # Iterating the table yields its column labels; each label decides
        # where the corresponding feature value comes from.
        for column in descriptor_table:
            if "anion" in column:
                ion = anion
            elif "cation" in column:
                ion = cation
            elif "Temperature, K" in column:
                features.append(298.15)
                continue
            elif "Pressure, kPa" in column:
                features.append(101.325)
                continue
            else:
                print("unknown descriptor in list: %s" % column)
                continue
            # The text before the first '-' is the descriptor name handed
            # to the calculator.
            descriptor_name = column.partition('-')[0]
            features.append(
                calculator([descriptor_name]).CalcDescriptors(ion)[0])

    # Row 0 is subtracted and row 1 divides; presumably these are the
    # training mean and standard deviation -- confirm against the file
    # that produces the descriptor table.
    offsets = descriptor_table.iloc[0].values
    scales = descriptor_table.iloc[1].values
    normalized = (features - offsets) / scales

    model_input = np.array(normalized).reshape(1, -1)
    prediction = exp(qspr_model.predict(model_input)[0])
    relative_error = abs((prediction - target) / target)
    return 1 - relative_error, prediction
class GuessIonTests(unittest.TestCase):
    """
    Runs the genetic engine against a randomly chosen density target,
    using one anion drawn at random from saltInfo.csv.
    """

    # Class-level fixtures: these statements run once, when the class body
    # is executed.
    geneSet = genetic.generate_geneset()
    df = genetic.load_data("saltInfo.csv")['anion_SMILES'].unique()
    ohPickMe = random.sample(range(len(df)), 1)
    anion = Chem.MolFromSmiles(df[ohPickMe[0]])

    def test_1_density(self):
        """Search for a cation matching a random target in [800, 1500)."""
        (target,) = random.sample(range(800, 1500), 1)
        self.guess_password(target)

    def test_benchmark(self):
        """Hand the density test to the engine's benchmark runner."""
        genetic.Benchmark.run(self.test_1_density)

    def guess_password(self, target):
        """
        Run genetic.get_best for ``target`` with an optimal-fitness value
        of 0.99 and return whatever it returns.
        """
        started_at = datetime.datetime.now()

        def score(genes):
            return get_fitness(self.anion, genes, target)

        def report(candidate, mutation):
            display(candidate, mutation, started_at)

        def reveal(genes, target, mutation_attempts):
            show_ion(genes, target, mutation_attempts)

        return genetic.get_best(score, 0.99, self.geneSet, report, reveal,
                                target)
def _get_fitness(anion, genes, target, model_ID):
    """
    the fitness function passed to the engine. Fitness is determined by a
    model developed by the salty module; multi-output models are handled
    by averaging the relative error over all outputs

    Parameters
    ----------
    anion : RDKit Mol
        the anion paired with the candidate cation
    genes : str
        SMILES string of the candidate cation
    target : array, float, or int
        the desired property value(s)
    model_ID : str
        base name used to load "<model_ID>.sav" and
        "<model_ID>_descriptors.csv"

    Returns
    -------
    fitness, prediction : tuple
        fitness is 1 minus the averaged absolute relative error;
        prediction is the exponentiated model output rounded to two
        decimals
    """
    cation = Chem.MolFromSmiles(genes)
    estimator = genetic.load_data("{}.sav".format(model_ID),
                                  pickleFile=True)
    descriptor_table = genetic.load_data(
        "{}_descriptors.csv".format(model_ID))

    def _ion_descriptor(column, mol):
        # Compute one descriptor for ``mol``; the descriptor name is the
        # text before the first '-' of the column label.
        with genetic.suppress_rdkit_sanity():
            return calculator(
                [column.partition('-')[0]]).CalcDescriptors(mol)[0]

    features = []
    for column in descriptor_table:
        if "anion" in column:
            features.append(_ion_descriptor(column, anion))
        elif "cation" in column:
            features.append(_ion_descriptor(column, cation))
        elif "Temperature, K" in column:
            features.append(298.15)
        elif "Pressure, kPa" in column:
            features.append(101.325)
        else:
            print("unknown descriptor in list: %s" % column)

    # Row 0 is subtracted and row 1 divides (presumably mean and standard
    # deviation of the training data -- confirm against the model export).
    shift = descriptor_table.iloc[0].values
    scale = descriptor_table.iloc[1].values
    normalized = (features - shift) / scale

    raw_output = estimator.predict(np.array(normalized).reshape(1, -1))[0]
    prediction = np.round(np.exp(raw_output), decimals=2)
    relative_error = np.average(abs((prediction - target) / target))
    return 1 - relative_error, prediction
def generate_solvent(target, model_ID, heavy_atom_limit=50,
                     sim_bounds=[0.4, 1.0], hits=1, write_file=False):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str
        the name of the model to be used by the engine. Gains has
        several built-in models to choose from
    heavy_atom_limit : int, optional
        the upper value for allowable heavy atoms in the returned
        candidate
    sim_bounds : array, optional
        the tanimoto similarity score between the returned candidate
        and its closest molecular relative in parent_candidates. A
        candidate is accepted when
        sim_bounds[0] <= score < sim_bounds[1]. The default list is only
        read, never mutated
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True the log is written to salt_log.csv
        and one pdb file per cation/anion is written

    Returns
    -------
    new : object
        when write_file is False, a pandas DataFrame logging the
        solution(s) (empty if hits is 0). When write_file is True nothing
        is returned; the csv log and pdb files are written instead
    """
    # NOTE(security): eval() executes whatever text the summary file
    # contains. Only use summary files from a trusted source.
    summary = genetic.load_data("{}_summary.csv".format(model_ID))
    parent_candidates = eval(summary.loc[1][1])
    anion_candidates = eval(summary.loc[2][1])
    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)
    for i in range(1, hits + 1):
        # Zero-padded to two digits: C01 ... C09, C10, ...
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        salt_ID = CAT_ID + "_" + AN_ID
        while True:
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target, anion, parent_candidates,
                                   model_ID)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, parent_candidates)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)

            if cation_heavy_atoms >= heavy_atom_limit:
                continue
            if not sim_bounds[0] <= tan_sim_score < sim_bounds[1]:
                continue
            # `x in Series` tests the index labels, not the stored values,
            # so compare against the column's values explicitly.
            if salt_smiles in salts["Salt Smiles"].tolist():
                continue

            _, pre = _get_fitness(anion, best.Genes, target, model_ID)
            molecular_relative = salty.check_name(
                parent_candidates[sim_index])
            anion_name = salty.check_name(anion_smiles)
            new_entry = pd.DataFrame([[
                salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                molecular_relative, anion_name, pre
            ]], columns=cols[:-2])
            try:
                # A candidate that cannot be embedded/optimised in 3D is
                # discarded and another one is generated.
                cation = Chem.AddHs(best.Mol)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                anion = Chem.AddHs(anion)
                Chem.EmbedMolecule(anion, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(anion)
            except Exception:
                # Exception rather than BaseException so Ctrl-C and
                # SystemExit can still stop this unbounded retry loop.
                continue
            if write_file:
                MolToPDBFile(cation, "{}.pdb".format(CAT_ID))
                MolToPDBFile(anion, "{}.pdb".format(AN_ID))
            break
        salts = pd.DataFrame(pd.concat([salts, new_entry]), columns=cols)
        if write_file:
            # Rewritten after every hit so a partial log survives an
            # interrupted run.
            salts.to_csv(path_or_buf="salt_log.csv", index=False)
    if not write_file:
        return salts
def test_1_similarity_map(self):
    """Load saltInfo.csv and keep the rows whose cation name contains
    "imid" (case-insensitive)."""
    # NOTE(review): the filtered frame is not used in the visible body;
    # the test may have been longer originally -- confirm.
    salt_info = genetic.load_data("saltInfo.csv")
    has_imid = salt_info["cation_name"].str.contains("imid", case=False)
    salt_info = salt_info.loc[has_imid]
def generate_solvent(target, model_ID, heavy_atom_limit=50,
                     sim_bounds=[0, 1.0], hits=1, write_file=False,
                     seed=None, hull=None, simplex=None, path=None,
                     exp_data=None, verbose=0, gen_token=False,
                     hull_bounds=[0, 1], inner_search=True, parent_cap=25,
                     mutation_cap=1000):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str or list of str
        the name(s) of the model(s) to be used by the engine. Gains has
        several built-in models to choose from. A single string is
        treated as a one-element list
    heavy_atom_limit : int, optional
        the upper value for allowable heavy atoms in the returned
        candidate
    sim_bounds : array, optional
        the tanimoto similarity score between the returned candidate
        and its closest molecular relative in parent_candidates. A
        candidate is accepted when
        sim_bounds[0] <= score < sim_bounds[1]
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True the csv log and one pdb file per
        cation/anion are written
    seed : int, optional
        optional randint seed for unittest consistency. Applied once,
        before the first candidate is drawn, and also forwarded to
        _guess_password
    hull : pandas DataFrame, optional
        nxm pandas DataFrame to use convex hull search strategy. hull
        columns should be the same properties used in the genetic
        algorithm fitness test
    simplex : array, optional
        array to access boundary datapoints in the convex hull. This is
        used during target resampling defined by the convex hull/simplex
    path : str, optional
        absolute path to the qspr model used as the fitness function
    exp_data : salty devmodel obj, optional
        used during hull target reassignment search strategy. Salty
        devmodel object of the original experimental data. When given,
        its Data_summary supplies the anion candidates and the parent
        list used for the similarity search
    verbose : int, optional, default 0
        0 : most verbose. Best child, parent/target resampling,
            sanitization failure
        1 : parent/target resampling, solution metadata, sanitization
            failure
        2 : solution metadata, sanitization failure
        3 : target resampling, csv-formatted solution metadata
        4 : csv-formatted solution metadata
    gen_token : int, str, optional
        a string or integer to prepend to file outputs. Useful in the
        case of parallel searches. Falsy values (the default) add no
        prefix
    hull_bounds : array, optional
        if hull and simplex are not none, hull_bounds describes the
        proximity convex_search should be to the simplex
    inner_search : bool, optional
        if hull and simplex are not none, inner_search specifies if
        convex_search should return values only within the convex hull
    parent_cap : int, optional
        forwarded unchanged to _guess_password
    mutation_cap : int, optional
        forwarded unchanged to _guess_password

    Returns
    -------
    new : object
        when write_file is False, a pandas DataFrame logging the
        solution(s) (empty if hits is 0). When write_file is True nothing
        is returned; the csv log and pdb files are written instead
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(model_ID, str):
        model_ID = [model_ID]

    parent_candidates = []
    anion_candidates = []
    models = []
    deslists = []
    for i, name in enumerate(model_ID):
        if path:
            model = np.array(
                [load_model(join(path, '{}_qspr.h5'.format(name)))])
            with open(join(path, '{}_desc.csv'.format(name)),
                      'rb') as csv_file:
                deslist = [pd.read_csv(csv_file, encoding='latin1')]
            with open(join(path, '{}_summ.csv'.format(name)),
                      'rb') as csv_file:
                summary = pd.read_csv(csv_file, encoding='latin1')
        else:
            model = np.array(
                [genetic.load_data("{}_qspr.h5".format(name), h5File=True)])
            deslist = [genetic.load_data("{}_desc.csv".format(name))]
            summary = genetic.load_data("{}_summ.csv".format(name))
        # NOTE(security): eval() executes whatever text the summary file
        # contains. Only use summary files from a trusted source.
        parents = eval(summary.iloc[1][1])
        anions = eval(summary.iloc[2][1])
        if i > 0:
            parent_candidates = np.concatenate((parents, parent_candidates))
            anion_candidates = np.concatenate((anions, anion_candidates))
            models = np.concatenate((models, model))
            # NOTE(review): this nests one level deeper per extra model
            # ([[a], [b]], then [[[a], [b]], [c]]); kept as-is because the
            # consumers of deslists are outside this block -- confirm.
            deslists = [deslists, deslist]
        else:
            parent_candidates = parents
            anion_candidates = anions
            models = model
            deslists = deslist

    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)

    # The list searched for the closest molecular relative is fixed for
    # the whole run, so it is built once instead of on every attempt.
    if exp_data:
        anion_candidates = eval(exp_data.Data_summary.iloc[2][0])
        similarity_pool = eval(exp_data.Data_summary.iloc[1][0])
    else:
        similarity_pool = parent_candidates

    file_prefix = "{}_".format(gen_token) if gen_token else ""

    # Seed once, up front. Re-seeding on every attempt would replay the
    # same anion draw after a rejected candidate. `is not None` so that
    # seed=0 is honoured.
    if seed is not None:
        random.seed(seed)

    for i in range(1, hits + 1):
        # Zero-padded to two digits: C01 ... C09, C10, ...
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        salt_ID = CAT_ID + "_" + AN_ID
        while True:
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target, anion_smiles, parent_candidates,
                                   models, deslists, seed=seed, hull=hull,
                                   simplex=simplex, exp_data=exp_data,
                                   verbose=verbose, hull_bounds=hull_bounds,
                                   inner_search=inner_search,
                                   parent_cap=parent_cap,
                                   mutation_cap=mutation_cap)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, similarity_pool)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)

            if cation_heavy_atoms >= heavy_atom_limit:
                continue
            if not sim_bounds[0] <= tan_sim_score < sim_bounds[1]:
                continue
            # `x in Series` tests the index labels, not the stored values,
            # so compare against the column's values explicitly.
            if salt_smiles in salts["Salt Smiles"].tolist():
                continue

            _, pre = _get_fitness(anion, best.Genes, target, models,
                                  deslists)
            molecular_relative = salty.check_name(
                similarity_pool[sim_index])
            anion_name = salty.check_name(anion_smiles)
            new_entry = pd.DataFrame([[
                salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                molecular_relative, anion_name, pre
            ]], columns=cols[:-2])
            try:
                # A candidate that cannot be embedded/optimised in 3D is
                # discarded and another one is generated.
                cation = Chem.AddHs(best.Mol)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                anion = Chem.AddHs(anion)
                Chem.EmbedMolecule(anion, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(anion)
            except Exception:
                # Exception rather than BaseException so Ctrl-C and
                # SystemExit can still stop this unbounded retry loop.
                # `verbose == any([...])` compared against True and so
                # only matched verbose=1; test membership instead.
                if verbose in (0, 1, 2):
                    print("molecule not sanitizable")
                continue
            new = pd.DataFrame(pd.concat([salts, new_entry]), columns=cols)
            if write_file:
                if verbose in (3, 4):
                    print(new)
                MolToPDBFile(cation,
                             "{}{}.pdb".format(file_prefix, CAT_ID))
                MolToPDBFile(anion, "{}{}.pdb".format(file_prefix, AN_ID))
            break
        salts = new
        if write_file:
            # Rewritten after every hit so a partial log survives an
            # interrupted run.
            salts.to_csv(
                path_or_buf="{}salt_log.csv".format(file_prefix),
                index=False)
    if not write_file:
        return salts