def check_data(self, df):
    """Report ions in ``df.smiles`` whose SMILES string looks malformed.

    Three independent checks are run on every entry and each failure is
    reported through ``display`` together with the time this method started:

    * RDKit cannot parse or sanitize the SMILES,
    * the SMILES contains neither ``+`` nor ``-`` (no charge),
    * the SMILES contains ``.`` (more than one molecular entity).

    Parameters
    ----------
    df : object
        anything exposing an iterable ``smiles`` attribute of SMILES
        strings (presumably a pandas DataFrame -- confirm against callers)
    """
    startTime = datetime.datetime.now()

    def fnDisplay(message):
        display(message, startTime)

    # Iterate over the values themselves: ``smiles[i]`` is a label lookup
    # on a pandas Series and fails when the index is not 0..n-1.
    for ion in df.smiles:
        # MolFromSmiles signals a parse failure by returning None rather
        # than raising, so that case has to be tested explicitly.
        mol = Chem.MolFromSmiles(ion)
        interpretable = mol is not None
        if interpretable:
            try:
                Chem.SanitizeMol(mol)
            except ValueError:
                interpretable = False
        if not interpretable:
            name = salty.check_name(ion)
            message = "RDKit cannot interpret %s ion SMILES in datafile" \
                      % name
            fnDisplay(message)
        if "-" not in ion and "+" not in ion:
            name = salty.check_name(ion)
            message = "%s ion does not have a charge" % name
            fnDisplay(message)
        if "." in ion:
            name = salty.check_name(ion)
            message = "%s ion contains more than one molecular entity" \
                      % name
            fnDisplay(message)
def _show_ion(genes, target, mutation_attempts, sim_score, molecular_relative,
              models, deslists, anion_smiles, exp_data=None):
    """
    for printing results to the screen. _show_ion is called when a candidate
    has achieved the desired fitness score and is returned by the engine

    Parameters
    ----------
    genes : str
        SMILES of the candidate cation
    target : array, float, or int
        forwarded to ``_get_fitness``
    mutation_attempts : int
        value printed as "Mutation Attempts"
    sim_score : float
        similarity score to print; replaced when ``exp_data`` is given
    molecular_relative : str
        identifier passed to ``salty.check_name`` for printing; replaced
        when ``exp_data`` is given
    models, deslists
        forwarded to ``_get_fitness``
    anion_smiles : str
        SMILES of the paired anion
    exp_data : object, optional
        when truthy, the closest relative and its similarity score are
        recomputed against the candidates stored in
        ``exp_data.Data_summary``
    """
    mol = Chem.MolFromSmiles(genes)
    anion = Chem.MolFromSmiles(anion_smiles)
    fitness, mol_property = _get_fitness(anion, genes, target, models,
                                         deslists)
    anion_name = salty.check_name(anion_smiles)
    if exp_data:
        chrom = genetic.Chromosome(genes, fitness)
        # NOTE(review): eval() executes whatever text the summary holds;
        # only safe while exp_data comes from a trusted source.
        exp_parent_candidates = eval(exp_data.Data_summary.iloc[1][0])
        # Keep the score and the relative as a matching pair: both now
        # refer to the experimental candidates instead of mixing the
        # caller's score with a freshly looked-up relative.
        sim_score, sim_index = \
            genetic.molecular_similarity(chrom, exp_parent_candidates)
        molecular_relative = exp_parent_candidates[sim_index]
    print("{}\t{}".format("Salt Smiles: ", genes))
    print("{}\t{}".format("Cation Heavy Atoms: ", mol.GetNumAtoms()))
    print("Tanimoto Similarity Score: \t{0:10.3f}".format(sim_score))
    print("{}\t{}".format("Molecular Relative: ",
                          salty.check_name(molecular_relative)))
    print("{}\t{}".format("Anion: ", anion_name))
    print("{}\t{}".format("Model Prediction: ", mol_property))
    print("{}\t{}".format("Mutation Attempts: ", mutation_attempts))
def _show_ion(genes, target, mutation_attempts, sim_score, molecular_relative,
              models, deslists, anion_smiles):
    """
    Print a tab-separated summary of a candidate salt to the screen.

    The cation is parsed from ``genes`` and the anion from ``anion_smiles``;
    the model prediction comes from ``_get_fitness``. Names of the anion and
    of ``molecular_relative`` are resolved through ``salty.check_name``.
    """
    cation = Chem.MolFromSmiles(genes)
    counter_ion = Chem.MolFromSmiles(anion_smiles)
    _, predicted = _get_fitness(counter_ion, genes, target, models, deslists)
    counter_ion_name = salty.check_name(anion_smiles)

    # One shared formatter for every "label<TAB>value" row.
    row = "{}\t{}".format
    print(row("Salt Smiles: ", genes))
    print(row("Cation Heavy Atoms: ", cation.GetNumAtoms()))
    print("Tanimoto Similarity Score: \t{0:10.3f}".format(sim_score))
    relative_name = salty.check_name(molecular_relative)
    print(row("Molecular Relative: ", relative_name))
    print(row("Anion: ", counter_ion_name))
    print(row("Model Prediction: ", predicted))
    print(row("Mutation Attempts: ", mutation_attempts))
def _show_ion(genes, target, mutation_attempts, sim_score, molecular_relative,
              model_ID, anion):
    """
    Print a short report for a candidate to the screen.

    The atom count is read from the molecule parsed out of ``genes``, the
    prediction comes from ``_get_fitness`` and the molecular relative is
    printed under the name returned by ``salty.check_name``.
    """
    candidate = Chem.MolFromSmiles(genes)
    _, predicted = _get_fitness(anion, genes, target, model_ID)

    # One shared formatter for the "label<TAB>value" rows.
    labelled = "{}\t{}".format
    print(labelled("number of atoms: ", candidate.GetNumAtoms()))
    print(labelled("mutation attempts: ", mutation_attempts))
    print("with prediction: \t{}".format(predicted))
    print("similarity score: {0:10.3f}".format(sim_score))
    relative_name = salty.check_name(molecular_relative)
    print("{}\t{}\n".format("molecular relative: ", relative_name))
class iupac_smiles_tests(unittest.TestCase):
    """Sanity checks for the SMILES entries of the ion info data files."""

    data_files = ["cationInfo.csv", "anionInfo.csv"]

    # NOTE: the statements below are part of the class body, so they run
    # once when the class is defined (i.e. when the module is imported),
    # not as part of any individual test.
    df = salty.load_data(data_files[0])
    smiles = df.smiles
    for ion in smiles:
        salty.check_name(ion)

    def test_1_check_data(self):
        """Run ``check_data`` on every file listed in ``data_files``."""
        for data_file in self.data_files:
            df = salty.load_data(data_file)
            self.check_data(df)

    def test_2_check_wrong_ion(self):
        """Call ``salty.check_name`` with a string that is not an ion."""
        ion = 'stupid_nonsense_string'
        salty.check_name(ion)

    def test_benchmark(self):
        """Hand the two tests above to ``salty.Benchmark.run``."""
        salty.Benchmark.run(self.test_1_check_data)
        salty.Benchmark.run(self.test_2_check_wrong_ion)

    def check_data(self, df):
        """Report ions in ``df.smiles`` whose SMILES string looks malformed.

        Each entry is checked for: RDKit being unable to parse or sanitize
        it, having neither ``+`` nor ``-`` (no charge), and containing
        ``.`` (more than one molecular entity). Failures are reported via
        ``display`` together with the time this method started.
        """
        startTime = datetime.datetime.now()

        def fnDisplay(message):
            display(message, startTime)

        # Iterate over the values themselves: ``smiles[i]`` is a label
        # lookup on a pandas Series and fails on a non-default index.
        for ion in df.smiles:
            # MolFromSmiles signals a parse failure by returning None
            # rather than raising, so test for it explicitly.
            mol = Chem.MolFromSmiles(ion)
            interpretable = mol is not None
            if interpretable:
                try:
                    Chem.SanitizeMol(mol)
                except ValueError:
                    interpretable = False
            if not interpretable:
                name = salty.check_name(ion)
                message = "RDKit cannot interpret %s ion SMILES in datafile" \
                          % name
                fnDisplay(message)
            if "-" not in ion and "+" not in ion:
                name = salty.check_name(ion)
                message = "%s ion does not have a charge" % name
                fnDisplay(message)
            if "." in ion:
                name = salty.check_name(ion)
                message = "%s ion contains more than one molecular entity" \
                          % name
                fnDisplay(message)
def test_2_check_wrong_ion(self):
    """Call ``salty.check_name`` with a string that is not an ion.

    No assertion is made on the return value; the test only exercises the
    lookup with an input that cannot match a real entry.
    """
    ion = 'stupid_nonsense_string'
    salty.check_name(ion)
# NOTE(review): the first message reports len(four) as "salts of 2 each";
# confirm the wording against where ``four`` and ``more`` are built.
print('There are ' + str(len(four)) + ' salts of 2 each')
print('There are ' + str(len(more)) + ' salts of 2 or more each')

# Split every entry of ``two`` into its first element (collected as the
# cation) and its second element (collected as the anion).
cation2 = [pair[0] for pair in two]
anion2 = [pair[1] for pair in two]
error2_anion = []
error2_cation = []

# Collect every ion for which check_name raises UnboundLocalError. The
# exception type is now named in the ``except`` clause itself, so unrelated
# errors (and Ctrl-C) are no longer silently swallowed.
for i in anion2:
    try:
        check_name(i)
    except UnboundLocalError:
        error2_anion.append(i)
for i in cation2:
    try:
        check_name(i)
    except UnboundLocalError:
        error2_cation.append(i)

print('There are ' + str(len(set(error2_anion))) + ' unique missing anions from the data base')
def generate_solvent(target, model_ID, heavy_atom_limit=50,
                     sim_bounds=[0.4, 1.0], hits=1, write_file=False):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str
        the name of the model to be used by the engine. Gains has
        several built-in models to choose from
    heavy_atom_limit : int, optional
        the upper value for allowable heavy atoms in the returned
        candidate
    sim_bounds : array, optional
        the tanimoto similarity score between the returned candidate
        and its closest molecular relative in parent_candidates; a
        candidate is kept when sim_bounds[0] <= score < sim_bounds[1]
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True, a pdb file is written for the cation
        and anion of every solution and the log is written to
        "salt_log.csv" in the current directory

    Returns
    -------
    new : object
        when write_file is False, a pandas DataFrame logging the
        solution(s). When write_file is True the results are only
        written to disk and nothing is returned
    """
    # Load the summary once; both candidate lists come from the same file.
    summary = genetic.load_data("{}_summary.csv".format(model_ID))
    # NOTE(review): eval() executes whatever text the summary file holds;
    # this is only safe while the file comes from a trusted source.
    parent_candidates = eval(summary.loc[1][1])
    anion_candidates = eval(summary.loc[2][1])
    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)
    for i in range(1, hits + 1):
        # Zero-padded to two digits: C01..C09, C10, ...
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        # Keep drawing anions and evolving cations until one passes every
        # filter and can be embedded in 3D.
        while True:
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target, anion, parent_candidates,
                                   model_ID)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, parent_candidates)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)
            if cation_heavy_atoms >= heavy_atom_limit:
                continue
            if not sim_bounds[0] <= tan_sim_score < sim_bounds[1]:
                continue
            # ``in`` on a pandas Series tests the index labels, so compare
            # against the stored values to actually reject duplicates.
            if salt_smiles in salts["Salt Smiles"].values:
                continue
            _, pre = _get_fitness(anion, best.Genes, target, model_ID)
            salt_ID = CAT_ID + "_" + AN_ID
            molecular_relative = salty.check_name(
                parent_candidates[sim_index])
            anion_name = salty.check_name(anion_smiles)
            new_entry = pd.DataFrame([[
                salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                molecular_relative, anion_name, pre
            ]], columns=cols[:-2])
            try:
                cation = Chem.AddHs(best.Mol)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                anion = Chem.AddHs(anion)
                Chem.EmbedMolecule(anion, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(anion)
            except Exception:
                # Embedding/optimisation failed: try another candidate.
                # ``Exception`` (not ``BaseException``) so Ctrl-C can still
                # break out of this retry loop.
                continue
            new = pd.DataFrame(pd.concat([salts, new_entry]), columns=cols)
            if write_file:
                MolToPDBFile(cation, "{}.pdb".format(CAT_ID))
                MolToPDBFile(anion, "{}.pdb".format(AN_ID))
            break
        if write_file:
            # Rewritten after every hit so the log survives an interrupted
            # run.
            new.to_csv(path_or_buf="salt_log.csv", index=False)
        salts = new
    if not write_file:
        # ``salts`` is the latest log; it is the empty frame when hits < 1.
        return salts
def generate_solvent(target, model_ID, heavy_atom_limit=50,
                     sim_bounds=[0, 1.0], hits=1, write_file=False, seed=None,
                     hull=None, simplex=None, path=None, exp_data=None,
                     verbose=0, gen_token=False, hull_bounds=[0, 1],
                     inner_search=True, parent_cap=25, mutation_cap=1000):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str or sequence of str
        the name(s) of the model(s) to be used by the engine. Gains has
        several built-in models to choose from. A single name is treated
        as a one-element list
    heavy_atom_limit : int, optional
        the upper value for allowable heavy atoms in the returned
        candidate
    sim_bounds : array, optional
        the tanimoto similarity score between the returned candidate
        and its closest molecular relative in parent_candidates; a
        candidate is kept when sim_bounds[0] <= score < sim_bounds[1]
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True, pdb files of the cations/anions and a
        csv log file are written to the current directory
    seed : int, optional
        optional randint seed for unittest consistency. A falsy seed
        (None or 0) leaves the random module unseeded here
    hull : pandas DataFrame, optional
        nxm pandas DataFrame to use convex hull search strategy. hull
        columns should be the same properties used in the genetic algorithm
        fitness test
    simplex : array, optional
        array to access boundary datapoints in the convex hull. This is used
        during target resampling defined by the convex hull/simplex
    path : str, optional
        absolute path to the qspr model used as the fitness function
    exp_data: salty devmodel obj, optional
        used during hull target reassignment search strategy. Salty devmodel
        object of the original experimental data
    verbose : int, optional, default 0
        0 : most verbose. Best child, parent/target resampling,
            sanitization failure
        1 : parent/target resampling, solution metadata, sanitization failure
        2 : solution metadata, sanitization failure
        3 : target resampling, csv-formatted solution metadata
        4 : csv-formatted solution metadata
    gen_token : int, str, optional
        a string or integer to append to file outputs. Useful in the case of
        parallel searches.
    hull_bounds : array, optional
        if hull and simplex are not none, hull_bounds describes the
        proximity convex_search should be to the simplex
    inner_search : bool, optional
        if hull and simplex are not none, inner_search specifies if
        convex_search should return values only within the convex hull
    parent_cap : int, optional
        forwarded unchanged to ``_guess_password``
    mutation_cap : int, optional
        forwarded unchanged to ``_guess_password``

    Returns
    -------
    new : object
        when write_file is False, a pandas DataFrame logging the
        solution(s). When write_file is True the results are only
        written to disk and nothing is returned
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(model_ID, str):
        model_ID = [model_ID]
    parent_candidates = []
    anion_candidates = []
    models = []
    deslists = []
    for i, name in enumerate(model_ID):
        if path:
            model = np.array(
                [load_model(join(path, '{}_qspr.h5'.format(name)))])
            with open(join(path, '{}_desc.csv'.format(name)),
                      'rb') as csv_file:
                deslist = list([pd.read_csv(csv_file, encoding='latin1')])
            with open(join(path, '{}_summ.csv'.format(name)),
                      'rb') as csv_file:
                summary = pd.read_csv(csv_file, encoding='latin1')
        else:
            model = np.array(
                [genetic.load_data("{}_qspr.h5".format(name), h5File=True)])
            deslist = list([genetic.load_data("{}_desc.csv".format(name))])
            summary = genetic.load_data("{}_summ.csv".format(name))
        # NOTE(review): eval() executes whatever text the summary holds;
        # this is only safe while the files come from a trusted source.
        parents = eval(summary.iloc[1][1])
        anions = eval(summary.iloc[2][1])
        if i > 0:
            parent_candidates = np.concatenate((parents, parent_candidates))
            anion_candidates = np.concatenate((anions, anion_candidates))
            models = np.concatenate((models, model))
            deslists = list([deslists, deslist])
        else:
            parent_candidates = parents
            anion_candidates = anions
            models = model
            deslists = deslist
    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)
    # Candidates used to find the closest molecular relative. Parsed once
    # here instead of on every retry of the search loop.
    if exp_data:
        anion_candidates = eval(exp_data.Data_summary.iloc[2][0])
        relative_candidates = eval(exp_data.Data_summary.iloc[1][0])
    else:
        relative_candidates = parent_candidates
    # Prefix shared by every file written for this run.
    file_prefix = "{}_".format(gen_token) if gen_token else ""
    for i in range(1, hits + 1):
        # Zero-padded to two digits: C01..C09, C10, ...
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        while True:
            if seed:
                random.seed(seed)
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target, anion_smiles, parent_candidates,
                                   models, deslists, seed=seed, hull=hull,
                                   simplex=simplex, exp_data=exp_data,
                                   verbose=verbose, hull_bounds=hull_bounds,
                                   inner_search=inner_search,
                                   parent_cap=parent_cap,
                                   mutation_cap=mutation_cap)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, relative_candidates)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)
            # NOTE(review): ``in`` on a pandas Series tests index labels,
            # not values, so the last clause is always True and duplicates
            # are not rejected. Left as is on purpose: with ``seed`` set the
            # RNG is reseeded on every retry, so rejecting duplicates could
            # loop forever when hits > 1. Fix together with the seeding.
            if cation_heavy_atoms < heavy_atom_limit and \
                    sim_bounds[0] <= tan_sim_score < sim_bounds[1] and \
                    salt_smiles not in salts["Salt Smiles"]:
                _, pre = _get_fitness(anion, best.Genes, target, models,
                                      deslists)
                salt_ID = CAT_ID + "_" + AN_ID
                molecular_relative = salty.check_name(
                    relative_candidates[sim_index])
                anion_name = salty.check_name(anion_smiles)
                new_entry = pd.DataFrame([[
                    salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                    molecular_relative, anion_name, pre
                ]], columns=cols[:-2])
                try:
                    cation = Chem.AddHs(best.Mol)
                    Chem.EmbedMolecule(cation, Chem.ETKDG())
                    Chem.UFFOptimizeMolecule(cation)
                    anion = Chem.AddHs(anion)
                    Chem.EmbedMolecule(anion, Chem.ETKDG())
                    Chem.UFFOptimizeMolecule(anion)
                    new = pd.DataFrame(pd.concat([salts, new_entry]),
                                       columns=cols)
                except Exception:
                    # ``Exception`` (not ``BaseException``) so Ctrl-C can
                    # still break out of this retry loop.
                    # ``verbose == any([...])`` compared against True and
                    # so only ever matched verbose == 1.
                    if verbose in (0, 1, 2):
                        print("molecule not sanitizable")
                    continue
                if write_file:
                    if verbose in (3, 4):
                        print(new)
                    MolToPDBFile(cation,
                                 "{}{}.pdb".format(file_prefix, CAT_ID))
                    MolToPDBFile(anion,
                                 "{}{}.pdb".format(file_prefix, AN_ID))
                break
            else:
                continue
        if write_file:
            # Rewritten after every hit so the log survives an interrupted
            # run.
            new.to_csv(path_or_buf="{}salt_log.csv".format(file_prefix),
                       index=False)
        salts = new
    if not write_file:
        # ``salts`` is the latest log; it is the empty frame when hits < 1.
        return salts