def set_computable(self):
    """Compute and store the derived identifiers for this record.

    Parses ``self._mol`` through ``tool_chemical.read_string`` using the
    ``"mol"`` format tag, then fills in ``_smiles`` (generated with
    ``isomericSmiles=False``), ``_inchi``, ``_inchikey``,
    ``_molecular_formula`` and ``_molecular_weight`` from the parsed
    molecule.
    """
    parsed = tool_chemical.read_string("mol", self._mol)

    # Line notations / identifiers.
    self._smiles = Chem.MolToSmiles(parsed, isomericSmiles=False)
    self._inchi = inchi.MolToInchi(parsed)
    self._inchikey = inchi.MolToInchiKey(parsed)

    # Composition descriptors.
    self._molecular_formula = Chem.CalcMolFormula(parsed)
    self._molecular_weight = Chem.CalcExactMolWt(parsed)
def add_to_summary(summary_dic, conf_dic, smiles, save_dir):
    """Register one species in the summary dictionary.

    The entry stored under ``smiles`` is a copy of ``conf_dic`` without its
    ``"conformers"`` key, extended with a ``"pickle_path"`` key that points
    to ``<absolute save_dir>/<inchikey>.pickle``.

    :param summary_dic: dictionary to update in place
    :param conf_dic: per-species result dictionary
    :param smiles: SMILES string used as the summary key
    :param save_dir: directory the pickle path is built from
    :return: tuple ``(summary_dic, pickle_path)``
    """
    key = inchi.MolToInchiKey(get_mol(smiles))
    target_dir = os.path.abspath(save_dir)
    pickle_path = os.path.join(target_dir, f"{key}.pickle")

    entry = {}
    for name, value in conf_dic.items():
        if name == "conformers":
            continue
        entry[name] = value
    entry["pickle_path"] = pickle_path

    summary_dic[smiles] = entry
    return summary_dic, pickle_path
def sdf_to_inchikey():
    """Write an ``<input>.inchikey`` listing for the SDF file in ``sys.argv[1]``.

    Each molecule the supplier could parse produces one output line of the
    form ``<InChIKey> <_Name property>``. Entries for which the supplier
    yields a falsy value are skipped.

    The output file is managed by a ``with`` block so it is closed even if
    key generation or the property lookup raises part-way through.
    """
    sdf_path = sys.argv[1]
    molecules = Chem.SDMolSupplier(sdf_path)
    with open(sdf_path + ".inchikey", "w") as out:
        for mol in molecules:
            if not mol:
                continue
            out.write(
                inchi.MolToInchiKey(mol) + " " + mol.GetProp("_Name") + "\n")
def convert(input, input_mod='smi'):
    """
    Convert a SMILES string into other molecular identifiers.

    :param input: SMILES
    :param input_mod: format tag handed to ``read_string`` ('smi')
    :return: str(molecular formula), str(inchi), str(inchikey)
    """
    mol = read_string(input_mod, input)
    return (
        Descriptors.rdMolDescriptors.CalcMolFormula(mol),
        inchi.MolToInchi(mol),
        inchi.MolToInchiKey(mol),
    )
def _append_inchi_keys_dictionary_by_reference(self, inchi_dict: dict, smile: str):
    """Count ``smile`` in ``inchi_dict`` under its InChIKey (in place).

    ``inchi_dict`` maps an InChIKey to a two-element list ``[count, mol]``.
    The first time a key is seen the entry is created with a count of 1 and
    the parsed molecule; afterwards only the count is incremented.

    SMILES that ``Chem.MolFromSmiles`` cannot parse (it returns ``None``) are
    ignored silently. A failure while generating the InChIKey is reported
    through ``self.log_message`` and the SMILES is skipped.

    :param inchi_dict: dictionary updated in place
    :param smile: SMILES string to register
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return

    # Only the key generation is guarded; ``Exception`` (rather than a bare
    # ``except``) lets KeyboardInterrupt/SystemExit propagate.
    try:
        inchi_key = inchi.MolToInchiKey(mol)
    except Exception:
        self.log_message(f"Failed to transform SMILES string: {smile}")
        return

    entry = inchi_dict.get(inchi_key)
    if entry is None:
        inchi_dict[inchi_key] = [1, mol]
    else:
        entry[0] += 1
def set_computables_from_mol(self, mol):
    """Populate the computed properties of this object from ``mol``.

    Sets ``molecular_formula``, ``molecular_weight``, ``inchi``, ``inchikey``
    and ``smiles`` (generated with ``isomericSmiles=False``).

    :param mol: molecule object the descriptors are computed from
    :raises SpectrumError: if any of the computations raises; the original
        exception is chained as ``__cause__``
    :raises AssertionError: if any computed property ends up ``None``
    """
    try:
        # warning comes up in pycharm (bug of pycharm)
        self.molecular_formula = Descriptors.rdMolDescriptors.CalcMolFormula(
            mol)
        self.molecular_weight = Descriptors.ExactMolWt(mol)
        self.inchi = inchi.MolToInchi(mol)
        self.inchikey = inchi.MolToInchiKey(mol)
        self.smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception as e:
        # Format the cause into the message: concatenating the ``e.args``
        # tuple to a str would itself raise TypeError and mask this error.
        raise SpectrumError(
            f"Error occurred while computing properties: {e}") from e

    assert self.molecular_formula is not None, "molecular-formula can't be None"
    assert self.molecular_weight is not None, "molecular-weight can't be None"
    assert self.inchi is not None, "inchi can't be None"
    assert self.inchikey is not None, "inchikey can't be None"
    assert self.smiles is not None, "smiles can't be None"
def _count_unique_inchi_keys(self, smiles):
    """Count how often each distinct structure occurs in ``smiles``.

    Structures are identified by InChIKey; SMILES that cannot be parsed
    (``Chem.MolFromSmiles`` returns ``None``) are ignored. The result is
    ordered by descending count (ties keep first-seen order) and truncated
    to ``self._sample_size`` entries.

    :param smiles: iterable of SMILES strings
    :return: tuple ``(list_of_labels, sorted_mols)`` where each label reads
        ``"Times sampled: <count>"`` and ``sorted_mols`` holds the molecule
        parsed from the first SMILES seen for that key
    """
    # inchi key -> [count, mol]
    inchi_dict = {}
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            continue
        inchi_key = inchi.MolToInchiKey(mol)
        entry = inchi_dict.get(inchi_key)
        if entry is None:
            inchi_dict[inchi_key] = [1, mol]
        else:
            entry[0] += 1

    # ``sorted`` is stable, so equal counts stay in insertion order.
    ranked = sorted(inchi_dict.values(), key=lambda entry: -entry[0])
    ranked = ranked[:self._sample_size]

    list_of_labels = [f"Times sampled: {count}" for count, _ in ranked]
    sorted_mols = [mol for _, mol in ranked]
    return list_of_labels, sorted_mols
def main():
    """Add an InChIKey ``name`` column to a tab-separated table.

    Reads the table named by ``sys.argv[1]``, derives an InChIKey from each
    row's ``smiles`` value, stores the keys in a new ``name`` column, prints
    the table and writes it tab-separated (without index) to ``sys.argv[2]``.
    """
    missingfile, outputfile = sys.argv[1], sys.argv[2]
    missing = pd.read_csv(missingfile, sep="\t", header=0)

    missing['name'] = [
        ri.MolToInchiKey(Chem.MolFromSmiles(record.smiles))
        for _, record in missing.iterrows()
    ]

    print(missing)
    missing.to_csv(outputfile, sep="\t", index=False)
def main():
    """Convert an SDF file into a CSV file with SMILES and InChIKey columns.

    Command-line options:

    * ``-S/--sdf``: SDF file name (required)
    * ``-n/--name``: output CSV file name (required)
    * ``-i/--input_directory``: directory holding the SDF (default ``./``)
    * ``-o/--output_directory``: directory for the CSV (default ``./``)

    Paths are assembled with ``os.path.join`` so directories work with or
    without a trailing separator, and the two file-name options are marked
    required so omitting them gives a usage error instead of a ``TypeError``.
    """
    import os  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(
        description='Convert compounds and associated information from an '
                    'sdf file into a csv file and generate smiles')
    parser.add_argument('-S', '--sdf', action='store', nargs=1, dest='sdf',
                        required=True,
                        help='File containing compounds (.sdf format)')
    parser.add_argument('-n', '--name', action='store', nargs=1, dest='name',
                        required=True,
                        help='Name of output csv file to write')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1,
                        dest='input', default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1,
                        dest='output', default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # nargs=1 makes every option value a one-element list.
    sdf_path = os.path.join(args['input'][0], args['sdf'][0])
    csv_path = os.path.join(args['output'][0], args['name'][0])

    sdf_df = PandasTools.LoadSDF(sdf_path, smilesName='smiles')
    sdf_df['inchikey'] = [inchi.MolToInchiKey(mol) for mol in sdf_df['ROMol']]
    sdf_df.to_csv(csv_path, index=False)
def fill_base_test(cursor):
    """Fill the test database with molecules, random filler rows and endpoints.

    Reads ``toxicity_85832.csv`` from the current working directory and, via
    ``cursor``, inserts rows into the tables ``molecules``, ``tasks``,
    ``tasks_running``, ``descriptors``, ``descriptors_values``, ``endpoints``
    and ``experimental_data``. Task descriptions, descriptor names/versions,
    task assignments and the ``completed``/``valid`` flags are generated with
    ``random``, so the inserted filler differs between runs unless the
    caller seeds the generator.

    :param cursor: database cursor exposing ``execute`` and ``executemany``
        with ``?`` placeholders
    :return: the same ``cursor`` object
    """
    df = pd.read_csv("toxicity_85832.csv")
    #df = df.drop("Unnamed: 0", axis=1)
    # Column names are captured before "SMILES" is dropped, so this list
    # still contains the SMILES column.
    names_of_columns = list(df.columns)
    smiles = list(df["SMILES"])
    df = df.drop("SMILES", axis=1)
    # Remaining columns hold the endpoint values (NaN-checked further below).
    toxic_vals = np.array(df.values)
    # --- molecules ---
    canonize_smiles = [_canonize_mixture(smile) for smile in smiles]
    inchi_smiles = [
        inchi.MolToInchi(Chem.MolFromSmiles(smile))
        for smile in canonize_smiles
    ]
    inchikey = [
        inchi.MolToInchiKey(Chem.MolFromSmiles(smile))
        for smile in canonize_smiles
    ]
    # NOTE(review): ``ids`` is not used anywhere below.
    ids = [x for x in range(len(canonize_smiles))]
    ziped_vals = zip(inchikey, inchi_smiles, canonize_smiles)
    cursor.executemany(
        """insert into 'molecules' (inchi_key,inchi,canonical_smiles) values (?,?,?)""",
        ziped_vals)
    # --- tasks: 20 random descriptions of length 1..30 ---
    descr_tasks = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for i in range(20)
    ]
    # zip() over a single list yields the 1-tuples executemany expects.
    cursor.executemany("""insert into 'tasks' (descr) values (?)""",
                       zip(descr_tasks))
    # --- tasks_running: 1000 random (task, molecule, completed) rows ---
    completed = [random.randint(0, 1) for i in range(1000)]
    id_tasks = [random.randint(1, len(descr_tasks)) for i in range(1000)]
    id_molecules = [random.randint(1, len(smiles)) for i in range(1000)]
    zip_tasks_running = zip(id_tasks, id_molecules, completed)
    cursor.executemany(
        """insert into 'tasks_running' (id_task, id_molecule, completed) values (?,?,?)""",
        zip_tasks_running)
    # --- descriptors: 10 random (name, version) rows plus one fixed row ---
    name_of_descr = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for i in range(10)
    ]
    name_of_version = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for i in range(10)
    ]
    ziped_versions = zip(name_of_descr, name_of_version)
    cursor.executemany(
        """insert into 'descriptors' (descriptor, version) values (?,?)""",
        ziped_versions)
    cursor.execute(
        """insert into 'descriptors' (descriptor, version) values (?,?)""",
        ("mordred", "0.315"))
    # --- descriptor_values ---
    # NOTE(review): 11 presumably is the row id of the "mordred" descriptor
    # inserted above (10 random rows + 1); that only holds if the table was
    # empty and ids start at 1 - confirm against the schema.
    id_descriptor = [11 for i in range(len(smiles))]
    # Molecule ids are taken to be 1-based positions in insertion order.
    id_molecule = [x + 1 for x in range(len(smiles))]
    id_tasks = [
        random.randint(1, len(descr_tasks)) for i in range(len(smiles))
    ]
    valid = [random.randint(0, 1) for i in range(len(smiles))]
    # ``func`` is defined outside this function; presumably it returns one
    # descriptor value per canonical SMILES - verify.
    value = func(canonize_smiles)
    ziped_descr_vals = zip(id_molecule, id_descriptor, id_tasks, valid, value)
    cursor.executemany(
        """insert into 'descriptors_values' (id_molecule, id_descriptor, id_task, valid, value) values (?,?,?,?,?)""",
        ziped_descr_vals)
    # --- endpoints ---
    # Skips the first column name; presumably that is "SMILES" - verify
    # against the CSV layout.
    features = names_of_columns[1:]
    # Column names are split on "_": field 1 becomes the description and
    # fields 2.. (re-joined with "_") become the type.
    descriptions = [feature.split('_')[1] for feature in features]
    types = ['_'.join(feature.split('_')[2:]) for feature in features]
    ziped_endpoints = zip(descriptions, types)
    cursor.executemany("""insert into 'endpoints' (desc, type) values (?,?)""",
                       ziped_endpoints)
    # --- experimental data: one row per non-NaN cell of the value matrix ---
    ids_molecules = []
    ids_endpoints = []
    values_endpoints = []
    for i in range(len(toxic_vals[:, 0])):
        for j in range(len(toxic_vals[0, :])):
            if (~np.isnan(toxic_vals[i, j])):
                # Row/column indices are shifted to 1-based ids.
                ids_molecules.append(i + 1)
                ids_endpoints.append(j + 1)
                values_endpoints.append(toxic_vals[i, j])
    ziped_experimental_data = zip(ids_molecules, ids_endpoints,
                                  values_endpoints)
    cursor.executemany(
        """insert into 'experimental_data' (id_molecule, id_endpoint, value) values (?,?,?)""",
        ziped_experimental_data)
    return cursor
def calculate_inchi_key(smile):
    """Return the InChIKey of the molecule parsed from the SMILES ``smile``."""
    mol = Chem.MolFromSmiles(smile)
    return inchi.MolToInchiKey(mol)
def loadSDF(sdfPath): # Create images #generateImages(sdfPath) # Create a molecule supplier suppl = Chem.SDMolSupplier(sdfPath) # Filter empty entries sdf = [x for x in suppl if x is not None] # For each molecule in supplier for mol in sdf: data = {} try: data['fCharge'] = mol.GetProp('Charge') except: data['fCharge'] = Chem.GetFormalCharge(mol) try: data['name'] = mol.GetProp('DATABASE_ID') except: data['name'] = 'unkown' try: data['molMass'] = mol.GetProp('Total Molweight') except: data['molMass'] = Descriptors.ExactMolWt(mol) try: data['cLogP'] = mol.GetProp('cLogP') except: data['cLogP'] = Crippen.MolLogP(mol) # não sei se ta certo try: data['cLogS'] = mol.GetProp('cLogS') except: data['cLogS'] = 0.0 try: data['tpsa'] = mol.GetProp('Polar Surface Area') except: data['tpsa'] = rdMolDescriptors.CalcTPSA(mol) try: data['totalSurfaceArea'] = mol.GetProp('Total Surface Area') except: data['totalSurfaceArea'] = rdMolDescriptors.CalcTPSA(mol) try: data['hbondAcceptors'] = mol.GetProp('H-Acceptors') except: data['hbondAcceptors'] = rdMolDescriptors.CalcNumHBA(mol) try: data['hbondDonnors'] = mol.GetProp('H-Donors') except: data['hbondDonnors'] = rdMolDescriptors.CalcNumHBD(mol) try: data['rotable'] = mol.GetProp('Rotatable Bonds') except: data['rotable'] = rdMolDescriptors.CalcNumRotatableBonds(mol) try: data['mutagenic'] = mol.GetProp('Mutagenic') except: data['mutagenic'] = 'Unknown' try: data['tumorigenic'] = mol.GetProp('Tumorigenic') except: data['tumorigenic'] = 'Unknown' try: data['irritant'] = mol.GetProp('Irritant') except: data['irritant'] = 'Unkown' try: data['smiles'] = mol.GetProp('SMILES') except: data['smiles'] = Chem.MolToSmiles(mol) try: data['InChI'] = mol.GetProp('INCHI_IDENTIFIER') except: data['InChI'] = inchi.MolToInchi(mol) try: data['inchiKey'] = mol.GetProp('INCHI_KEY') except: data['inchiKey'] = inchi.MolToInchiKey(mol) try: data['nonHAtoms'] = mol.GetProp('Non-H Atoms') except: data['nonHAtoms'] = -1 # Não sei calcular try: data['numAtoms'] = 
mol.GetProp('numAtoms') except: data['numAtoms'] = mol.GetNumAtoms() try: data['stereoCenters'] = mol.GetProp('Stereo Centers') except: data['stereoCenters'] = mol.GetNumAtoms() try: data['provider'] = mol.GetProp('DATABASE_NAME') except: print("Nenhum fornecedor encontrado, o campo é obrigatório!") continue tmp = AllChem.Compute2DCoords(mol) # Compute its coordinates Draw.MolToFile(mol, os.path.join(settings.FILES_DIR, f'molImages/' + data["inchiKey"] + '.png'), size=(300,300), kekulize=True, wedgeBonds=True, fitImage=True) # Save it Draw.MolToFile(mol, os.path.join(settings.FILES_DIR, f'molThumbs/' + data["inchiKey"] + '.png'), size=(150,150), kekulize=True, wedgeBonds=True, fitImage=True) feedDatabase(data) if Compounds.objects.filter(inChIKey=data['inchiKey']).exists(): if not Compounds.objects.filter(provider=['provider']).exists(): feedDatabase(data) print("feed1") # append no sdf da base de dados a = 1 else: print("continue123") continue else: a = 1 feedDatabase(data) print("feed2") '''except:
def one_species_confs(molecule, log, other_props, max_confs, forcefield,
                      nconf_gen, e_window, rms_tol, prun_tol, job_dir,
                      log_file, rep_e_window, fallback_to_align, temp,
                      clean_up, start_time):
    """Run the conformer workflow for a single species.

    Steps, in order: write a header to ``log``; switch the force field to
    ``'uff'`` when the input contains any entry of ``UFF_ELEMENTS``; call
    ``minimize``; cluster the result; write every clustered conformer and
    pass it through ``run_obabel``; recluster; rename the xyz files; write
    the timing summary; finally collect everything with ``parse_results``.

    :param molecule: SMILES string of the species
    :param log: path of the log file, opened for writing (overwritten)
    :param forcefield: force-field name passed to ``minimize``; replaced by
        ``'uff'`` when a ``UFF_ELEMENTS`` entry occurs in ``molecule``
    :param max_confs: converted with ``int`` for (re)clustering
    :param rms_tol, e_window, rep_e_window: converted with ``float`` for
        (re)clustering
    :param job_dir: directory handed to ``write_clusters``, ``recluster``,
        ``rename_xyz_files`` and ``parse_results``
    :param start_time: forwarded to ``summarize``
    :return: the value returned by ``parse_results``

    The remaining parameters are forwarded unchanged to the helpers named
    in the body.
    """
    # Keep the original SMILES: ``molecule`` is rebound inside the loop below.
    smiles = copy.deepcopy(molecule)

    with open(log, "w") as output:
        output.write("The smiles strings that will be run are:\n")
        output.write("\n".join([molecule]) + "\n")

        # Substring test of each UFF_ELEMENTS entry against the SMILES text.
        if any([element in molecule for element in UFF_ELEMENTS]):
            output.write(("Switching to UFF, since MMFF94 does "
                          "not have boron and/or aluminum\n"))
            forcefield = 'uff'

        confgen, gen_time, min_time = minimize(output=output,
                                               molecule=molecule,
                                               forcefield=forcefield,
                                               nconf_gen=nconf_gen,
                                               prun_tol=prun_tol,
                                               e_window=e_window,
                                               rms_tol=rms_tol,
                                               rep_e_window=rep_e_window)
        clustered_confs = confgen.cluster(rms_tolerance=float(rms_tol),
                                          max_ranked_conformers=int(max_confs),
                                          energy_window=float(e_window),
                                          Report_e_tol=float(rep_e_window),
                                          output=output)

        # Timestamp taken right after clustering; consumed by ``summarize``.
        cluster_time = time.time()
        inchikey = inchi.MolToInchiKey(get_mol(molecule))

        for i, conformer in enumerate(clustered_confs):
            write_clusters(output=output,
                           idx=i,
                           conformer=conformer,
                           inchikey=inchikey,
                           path=job_dir)
            # Rebinds ``molecule``; after the loop it holds the result for
            # the last conformer (or the input SMILES if there were none).
            molecule = run_obabel(inchikey=inchikey, idx=i)

        confgen.recluster(path=job_dir,
                          rms_tolerance=float(rms_tol),
                          max_ranked_conformers=int(max_confs),
                          energy_window=float(e_window),
                          output=output,
                          clustered_confs=clustered_confs,
                          molecule=molecule,
                          key=inchikey,
                          fallback_to_align=fallback_to_align)
        rename_xyz_files(path=job_dir)
        summarize(output=output,
                  gen_time=gen_time,
                  start_time=start_time,
                  min_time=min_time,
                  cluster_time=cluster_time)

    # Uses the untouched ``smiles`` copy, not the rebound ``molecule``.
    conf_dic = parse_results(job_dir=job_dir,
                             log_file=log_file,
                             inchikey=inchikey,
                             max_confs=max_confs,
                             other_props=other_props,
                             temp=temp,
                             smiles=smiles,
                             clean_up=clean_up)
    return conf_dic
def get_inchi_key(mol, stereo):
    """Return the InChIKey of ``mol``, optionally without its middle block.

    :param mol: molecule passed to ``inchi.MolToInchiKey``
    :param stereo: when truthy the full key is returned; otherwise the key is
        split on ``'-'`` and only the first and third parts are kept
    :return: the (possibly shortened) key string
    """
    full_key = inchi.MolToInchiKey(mol)
    if stereo:
        return full_key

    parts = full_key.split('-')
    # remove middle part responsible for stereo and isotopes
    return parts[0] + '-' + parts[2]
converted_not_match_file = open( "out/inchi_valid_check/converted_not_match.txt", mode="w", encoding="utf-8") w = SDWriter("out/inchi_valid_check/converted.sdf") np: Unique_NP for np in repo.get_unique_stream(): mol = Mol() try: mol = Chem.MolFromInchi(inchi=np.inchi, treatWarningAsError=True) mol.SetProp("coconut_id", np.coconut_id) except: not_converted += 1 if mol: mol_inchikey = inchi.MolToInchiKey(mol) if np.inchikey == mol_inchikey: converted_list_file.write(np.inchi + "\n") w.write(mol) converted += 1 else: converted_not_match_file.write(np.inchi + "\n") not_converted += 1 i += 1 del np if i % 1000 == 0: print("{}th checked".format(i)) # if i > 5: