def main():
    """Re-key a SMILES-indexed JSON file by regenerated SMILES and save it.

    Command-line options:
        --scratch DIR  output directory (default ``_tmp_``)
        --json FILE    input JSON whose keys are SMILES strings
        -j/--procs N   parsed but not used by this function

    Each key is parsed with ``cheminfo.smiles_to_molobj`` and written back
    with ``cheminfo.molobj_to_smiles(..., remove_hs=True)``. Entries are
    dropped (with a printed message) when the key cannot be parsed, when the
    regenerated SMILES contains ``.`` (more than one fragment), or when
    ``is_mol_allowed`` rejects the atom list. If two keys map to the same
    regenerated SMILES, the later one overwrites the earlier.

    The result is written twice under ``<scratch>/molecule_data``: once with
    ``misc.save_json`` and once with ``misc.save_obj``.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--scratch',
        action='store',
        help='directory the molecule_data output is written to',
        metavar="DIR",
        default="_tmp_",
    )
    parser.add_argument(
        '--json',
        action='store',
        help='input JSON file keyed by SMILES',
        metavar="FILE",
    )
    parser.add_argument(
        '-j',
        '--procs',
        action='store',
        help='parallelize',
        metavar="int",
        default=0,
        type=int,
    )
    args = parser.parse_args()

    # os.path.join copes with a missing or present trailing slash and with an
    # empty --scratch, where indexing the last character would raise.
    output_path = os.path.join(args.scratch, "molecule_data")

    data = misc.load_json(args.json)

    canonical_data = {}
    # Snapshot the keys so the loop is independent of the mapping type.
    for key in list(data.keys()):
        molobj, status = cheminfo.smiles_to_molobj(key)
        if molobj is None:
            print("error none mol:", key)
            continue

        smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
        if "." in smiles:
            # A dot separates disconnected fragments in SMILES.
            print("error multi mol:", smiles)
            continue

        atoms = cheminfo.molobj_to_atoms(molobj)
        if not is_mol_allowed(atoms):
            print("error heavy mol:", smiles)
            continue

        canonical_data[smiles] = data[key]

    misc.save_json(output_path, canonical_data)
    misc.save_obj(output_path, canonical_data)

    return
def clean_data(listdata):
    """Group float values by regenerated SMILES and print atom statistics.

    Args:
        listdata: iterable of indexable rows. ``row[1]`` is read as a SMILES
            string and ``row[3]`` as a value convertible with ``float``.

    Returns:
        dict mapping the SMILES produced by
        ``cheminfo.molobj_to_smiles(molobj, remove_hs=True)`` to the list of
        float values seen for it, in input order.

    Rows whose SMILES cannot be parsed are reported with ``print`` and
    skipped; rows rejected by ``is_mol_allowed`` are skipped silently. After
    the loop, every atom type of the kept molecules is printed with its
    count, followed by the number of distinct molecules.
    """
    data = {}
    atom_types = []

    for row in listdata:
        smi = row[1]
        value = float(row[3])

        molobj, status = cheminfo.smiles_to_molobj(smi)
        if molobj is None:
            print("error:", smi)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
        atoms = cheminfo.molobj_to_atoms(molobj)

        # filter for organic chemistry
        if not is_mol_allowed(atoms):
            continue

        atom_types.extend(atoms)
        data.setdefault(smi, []).append(value)

    # np.unique returns the distinct atom types (sorted) with their counts.
    atom_types, counts = np.unique(atom_types, return_counts=True)
    for atom, count in zip(atom_types, counts):
        print(atom, count)

    print("Total molecules", len(data))

    return data
def test_dot():
    """Print a fingerprint bitmap, then its element sum and self dot product.

    The bitmap comes from ``fp_to_bitmap(get_rdkitfp(...))`` for the SMILES
    ``Oc1ccccc1``. Presumably the bitmap holds only 0/1 entries, in which
    case the two printed numbers are equal -- confirm against
    ``fp_to_bitmap``.
    """
    molobj, _ = cheminfo.smiles_to_molobj("Oc1ccccc1")
    bitmap = fp_to_bitmap(get_rdkitfp(molobj))
    print(list(bitmap))

    bits = np.array(bitmap, dtype=int)
    n_set = np.sum(bits)
    self_dot = np.dot(bits, bits)
    print(n_set, self_dot)

    return
def filter_dict(molecules):
    """Remove entries failing ``filter_molobj`` or ``filter_value``, in place.

    Args:
        molecules: dict keyed by SMILES. It is mutated and also returned.

    Returns:
        The same ``molecules`` object after deletions.

    Keys that ``cheminfo.smiles_to_molobj`` cannot parse are left untouched.
    Each removed entry is printed, and the largest atom count among the
    surviving entries is printed at the end.
    """
    largest = 0

    # Iterate over a snapshot because entries are deleted during the loop.
    for smiles in tuple(molecules.keys()):
        molobj, _ = cheminfo.smiles_to_molobj(smiles)
        if molobj is None:
            continue

        passed = filter_molobj(molobj)
        if not passed:
            del molecules[smiles]
            print(smiles, passed)
            continue

        passed = filter_value(molecules[smiles])
        if not passed:
            # Print before deleting so the rejected value is still available.
            print(passed, smiles, molecules[smiles])
            del molecules[smiles]
            continue

        # Report
        n_atoms = len(cheminfo.molobj_to_atoms(molobj))
        largest = max(largest, n_atoms)

    print("max atoms: ", largest)

    return molecules
def clean_data(df, scratch):
    """Group shifted ``mpC`` values by regenerated SMILES and save them.

    Args:
        df: pandas DataFrame with ``smiles`` and ``mpC`` columns.
        scratch: path prefix for the output. It is concatenated directly
            with ``"molecule_data"``, so it must already end with a path
            separator.

    Returns:
        None. The grouped dict is written with ``misc.save_obj`` and
        ``misc.save_json``.

    Each row's ``mpC`` has 273.15 added (presumably a Celsius-to-Kelvin
    conversion of a melting point -- confirm against the data source). Rows
    whose SMILES cannot be parsed are printed and skipped. Atom types of all
    kept molecules are counted and printed.
    """
    data = {}
    atom_types = []

    for _, row in df.iterrows():
        smi = row.smiles
        value = row.mpC + 273.15

        molobj, status = cheminfo.smiles_to_molobj(smi)
        if molobj is None:
            print("error:", smi)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # Atoms
        atom_types.extend(cheminfo.molobj_to_atoms(molobj))

        data.setdefault(smi, []).append(value)

    # np.unique returns the distinct atom types (sorted) with their counts.
    atom_types, counts = np.unique(atom_types, return_counts=True)
    for atom, count in zip(atom_types, counts):
        print(atom, count)

    misc.save_obj(scratch + "molecule_data", data)
    misc.save_json(scratch + "molecule_data", data)

    return
def test_kernel():
    """Time two Jaccard kernel implementations on fingerprints from an SDF.

    Reads up to 5000 molecules from ``_tmp_bing_bp_/structures.sdf.gz``,
    converts them with ``molobjs_to_fps`` and prints the elapsed time and
    result of ``bitmap_jaccard_kernel`` (label ``pykernel``) and of
    ``bitmap_kernels.symmetric_jaccard_kernel`` (label ``fokernel``).
    """
    import itertools

    n_molecules = 5000

    molobjs = cheminfo.read_sdffile("_tmp_bing_bp_/structures.sdf.gz")
    # islice stops quietly when the file holds fewer records, whereas calling
    # next() inside a list comprehension would leak a bare StopIteration.
    molobjs = list(itertools.islice(molobjs, n_molecules))

    init = time.time()
    vectors = molobjs_to_fps(molobjs)
    print("init", time.time() - init)

    time_pykernel = time.time()
    kernel = bitmap_jaccard_kernel(vectors)
    print("pykernel", time.time() - time_pykernel)
    print(kernel)

    # Drop the first kernel before building the second one.
    del kernel

    n_items = vectors.shape[0]

    # The second implementation is given the transposed fingerprints as an
    # integer array.
    vectors = np.array(vectors.T, dtype=int)

    time_fkernel = time.time()
    kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, vectors)
    print("fokernel", time.time() - time_fkernel)
    print(kernel)

    return
def prepare_training_data_protonafinity():
    """Build representations for neutral/protonated molecule pairs.

    Reads ``data/dataset-proton-affinity/data/pm3_properties.csv`` and, for
    every row, loads the neutral geometry ``xyz{MoleculeIdx}_n.xyz`` and the
    protonated geometry ``xyz{MoleculeIdx}_{ProtonatedIdx}.xyz`` from the
    ``pm3.cosmo.mop/`` sub-directory. Each geometry is passed to
    ``generate_fchl_acsf`` with a fixed parameter set.

    In the protonated state the atom carrying a positive formal charge
    (located from the ``ProtonatedSmiles`` column) is relabelled as element
    12 before the representation is generated. Element 12 is part of the
    ``elements`` parameter, presumably as a marker for the protonation site.

    Returns:
        tuple of
        ``(n_representations, p_representations, n_coord_list, p_coord_list,
        n_atoms_list, p_atoms_list, energies)``. The two representation
        entries are numpy arrays with one item per CSV row; ``energies`` is a
        numpy array built from the ``NeutralEnergy`` and ``ProtonatedEnergy``
        columns, in that order.

    Raises:
        AssertionError: if a protonated SMILES cannot be parsed, or does not
            contain exactly one positively charged atom.
    """
    distance_cut = 20.0

    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": [1, 6, 7, 8, 9, 12],
    }

    dirprefix = "data/dataset-proton-affinity/data/"
    xyz_dir = dirprefix + "pm3.cosmo.mop/"
    filename = dirprefix + "pm3_properties.csv"

    df = pd.read_csv(filename, sep=",")
    n_rows = df.shape[0]

    # column names
    col_neuidx = "MoleculeIdx"
    col_proidx = "ProtonatedIdx"
    col_prosmi = "ProtonatedSmiles"
    col_neueng = "NeutralEnergy"
    col_proeng = "ProtonatedEnergy"

    # Collect energies
    energies = np.array([df[col_neueng], df[col_proeng]])

    # Protonated representation
    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    # Neutral representation
    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    progress = tqdm.tqdm(
        df.iterrows(),
        desc="Preparing FCHL19",
        total=n_rows,
        **TQDM_OPTIONS
    )

    for _, row in progress:

        nidx = row[col_neuidx]
        pidx = row[col_proidx]

        nname = f"xyz{nidx}_n.xyz"
        pname = f"xyz{nidx}_{pidx}.xyz"

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(xyz_dir + nname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]
        n_representation = generate_fchl_acsf(atoms, coord, **parameters)

        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(xyz_dir + pname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Find protonated atom
        smiles = row[col_prosmi]
        parsed = cheminfo.smiles_to_molobj(smiles)
        # Other callers in this code base unpack (molobj, status) from
        # smiles_to_molobj; accept that shape as well as a bare molobj.
        molobj = parsed[0] if isinstance(parsed, tuple) else parsed
        assert molobj is not None, f"Molobj failed for {smiles}"

        atom_charges = np.array(
            [atom.GetFormalCharge() for atom in molobj.GetAtoms()]
        )

        charged, = np.where(atom_charges > 0)
        assert len(charged) == 1, f"Should only be one charged atom in {pname}"
        charged_idx = charged[0]

        # Relabel the charged atom as element 12.
        # NOTE(review): this assumes the SMILES atom order matches the atom
        # order in the xyz file -- confirm against how the data was generated.
        atoms[charged_idx] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)

        # Store the protonated representation, not the neutral one.
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    return (
        n_representations,
        p_representations,
        n_coord_list,
        p_coord_list,
        n_atoms_list,
        p_atoms_list,
        energies,
    )
def main():
    """Print several similarity scores for one fixed pair of molecules.

    Printed in order: the first molecule's bitmap, a blank line, then for
    the ``get_rdkitfp`` fingerprints the RDKit ``FingerprintSimilarity``,
    ``jaccard_index`` and ``dice_coefficient``; a blank line; then for
    feature Morgan bit vectors (radius 2, 5120 bits) ``jaccard_index`` and
    ``FingerprintSimilarity``; finally ``DiceSimilarity`` on the
    ``get_morgan`` fingerprints.
    """
    # Other pairs tried during development:
    #   'c1ccccn1' / 'c1ccco1'  and  'CCO' / 'CCN'
    pair = ('Oc1ccccc1', 'Nc1ccccc1')
    mol_a, mol_b = [cheminfo.smiles_to_molobj(smi)[0] for smi in pair]

    # RDKit fingerprints, compared via RDKit and via the bitmap helpers.
    rdk_a, rdk_b = [get_rdkitfp(mol) for mol in (mol_a, mol_b)]
    bits_a, bits_b = [fp_to_bitmap(fp) for fp in (rdk_a, rdk_b)]

    print(bits_a)
    print()

    print(rdkit.DataStructs.FingerprintSimilarity(rdk_a, rdk_b))
    print(jaccard_index(bits_a, bits_b))
    print(dice_coefficient(bits_a, bits_b))
    print()

    # Feature Morgan bit vectors.
    morgan_options = dict(nBits=5 * 1024, useFeatures=True)
    bitvect_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, 2, **morgan_options)
    bitvect_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, 2, **morgan_options)
    bits_a = fp_to_bitmap(bitvect_a)
    bits_b = fp_to_bitmap(bitvect_b)

    print(jaccard_index(bits_a, bits_b))
    print(rdkit.DataStructs.FingerprintSimilarity(bitvect_a, bitvect_b))

    # Fingerprints from get_morgan, compared with Dice similarity.
    morgan_a = get_morgan(mol_a)
    morgan_b = get_morgan(mol_b)
    print(AllChem.DataStructs.DiceSimilarity(morgan_a, morgan_b))

    return
def parse_molandprop(*args, debug=False, **kwargs):
    """Validate one (molecule, properties) record and return it optimised.

    Accepts either ``parse_molandprop(molobj, props)`` or a single
    ``(molobj, props)`` tuple, which suits map-style callers.

    Args:
        *args: ``molobj, props`` or ``(molobj, props)``. ``props`` is a dict
            that must hold a ``SMILES`` entry, a key containing
            ``{measured}``, a key containing ``measured, converted``, and a
            key containing both ``UNIT`` and ``Point``.
        debug: when true, print why a record was ignored for the multi
            fragment and missing-key cases.
        **kwargs: accepted and ignored.

    Returns:
        ``(molobj, value)`` where ``molobj`` has hydrogens added and has been
        optimised, and ``value`` is the converted measurement (273.15 is
        added when the unit is ``Celsius``; ``K`` is passed through).
        ``(None, None)`` when the record is rejected: missing molecule,
        missing ``SMILES`` or measurement keys, a ``.`` in the SMILES, a
        ``<`` or ``>`` in the raw measurement, an unknown unit, or an
        optimisation that still reports status 5 after rebuilding the
        molecule from its SMILES.

    The property checks run before the structure optimisation so rejected
    records skip the hydrogen addition and optimisation calls.
    """
    rejected = (None, None)

    if len(args) > 1:
        molobj, props = args[0], args[1]
    else:
        molobj, props = args[0]

    if molobj is None:
        return rejected

    keys = list(props.keys())

    if "SMILES" not in keys:
        return rejected

    prop_smiles = props["SMILES"]

    # Ignore multi molecules
    if "." in prop_smiles:
        if debug:
            print(f"ignore: {prop_smiles}")
        return rejected

    # Locate the measurement keys. A record lacking any of them is rejected
    # like every other malformed record instead of raising IndexError.
    ref_keys = [key for key in keys if "{measured}" in key]
    value_keys = [key for key in keys if "measured, converted" in key]
    unit_keys = [key for key in keys if "UNIT" in key and "Point" in key]

    if not (ref_keys and value_keys and unit_keys):
        if debug:
            print("ignore missing property", props)
        return rejected

    # Raw measurements given as bounds ("<x" or ">x") are not usable values.
    raw_value = str(props[ref_keys[0]])
    if "<" in raw_value or ">" in raw_value:
        return rejected

    prop_unit = props[unit_keys[0]]
    prop_value = props[value_keys[0]]

    if prop_unit == "Celsius":
        prop_value += 273.15
    elif prop_unit != "K":
        print("error unknown unit", prop_unit, props)
        return rejected

    # Add hydrogens and optimize structure
    molobj = cheminfo.molobj_add_hydrogens(molobj)
    status = cheminfo.molobj_optimize(molobj)

    # Status 5 is treated as "unconverged": rebuild the molecule from its
    # SMILES and try once more before giving up.
    if status == 5:
        molobj, status = cheminfo.smiles_to_molobj(prop_smiles)

        if molobj is None:
            print("error", props)
            return rejected

        molobj = cheminfo.molobj_add_hydrogens(molobj)
        status = cheminfo.molobj_optimize(molobj)

        if status == 5:
            print("error", props)
            return rejected

    return molobj, prop_value