def test_rmsd_xyz():
    """Check ``rmsd.rmsd`` on two ethane xyz files from the test resources.

    The coordinates are compared as read from disk; no alignment step is
    performed in this test.
    """
    reference_path = pathlib.PurePath(RESOURCE_PATH, "ethane.xyz")
    target_path = pathlib.PurePath(RESOURCE_PATH, "ethane_mini.xyz")

    # Only the coordinates enter the RMSD; the atom labels are not needed.
    _, reference_coord = rmsd.get_coordinates_xyz(reference_path)
    _, target_coord = rmsd.get_coordinates_xyz(target_path)

    np.testing.assert_almost_equal(
        0.33512,
        rmsd.rmsd(reference_coord, target_coord),
        decimal=3,
    )
def load_data(
    data_dir="data/xyz/",
    ref_file="data/qm9-reference.csv",
    offset=0,
    query_size=100,
):
    """Load a window of molecules listed in a reference CSV.

    Inputs:
        data_dir (str): Directory holding one ``<name>.xyz`` file per molecule
        ref_file (str): CSV file with a ``name`` column
        offset (int): Number of data rows to skip before the query window
        query_size (int): The number of rows to return

    Returns:
        atoms_list: List of chemical species for each molecule in query
        coord_list: List of species coordinates for each molecule in query
        charges: List of molecular charges, one per molecule (always 0 here)
        filenames: The ``name`` column of the selected reference rows
        reference: The selected reference rows as a DataFrame
    """
    # File line 0 is the CSV header and data rows start at line 1, so
    # skipping `offset` data rows means skipping lines 1..offset inclusive.
    # (range(1, offset) skipped one row too few, making offset=0 and
    # offset=1 return the same window.)
    reference = pd.read_csv(
        ref_file,
        skiprows=range(1, offset + 1),
        nrows=query_size,
    )

    atoms_list, coord_list, charges = [], [], []
    filenames = reference["name"]

    for name in filenames:
        xyz_path = os.path.join(data_dir, f"{name}.xyz")
        atoms, coords = rmsd.get_coordinates_xyz(xyz_path)

        charges.append(0)
        atoms_list.append(atoms)
        coord_list.append(coords)

    return atoms_list, coord_list, charges, filenames, reference
def parse_xyz(filename):
    """Read the xyz file ``filename`` and return ``get_inertia`` of its contents.

    The atoms and coordinates returned by ``rmsd.get_coordinates_xyz`` are
    passed straight through to ``get_inertia``.
    """
    return get_inertia(*rmsd.get_coordinates_xyz(filename))
def load_data():
    """Load a fixed 100-molecule window of the QM9 reference set.

    Reads ``../dataset-qm9/reference.csv``, keeps the rows at zero-based
    positions 110 to 209, and parses the matching
    ``../dataset-qm9/xyz/<name>.xyz`` file for each of them.

    Returns:
        atoms_list: Atom labels for each selected molecule
        coord_list: Coordinates for each selected molecule
        charges: Molecular charge for each selected molecule (always 0)
        titles: Names of the selected molecules (list of the ``name`` column)
        reference: The selected reference rows as a DataFrame
    """
    first_row = 10 + 100
    last_row = 110 + 100

    reference = pd.read_csv("../dataset-qm9/reference.csv")

    # Select the window before touching the xyz files, so only the
    # molecules that are actually returned get parsed instead of the whole
    # dataset.
    reference = reference[first_row:last_row]

    titles = list(reference["name"])
    charges = [0] * len(titles)

    atoms_list = []
    coord_list = []
    for title in titles:
        atoms, coord = rmsd.get_coordinates_xyz("../dataset-qm9/xyz/" + title + ".xyz")
        atoms_list.append(atoms)
        coord_list.append(coord)

    return atoms_list, coord_list, charges, titles, reference
def test_get_coordinates_xyz():
    """Check the first atom label and coordinate row parsed from ethane.xyz."""
    xyz_path = pathlib.PurePath(RESOURCE_PATH, "ethane.xyz")
    symbols, positions = rmsd.get_coordinates_xyz(xyz_path)

    first_symbol = symbols[0]
    first_position = positions[0].tolist()

    assert "C" == first_symbol
    assert [-0.98353, 1.81095, -0.0314] == first_position
def main():
    """Command-line entry point: run ``constant_energy`` on one molecule.

    ``--mol FILE`` selects an xyz file whose atom labels are converted with
    ``cheminfo.convert``. Without it, a built-in nine-atom geometry (atomic
    numbers 6, 6, 8 and six 1s) is used. ``--model`` names the calculator
    requested from ``get_calculator`` (default ``"ethanol"``).
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--mol',
        action='store',
        default=None,
        help='Load molecule for live simulation',
        metavar="FILE",
    )
    cli.add_argument('--model', action='store', default="ethanol", help='')
    args = cli.parse_args()

    if args.mol is not None:
        # User-supplied geometry: translate atom labels via cheminfo.convert.
        symbols, coordinates = rmsd.get_coordinates_xyz(args.mol)
        nuclear_charges = [cheminfo.convert(symbol) for symbol in symbols]
    else:
        # Built-in fallback geometry.
        nuclear_charges = np.array([6, 6, 8, 1, 1, 1, 1, 1, 1])
        coordinates = np.array(
            [
                [0.07230959, 0.61441211, -0.03115568],
                [-1.26644639, -0.27012846, -0.00720771],
                [1.11516977, -0.30732869, 0.06414394],
                [0.10673943, 1.44346835, -0.79573006],
                [-0.02687486, 1.19350887, 0.98075343],
                [-2.06614011, 0.38757505, 0.39276693],
                [-1.68213881, -0.60620688, -0.97804526],
                [-1.18668224, -1.07395366, 0.67075071],
                [1.37492532, -0.56618891, -0.83172035],
            ]
        )

    constant_energy(
        nuclear_charges,
        coordinates,
        calculator=get_calculator(args.model),
    )

    return
def prepare_xyz(filename, charge, header):
    """Build an input text from an xyz file.

    Args:
        filename: Path of the xyz file to read.
        charge: Value substituted into ``header`` via ``header.format(charge)``.
        header: Format string placed before the atom lines.

    Returns:
        The formatted header followed by the output of ``prepare_atoms`` for
        the atoms and coordinates read from ``filename``.
    """
    # Read the file the caller asked for; the hard-coded "test.xyz" that was
    # here before ignored the ``filename`` argument entirely.
    atoms, coordinates = rmsd.get_coordinates_xyz(filename)
    lines = prepare_atoms(atoms, coordinates)

    return header.format(charge) + lines
def test_reorder_qml():
    """Check that ``reorder_similarity`` recovers a shuffled, rotated molecule.

    The molecule is copied, its atoms are shuffled with a fixed seed and the
    copy is rotated. The Hungarian reordering is expected to give a large
    RMSD, while the similarity-based reordering must restore the original
    atom order and an RMSD of zero.
    """
    filename_1 = pathlib.PurePath(RESOURCE_PATH, "CHEMBL3039407.xyz")
    p_atoms, p_coord = rmsd.get_coordinates_xyz(filename_1)

    # Reorder atoms (fixed seed keeps the permutation reproducible)
    n_atoms = len(p_atoms)
    random_reorder = np.arange(n_atoms, dtype=int)
    np.random.seed(5)
    np.random.shuffle(random_reorder)

    # Index-array selection already returns new arrays, so the originals
    # stay untouched.
    q_atoms = p_atoms[random_reorder]
    q_coord = p_coord[random_reorder]

    # Mess up the distance matrix by rotating the molecule.
    # NOTE(review): np.cos/np.sin take radians, so this is a rotation by
    # 180 rad, not 180 degrees. Left as is because the thresholds below were
    # established with this value.
    theta = 180.0
    rotation_y = np.array(
        [
            [np.cos(theta), 0, np.sin(theta)],
            [0, 1, 0],
            [-np.sin(theta), 0, np.cos(theta)],
        ]
    )
    q_coord = np.dot(q_coord, rotation_y)

    # Reorder with standard hungarian, this will fail reorder and give large
    # RMSD
    view_dist = rmsd.reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord)
    q_atoms_dist = q_atoms[view_dist]
    q_coord_dist = q_coord[view_dist]
    _rmsd_dist = rmsd.kabsch_rmsd(p_coord, q_coord_dist)
    assert q_atoms_dist.tolist() == p_atoms.tolist()
    assert _rmsd_dist > 3.0

    # Reorder based on chemical similarity
    view = rmsd.reorder_similarity(p_atoms, q_atoms, p_coord, q_coord)
    q_atoms = q_atoms[view]
    q_coord = q_coord[view]

    # Calculate new RMSD with correct atom order
    _rmsd = rmsd.kabsch_rmsd(p_coord, q_coord)

    # Assert correct atom order
    assert q_atoms.tolist() == p_atoms.tolist()

    # Assert this is the same molecule. The comparison used to be a bare
    # expression without ``assert`` and therefore never checked anything.
    # An explicit absolute tolerance absorbs floating-point noise from the
    # rotation and the Kabsch fit.
    assert _rmsd == pytest.approx(0.0, abs=1e-6)
def main():
    """Run a BFGS optimisation of the example ethanol geometry.

    Uses the ``"_deploy_"`` calculator, iterates until ``fmax=0.3`` and
    writes the resulting structure to ``_tmp_molecule_optimize.xyz``.
    """
    calc = calculators.get_calculator("_deploy_", debug=False)

    symbols, positions = rmsd.get_coordinates_xyz("examples/ethanol.xyz")
    ethanol = ase.Atoms(symbols, positions)
    ethanol.set_calculator(calc)

    BFGS(ethanol).run(fmax=0.3)

    dump_xyz(ethanol, "_tmp_molecule_optimize.xyz")

    return
def main_md():
    """Print the energy of the example ethanol geometry, then optimise it.

    Uses the ``"_deploy_"`` calculator. After printing the potential energy
    of the input structure, a BFGS optimisation is run until ``fmax=0.5``
    and the result is written to ``_tmp_molecule_optimize.xyz``.
    """
    calc = calculators.get_calculator("_deploy_", debug=False)

    symbols, positions = rmsd.get_coordinates_xyz("examples/ethanol.xyz")
    ethanol = ase.Atoms(symbols, positions)
    ethanol.set_calculator(calc)

    # Energy of the unoptimised input structure.
    print(ethanol.get_potential_energy())

    BFGS(ethanol).run(fmax=0.5)

    dump_xyz(ethanol, "_tmp_molecule_optimize.xyz")

    return
def main():
    """Predict energies and forces with a previously dumped model.

    NOTE(review): the function currently begins with a hard-coded butane run
    that prints one energy/force pair and then exits the interpreter. The
    command-line driven prediction path below it is therefore never reached.
    The short-circuit is kept because its intent is unknown; remove it to
    enable the CLI.

    CLI path (currently unreachable):
        -f/--filename: text file listing molecule names, one per line. The
            ``<name>.xyz`` files are looked up next to this list file.
        -m/--model: prefix of the ``.parameters.npy``, ``.representations.npy``,
            ``.displaced_representations.npy`` and ``.alphas.npy`` files.
    """
    import argparse
    import os
    import sys

    # --- debug short-circuit -------------------------------------------------
    read_model("data/butane")
    atoms, coordinates = rmsd.get_coordinates_xyz("data/butane/butane-1.xyz")
    energy, force = calculate(atoms, coordinates)
    print(energy)
    print(force)
    # sys.exit() instead of quit(): quit() is a helper injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    sys.exit()
    # -------------------------------------------------------------------------

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', action='store', help='List of molecules', metavar='listfile')
    parser.add_argument('-m', '--model', action='store', help='Output model in npy format', metavar='file')
    args = parser.parse_args()

    # Load model.
    # The parameters file holds a pickled dict (0-d object array), which
    # np.load refuses to read unless allow_pickle=True. Unpickling can run
    # arbitrary code, so only load model files from a trusted source.
    parameters = np.load(args.model + ".parameters.npy", allow_pickle=True).item()
    train_representations = np.load(args.model + ".representations.npy")
    train_displaced_representations = np.load(args.model + ".displaced_representations.npy")
    train_alphas = np.load(args.model + ".alphas.npy")

    # Get molecule filenames
    with open(args.filename, 'r') as f:
        molecules = [line.strip() for line in f]

    # The xyz files live next to the list file. os.path.dirname yields ""
    # for a bare filename, so the lookup stays relative to the working
    # directory instead of jumping to "/".
    directory = os.path.dirname(args.filename)

    # Hyper parameters stored with the model
    cut_distance = parameters.get('cut_distance')
    kernel_args = parameters.get('kernel_args')
    dx = parameters.get('dx')
    nmax = parameters.get('max_atoms')

    list_rep = []
    list_disp_rep = []

    # read coordinates
    for filename in molecules:
        atoms, coordinates = rmsd.get_coordinates_xyz(os.path.join(directory, filename + ".xyz"))
        charges = [NUCLEAR_CHARGE[atom] for atom in atoms]

        rep = generate_representation(
            coordinates, charges, max_size=nmax, cut_distance=cut_distance)
        disp_rep = generate_displaced_representations(
            coordinates, charges, max_size=nmax, cut_distance=cut_distance, dx=dx)

        list_rep.append(rep)
        list_disp_rep.append(disp_rep)

        # NOTE(review): only the first listed molecule is processed.
        break

    list_rep = np.array(list_rep)
    list_disp_rep = np.array(list_disp_rep)

    # generate kernel
    kernel_energies, kernel_forces = get_kernel(
        train_representations,
        list_rep,
        train_displaced_representations,
        list_disp_rep,
        kernel_args=kernel_args,
        dx=dx)

    kernel_energies = kernel_energies[0]
    kernel_forces = kernel_forces[0]

    # predict
    energies = np.dot(kernel_energies.T, train_alphas)
    forces = np.dot(kernel_forces.T, train_alphas)

    print(energies)
    print(forces)
def prepare_training_data_protonafinity():
    """Build representations for the proton-affinity dataset.

    Reads ``data/dataset-proton-affinity/data/pm3_properties.csv`` and, for
    every row, loads the neutral (``xyz<N>_n.xyz``) and protonated
    (``xyz<N>_<P>.xyz``) structures from ``pm3.cosmo.mop/`` and passes each
    to ``generate_fchl_acsf``. In the protonated structure the single atom
    with a positive formal charge (taken from the protonated SMILES) is
    relabelled with nuclear charge 12 before the representation is built.

    Returns:
        n_representations: Array of neutral-state representations
        p_representations: Array of protonated-state representations
        n_coord_list: Coordinates of each neutral structure
        p_coord_list: Coordinates of each protonated structure
        n_atoms_list: Nuclear charges of each neutral structure
        p_atoms_list: Nuclear charges of each protonated structure (with the
            charged atom set to 12)
        energies: Array stacking the ``NeutralEnergy`` and
            ``ProtonatedEnergy`` columns (neutral first)

    Raises:
        AssertionError: If a protonated SMILES cannot be parsed or does not
            contain exactly one positively charged atom.
    """
    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        # 12 is the marker value assigned to the charged atom below
        "elements": [1, 6, 7, 8, 9, 12]
    }

    dirprefix = "data/dataset-proton-affinity/data/"
    filename = dirprefix + "pm3_properties.csv"

    df = pd.read_csv(filename, sep=",")
    n_rows = df.shape[0]

    # column names
    col_neuidx = "MoleculeIdx"
    col_proidx = "ProtonatedIdx"
    col_prosmi = "ProtonatedSmiles"
    col_neueng = "NeutralEnergy"
    col_proeng = "ProtonatedEnergy"

    # Collect energies
    energies = np.array([df[col_neueng], df[col_proeng]])

    # Protonated representation
    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    # Neutral representation
    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    for _, row in tqdm.tqdm(df.iterrows(), desc="Preparing FCHL19", total=n_rows, **TQDM_OPTIONS):

        nidx = row[col_neuidx]
        pidx = row[col_proidx]

        nname = f"xyz{nidx}_n.xyz"
        pname = f"xyz{nidx}_{pidx}.xyz"

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "pm3.cosmo.mop/" + nname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "pm3.cosmo.mop/" + pname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Find protonated atom
        smiles = row[col_prosmi]
        molobj = cheminfo.smiles_to_molobj(smiles)
        assert molobj is not None, f"Molobj failed for {smiles}"

        atom_charges = np.array([atom.GetFormalCharge() for atom in molobj.GetAtoms()])
        charged_idx, = np.where(atom_charges > 0)
        assert len(charged_idx) == 1, f"Should only be one charged atom in {pname}"

        # Mark the charged atom with the marker value 12.
        # NOTE(review): this assumes the SMILES atom order matches the xyz
        # atom order - confirm.
        atoms[charged_idx[0]] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation; the neutral one was appended
        # here by mistake, making both returned arrays identical.
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, energies
def prepare_training_data_qmepa890():
    """Build representations for the qmepa890 dataset.

    Reads ``data/qmepa890/data.csv`` (no header row) and, for every row,
    loads ``structures/<id>.xyz`` (neutral) and ``structures/<id>_+.xyz``
    (protonated) and passes each to ``generate_fchl_acsf``. In the
    protonated structure the atom at the 1-based proton index from the CSV
    is relabelled with nuclear charge 12 before the representation is built.

    Returns:
        n_representations: Array of neutral-state representations
        p_representations: Array of protonated-state representations
        n_coord_list: Coordinates of each neutral structure
        p_coord_list: Coordinates of each protonated structure
        n_atoms_list: Nuclear charges of each neutral structure
        p_atoms_list: Nuclear charges of each protonated structure (with the
            proton set to 12)
        proton_idxs: Array of proton indices (CSV column 2)
        energies: DataFrame with the remaining CSV columns (3 onwards)
        atomization_list: Array with the summed free-atom energies of each
            neutral molecule, in kcal/mol

    Raises:
        KeyError: If a neutral structure contains an element other than
            H, C, N, O or S.
    """
    # Table 5. Free atom energies from DFT/PBE0/def2TZVP.
    #               H          C         N         O         S
    # Multiplicity  2          3         4         3         3
    # Energy / Eh   -0.501036  -37.8054  -54.5438  -75.0186  -397.974
    au2kcal = 627.518135759111
    atom_energies = {}
    atom_energies["H"] = -0.501036 * au2kcal
    atom_energies["C"] = -37.8054 * au2kcal
    atom_energies["N"] = -54.5438 * au2kcal
    atom_energies["O"] = -75.0186 * au2kcal
    atom_energies["S"] = -397.974 * au2kcal

    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        # 12 is the marker value assigned to the proton below
        "elements": [1, 6, 7, 8, 12]
    }

    dirprefix = "data/qmepa890/"
    filename = dirprefix + "data.csv"

    # CSV columns:
    # 1. File ID (e.g. 0415 means the information pertains to the files `0415.xyz` and `0415_+.xyz`)
    # 2. Index of the proton (in the `XXXX_+.xyz` file listed in the same row)
    # 3. Gas-phase energy of neutral molecule plus thermal corrections from vibrational analysis
    # 4. Gas-phase energy of protonated molecule plus thermal corrections from vibrational analysis
    # 5. Gas-phase energy of neutral molecule
    # 6. Gas-phase energy of protonated molecule
    # 7. Energy of neutral molecule using SMD implicit solvent model
    # 8. Energy of protonated molecule using SMD implicit solvent model
    # 9. PM6 heat-of-formation of neutral molecule using COSMO implicit solvent model
    # 10. PM6 heat-of-formation of protonated molecule using COSMO implicit solvent model

    df = pd.read_csv(filename, sep=",", header=None)

    molecule_names = df.iloc[:, 0]
    proton_idxs = df.iloc[:, 1]
    energies = df.iloc[:, 2:]

    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    atomization_list = []

    for h_idx, name in zip(proton_idxs, molecule_names):

        # File IDs are zero-padded to four digits on disk.
        name = str(name).zfill(4)
        print(f"representing {name}")

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "structures/" + name + ".xyz")

        # Sum of free-atom energies, computed while the labels are still
        # element symbols.
        atomization_list.append(sum(atom_energies[atom] for atom in atoms))

        atoms = [cheminfo.convert_atom(atom) for atom in atoms]
        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "structures/" + name + "_+.xyz")
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # The CSV proton index is 1-based; mark that atom with the value 12.
        atoms[h_idx - 1] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation; the neutral one was appended
        # here by mistake, making both returned arrays identical.
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    proton_idxs = np.array(proton_idxs)
    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)
    atomization_list = np.array(atomization_list)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, proton_idxs, energies, atomization_list
def main():
    """Train a model from a list of molecules and dump it as npy files.

    CLI:
        -f/--filename: text file listing molecule names, one per line. For
            each name, ``<name>.xyz`` and ``<name>.energy`` are read from the
            directory containing the list file. The ``.energy`` file holds
            the energy on its first line followed by comma-separated force
            rows.
        -d/--dump: output prefix. Writes ``<prefix>.alphas``,
            ``.representations``, ``.displaced_representations`` and
            ``.parameters`` via ``np.save``.
        --test: after training, print the result of comparing the training
            set reproduction error against fixed thresholds.
        --optimize: accepted but currently unused.
    """
    import argparse
    import os

    description = (
        "Based on a list of molecules, train a representation-set and alpha "
        "set. Output the npy files"
    )

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-f', '--filename', action='store', help='List of molecules', metavar='listfile')
    parser.add_argument('-d', '--dump', action='store', help='Output model in npy format', metavar='file')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--optimize', action='store_true')
    args = parser.parse_args()

    # Get molecule filenames
    with open(args.filename, 'r') as f:
        molecules = [line.strip() for line in f]

    # Data files live next to the list file. os.path.dirname yields "" for a
    # bare filename, so the lookup stays relative to the working directory
    # instead of jumping to "/".
    directory = os.path.dirname(args.filename)

    # Init all the rep lists
    list_atoms = []
    list_charges = []
    list_coordinates = []
    list_energies = []
    list_forces = []
    list_rep = []
    list_disp_rep = []

    # HYPER PARAMETERS
    CUT_DISTANCE = 1e6
    KERNEL_ARGS = {
        "verbose": False,
        "cut_distance": CUT_DISTANCE,
        "kernel": "gaussian",
        "kernel_args": {
            "sigma": [0.64],
        },
    }
    DX = 0.005

    # read coordinates, energies and forces
    for filename in molecules:

        atoms, coordinates = rmsd.get_coordinates_xyz(os.path.join(directory, filename + ".xyz"))
        nuclear_charges = [NUCLEAR_CHARGE[atom] for atom in atoms]

        # First line is the energy, every following line one force row.
        with open(os.path.join(directory, filename + ".energy"), 'r') as f:
            energy = float(next(f))
            force = np.array([line.split(",") for line in f], dtype=float)

        list_atoms.append(atoms)
        list_charges.append(nuclear_charges)
        list_coordinates.append(coordinates)
        list_energies.append(energy)
        list_forces.append(force)

    # Calculate NMAX hyperparameter (largest molecule in the set)
    NMAX = np.max([len(x) for x in list_atoms])

    # Save model parameters
    PARAMETERS = {
        "kernel_args": KERNEL_ARGS,
        "cut_distance": CUT_DISTANCE,
        "max_atoms": NMAX,
        "dx": DX
    }

    # Calculate representations
    for charges, coordinates in zip(list_charges, list_coordinates):
        rep = generate_representation(
            coordinates, charges, max_size=NMAX, cut_distance=CUT_DISTANCE)
        disp_rep = generate_displaced_representations(
            coordinates, charges, max_size=NMAX, cut_distance=CUT_DISTANCE, dx=DX)

        list_rep.append(rep)
        list_disp_rep.append(disp_rep)

    list_energies = np.array(list_energies)
    list_forces = np.array(list_forces)
    list_rep = np.array(list_rep)
    list_disp_rep = np.array(list_disp_rep)

    # Hack, easy way to normalize energies (same molecule)
    avg = np.sum(list_energies) / len(list_energies)
    list_energies -= avg

    # hartree / bohr to hartree / angstrom
    list_forces *= 1.0 / 0.529177249

    # generate train / test views
    view_all = np.array(range(len(molecules)))
    view_train = view_all

    # TODO cross-validation of hyper-parameter optimization

    # generate kernel
    kernel_train_energies, kernel_train_deriv = wrapper.get_kernel(
        list_rep[view_train],
        list_rep[view_train],
        list_disp_rep[view_train],
        list_disp_rep[view_train],
        dx=DX,
        kernel_args=KERNEL_ARGS)

    kernel_train_energies = kernel_train_energies[0]
    kernel_train_deriv = kernel_train_deriv[0]

    # generate alphas
    alphas = wrapper.get_alphas(
        kernel_train_energies,
        kernel_train_deriv,
        list_energies[view_train],
        list_forces[view_train])

    # dump the model
    np.save(args.dump + ".alphas", alphas)
    np.save(args.dump + ".representations", list_rep)
    np.save(args.dump + ".displaced_representations", list_disp_rep)
    np.save(args.dump + ".parameters", PARAMETERS)

    # self test: the flag is registered as --test, so it is exposed as
    # args.test (args.selftest does not exist and raised AttributeError).
    if args.test:
        energy_valid = np.dot(kernel_train_energies.T, alphas)
        force_valid = np.dot(kernel_train_deriv.T, alphas)
        print(
            mae(list_energies[view_train], energy_valid) < 0.08,
            "Error in operator test energy")
        print(
            mae(list_forces[view_train].flatten(), force_valid) < 0.1,
            "Error in operator test force")

    return