def test_fit_1():
    """
    Fit an ARMP estimator using the compound-based workflow.

    The xyz files are first turned into compounds, the properties are set,
    the representation is generated from the stored compounds and the model
    is finally fitted on the indices of the first 50 samples.
    """
    n_train = 50
    here = os.path.dirname(os.path.realpath(__file__))

    # glob returns paths in arbitrary order, so sort for a reproducible ordering
    xyz_files = sorted(glob.glob(here + "/CN_isobutane/*.xyz"))
    targets = np.loadtxt(here + '/CN_isobutane/prop_kjmol_training.txt', usecols=[1])

    model = ARMP(representation="acsf")
    model.generate_compounds(xyz_files[:n_train])
    model.set_properties(targets[:n_train])
    model.generate_representation()

    # The estimator is fitted on sample indices rather than on the data itself
    model.fit(np.arange(n_train))
} # Generate estimator estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', representation_params=acsf_params, tensorboard=True, store_frequency=2, hidden_layer_sizes=(50, 30, 10), batch_size=200) estimator.set_properties(ene_isopent) estimator.generate_representation(pad_xyz, pad_zs, method='fortran') print("Generated the representations") print(estimator.representation.shape) idx = list(range(n_samples)) idx_train, idx_test = modsel.train_test_split(idx, random_state=42, shuffle=True) estimator.fit(idx_train) data_squal = h5py.File( "/Volumes/Transcend/data_sets/CN_squalane/dft/squalane_cn_dft.hdf5", "r") xyz_squal = np.array(data_squal.get("xyz")[:10])
"zeta": 220.127, "eta": 30.8065 } estimator = ARMP(iterations=6000, representation_name='acsf', representation_params=acsf_params, l1_reg=0.0, l2_reg=0.0, scoring_function="rmse", tensorboard=False, store_frequency=10, learning_rate=0.075) estimator.set_properties(energies[:100]) estimator.generate_compounds(filenames[:100]) estimator.generate_representation(method="tf") print(estimator.representation.shape) idx = list(range(100)) idx_train, idx_test = modsel.train_test_split(idx, test_size=0, random_state=42, shuffle=True) estimator.fit(idx_train) score = estimator.score(idx_train) print("The RMSE is %s kcal/mol." % (str(score))) ene_pred = estimator.predict(idx_train)
from qml.aglaia.aglaia import ARMP
import glob
import numpy as np
from sklearn import model_selection as modsel

# Location of the test data (absolute path on the author's machine)
test_dir = "/Volumes/Transcend/repositories/my_qml_fork/qml/test/"

# glob returns paths in arbitrary order, so sort for a reproducible ordering
filenames = sorted(glob.glob(test_dir + "/qm7/*.xyz"))
energies = np.loadtxt(test_dir + '/data/hof_qm7.txt', usecols=[1])

# Number of molecules used in this run
n_samples = 500

estimator = ARMP(representation_name="acsf", iterations=100)
estimator.generate_compounds(filenames[:n_samples])
estimator.set_properties(energies[:n_samples])
estimator.generate_representation(method="fortran")

# Hold out 10% of the sample indices; fit and score on the remaining ones
idx = np.arange(n_samples)
idx_train, idx_test = modsel.train_test_split(
    idx,
    random_state=42,
    shuffle=True,
    test_size=0.1,
)

estimator.fit(idx_train)
estimator.score(idx_train)
"angular_rs": np.arange(0, 10, 0.5), "theta_s": np.arange(0, 3.14, 0.25) } estimator = ARMP(iterations=2000, batch_size=256, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.00015, representation='acsf', representation_params=acsf_params, tensorboard=True, store_frequency=50) estimator.set_properties(ene) estimator.generate_representation(xyz, zs) idx = list(range(n_samples)) idx_train, idx_test = modsel.train_test_split(idx, test_size=0.15, random_state=42, shuffle=True) all_scores = [] for lr in learning_rate: for l1 in l1_reg: for l2 in l2_reg: estimator.fit(idx_train) score = estimator.score(idx_test)
"zeta": 100.06564927139748, "eta": 39.81824764370754 } estimator = ARMP(iterations=2633, batch_size=22, l1_reg=1.46e-05, l2_reg=0.0001, learning_rate=0.0013, representation_name='acsf', representation_params=acsf_params, tensorboard=True, store_frequency=25, hidden_layer_sizes=(185, )) estimator.set_properties(ene_isopent) estimator.generate_representation(xyz_isopent, zs_isopent, method="fortran") # Training the model on 3 folds of n data points for n in n_samples: cv_idx = idx_train[:n] splitter = modsel.KFold(n_splits=3, random_state=42, shuffle=True) indices = splitter.split(cv_idx) scores_per_fold = [] traj_scores_per_fold = [] for item in indices: idx_train_fold = cv_idx[item[0]] idx_test_fold = cv_idx[item[1]]
l1_reg=1.46e-05, l2_reg=0.0001, learning_rate=0.0013, representation_name='acsf', representation_params=acsf_params, tensorboard=True, store_frequency=25, hidden_layer_sizes=(185, )) # Loading the model previously trained estimator.load_nn("../trained_nn/vr-nn") estimator.set_properties(ene_surface) # Generating the representation start = time.time() estimator.generate_representation(xyz_surface, zs_surface, method="fortran") end = time.time() print("The time taken to generate the representations is %s s" % (str(end - start))) print("The shape of the representations is %s" % (str(estimator.representation.shape))) # Predicting the energies idx = list(range(n_samples)) predictions = estimator.predict(idx) # Printing the mean absolute error mae = mean_absolute_error(ene_surface, predictions) print("The MAE is %.2f kJ/mol" % mae) # Saving the results to a HDF5 file
## ------------- ** Loading the data ** ---------------

current_dir = os.path.dirname(os.path.realpath(__file__))

# glob returns paths in arbitrary order, so sort for a reproducible ordering
filenames = sorted(glob.glob(current_dir + '/../test/CN_isobutane/*.xyz'))
energies = np.loadtxt(
    current_dir + '/../test/CN_isobutane/prop_kjmol_training.txt',
    usecols=[1],
)

## ------------- ** Setting up the estimator ** ---------------

estimator = ARMP(
    iterations=10,
    representation='acsf',
    representation_params={
        "radial_rs": np.arange(0, 10, 1),
        "angular_rs": np.arange(0.5, 10.5, 1),
        "theta_s": np.arange(0, 5, 1),
    },
    tensorboard=False,
)

estimator.generate_compounds(filenames)
estimator.set_properties(energies)
estimator.generate_representation()

## ------------- ** Fitting to the data ** ---------------

# The estimator is fitted on sample indices; the first 100 samples are used
idx = np.arange(100)
estimator.fit(idx)

## ------------- ** Predicting and scoring ** ---------------

score = estimator.score(idx)
# The sign of the score is flipped before it is reported as an error
print("The mean absolute error is %s kJ/mol." % (str(-score)))

energies_predict = estimator.predict(idx)
# Getting the dataset (opened read-only)
data = h5py.File(
    "/Volumes/Transcend/data_sets/CN_isopentane/pruned_dft_with_forces/pruned_isopentane_cn_dft.hdf5",
    "r",
)

# Only the last n_samples entries of each array are used
n_samples = 500

xyz = np.array(data.get("xyz")[-n_samples:])
# presumably 2625.50 converts Hartree to kJ/mol — confirm the stored units
ene = np.array(data.get("ene")[-n_samples:])*2625.50
# Shift the energies so they are relative to the first entry of the data set
ene = ene - data.get("ene")[0]*2625.50
zs = np.array(data["zs"][-n_samples:], dtype=np.int32)

# Creating the estimator
acsf_param = {
    "nRs2": 5,
    "nRs3": 5,
    "nTs": 5,
    "rcut": 5,
    "acut": 5,
    "zeta": 220.127,
    "eta": 30.8065,
}

estimator = ARMP(
    iterations=1000,
    batch_size=512,
    l1_reg=0.0,
    l2_reg=0.0,
    learning_rate=0.001,
    representation_name='acsf',
    representation_params=acsf_param,
    tensorboard=False,
    store_frequency=50,
)

estimator.set_properties(ene)
estimator.generate_representation(xyz, zs, method='fortran')
# NOTE(review): confirm the estimator exposes the representation as `g`
print(estimator.g.shape)

# Splitting the sample indices into training and test sets (no shuffling)
idx = list(range(n_samples))
idx_train, idx_test = modsel.train_test_split(
    idx,
    test_size=0.15,
    random_state=42,
    shuffle=False,
)

print("Starting the fitting...")
estimator.fit(idx_train)

# estimator.save_nn("saved_model")

# Predict once from the stored representation and once from the raw
# coordinates and nuclear charges of the same training samples
pred1 = estimator.predict(idx_train)
pred2 = estimator.predict_from_xyz(xyz[idx_train], zs[idx_train])