def test_krr_cmat():
    """Kernel-ridge regression check on row-norm-sorted Coulomb matrices.

    Loads the first 1000 molecules (sorted by xyz filename) listed in the
    heats-of-formation file, shuffles them with a fixed seed, trains a
    Laplacian-kernel model on the first 700 and asserts that the mean
    absolute error on the last 300 stays below 6.0.
    """
    here = os.path.dirname(os.path.realpath(__file__))

    # xyz filename -> PBE0/def2-TZVP heat of formation
    hof_by_file = get_energies(here + "/data/hof_qm7.txt")

    compounds = []
    for fname in sorted(hof_by_file.keys())[:1000]:
        compound = qml.Compound(xyz=here + "/qm7/" + fname)
        # Attach the target property, then build the representation
        compound.properties = hof_by_file[fname]
        compound.generate_coulomb_matrix(size=23, sorting="row-norm")
        compounds.append(compound)

    # Deterministic shuffle so the split is reproducible
    np.random.seed(666)
    np.random.shuffle(compounds)

    n_train, n_test = 700, 300
    train_set = compounds[:n_train]
    test_set = compounds[-n_test:]

    train_reps = np.array([c.representation for c in train_set])
    test_reps = np.array([c.representation for c in test_set])
    train_targets = np.array([c.properties for c in train_set])
    test_targets = np.array([c.properties for c in test_set])

    # Hyper-parameters
    width = 10**(4.2)
    regularizer = 10**(-10.0)

    # Regularized training kernel and regression coefficients
    train_kernel = laplacian_kernel(train_reps, train_reps, width)
    train_kernel[np.diag_indices_from(train_kernel)] += regularizer
    coefficients = cho_solve(train_kernel, train_targets)

    # Kernel between training and test molecules, then predictions
    cross_kernel = laplacian_kernel(train_reps, test_reps, width)
    predictions = np.dot(cross_kernel.transpose(), coefficients)

    error = np.mean(np.abs(test_targets - predictions))
    assert error < 6.0, "ERROR: Too high MAE!"
def krr(kernel, properties, rcond=1e-9, solver="cho"):
    """Return the regression coefficients for ``kernel`` and ``properties``.

    With ``solver == "cho"`` the work is delegated to ``cho_solve`` and
    ``rcond`` is forwarded as its ``l2reg`` argument. Any other ``solver``
    value delegates to ``svd_solve`` with ``rcond`` forwarded as ``rcond``.
    """
    if solver != "cho":
        return svd_solve(kernel, properties, rcond=rcond)
    return cho_solve(kernel, properties, l2reg=rcond)
def get_alphas(X, Y, sigma, llambda):
    """Compute the regression coefficients alpha for training data (X, Y).

    A Laplacian kernel of width ``sigma`` is built over ``X``, a copy of it
    gets ``llambda`` added to its diagonal, and that regularized matrix is
    handed to ``cho_solve`` together with ``Y``.
    """
    regularized = deepcopy(laplacian_kernel(X, X, sigma))
    diagonal = np.diag_indices_from(regularized)
    regularized[diagonal] += llambda
    return cho_solve(regularized, Y)
def cross_validation(X, Y, sigmas, llambdas, Ntot):
    """Find the (lambda, sigma) pair with the lowest cross-validated MAE.

    For every sigma a Laplacian kernel over ``X`` is built once. For every
    lambda the ``Ntot`` samples are randomly re-split five times into 80 %
    training and 20 % test data, a model is trained on the training part and
    the mean absolute error on the test part is averaged over the five
    splits.

    Args:
        X: representations handed to ``laplacian_kernel``.
        Y: NumPy array of target values, indexable by a list of indices.
        sigmas: sequence of kernel widths to try.
        llambdas: sequence of diagonal regularizers to try.
        Ntot: number of samples to draw the splits from.

    Returns:
        Tuple ``(llambda, sigma)`` of the best-scoring combination.

    Raises:
        ValueError: if ``sigmas`` or ``llambdas`` is empty.

    Side effects:
        Seeds the ``random`` module with 666 and prints the minimum MAE.
    """
    n_repeats = 5
    n_train = int(Ntot * 0.8)
    n_test = int(Ntot - n_train)

    results = []
    random.seed(666)
    for sigma in sigmas:
        K = laplacian_kernel(X, X, sigma)
        for llambda in llambdas:
            # Collect the error of every split so the stored score really is
            # the mean over all repetitions.
            maes = []
            for _ in range(n_repeats):
                # random.shuffle needs a mutable sequence; a bare range()
                # object is immutable on Python 3.
                split = list(range(Ntot))
                random.shuffle(split)
                training_index = split[:n_train]
                test_index = split[-n_test:]

                # Indexing with a list already yields a fresh array, so the
                # diagonal update below cannot touch K.
                C = K[training_index][:, training_index]
                C[np.diag_indices_from(C)] += llambda
                alpha = cho_solve(C, Y[training_index])

                y_est = np.dot(K[training_index][:, test_index].T, alpha)
                maes.append(np.mean(np.abs(y_est - Y[test_index])))
            results.append([llambda, sigma, np.mean(maes)])

    if not results:
        raise ValueError("cross_validation needs at least one sigma and one lambda")

    best = min(results, key=lambda entry: entry[2])
    print("minimum MAE after CV: ", best[2])
    return best[0], best[1]
def do_qml_gaussian_kernel(self):
    """Train a Gaussian-kernel ridge model and score it on the test data.

    Reads ``self.x_training``, ``self.y_training``, ``self.x_test``,
    ``self.y_test``, ``self.sigma`` and ``self.lamda``. Stores the mean
    absolute error in ``self.mae`` and the predictions in
    ``self.test_predicted_results``.
    """
    # Training kernel with the regularizer added on the diagonal
    train_kernel = gaussian_kernel(self.x_training, self.x_training, self.sigma)
    train_kernel[np.diag_indices_from(train_kernel)] += self.lamda

    # Regression coefficients from the Cholesky-based solver
    coefficients = cho_solve(train_kernel, self.y_training)

    # Kernel between test and training points, then the predictions
    cross_kernel = gaussian_kernel(self.x_test, self.x_training, self.sigma)
    predictions = np.dot(cross_kernel, coefficients)

    self.mae = np.mean(np.abs(predictions - self.y_test))
    self.test_predicted_results = predictions
def get_learning_curve(X, X_test, Y, Y_test, sigma, llambda, Ntot):
    """Print learning-curve data: training-set size versus mean test MAE.

    Training-set sizes are 10, 20, 40, ... (doubling while below ``Ntot``)
    followed by ``Ntot`` itself. For each size, ten models are trained on
    random subsets of the ``Ntot`` training samples and evaluated on the
    complete test set; the average MAE is printed as ``"<size>\\t<mae>"``.

    Args:
        X: training representations handed to ``laplacian_kernel``.
        X_test: test representations handed to ``laplacian_kernel``.
        Y: NumPy array of training targets, indexable by a list of indices.
        Y_test: test targets.
        sigma: Laplacian kernel width.
        llambda: regularizer added to the kernel diagonal.
        Ntot: number of training samples to draw subsets from.

    Returns:
        None. Results are printed.

    Side effects:
        Seeds the ``random`` module with 667.
    """
    n_models = 10

    K = laplacian_kernel(X, X, sigma)
    K_test = laplacian_kernel(X, X_test, sigma)

    # Doubling schedule of training-set sizes, always ending at Ntot
    N = []
    size = 10
    while size < Ntot:
        N.append(size)
        size *= 2
    N.append(Ntot)

    random.seed(667)
    for train in N:
        maes = []
        for _ in range(n_models):
            # random.shuffle needs a mutable sequence; a bare range()
            # object is immutable on Python 3.
            split = list(range(Ntot))
            random.shuffle(split)
            training_index = split[:train]

            # Indexing with a list already yields a fresh array, so the
            # diagonal update below cannot touch K.
            C = K[training_index][:, training_index]
            C[np.diag_indices_from(C)] += llambda
            alpha = cho_solve(C, Y[training_index])

            Yss = np.dot(K_test[training_index].T, alpha)
            maes.append(np.mean(np.abs(Yss - Y_test)))
        print(str(train) + "\t" + str(sum(maes)/len(maes)))
# Learning-curve evaluation for the kernel width sigmas[j].
# NOTE(review): this fragment relies on names defined outside of it
# (X, X_test, sigmas, j, N, total, nModels, Yprime, Ytest, K); the original
# indentation was lost, so the nesting below is a reconstruction.
K_test = gaussian_kernel(X, X_test, sigmas[j])
for train in N:
    test = total - train
    maes = []
    for i in range(nModels):
        # Fresh random permutation of all sample indices for every model
        split = list(range(total))
        random.shuffle(split)
        training_index = split[:train]
        # test_index and Ys are assigned but not used further in this fragment
        test_index = split[-test:]
        Y = Yprime[training_index]
        Ys = Yprime[test_index]
        # Training sub-kernel with a fixed regularizer of 1e-7 on the diagonal
        C = deepcopy(K[training_index][:, training_index])
        C[np.diag_indices_from(C)] += 10.0**(-7.0)
        alpha = cho_solve(C, Y)
        # Predictions for the separate test set X_test
        Yss = np.dot((K_test[training_index]).T, alpha)
        diff = Yss - Ytest
        mae = np.mean(np.abs(diff))
        maes.append(mae)
    # rms uses Yss from the last trained model only
    rms = sqrt(mean_squared_error(Yss, Ytest))
    # Standard error of the MAE over the nModels repetitions
    s = np.std(maes) / np.sqrt(nModels)
    # Columns: sigma, training-set size, mean MAE, standard error, rms
    print(
        str(sigmas[j]) + "\t" + str(train) + "\t" +
        str(sum(maes) / len(maes)) + "\t" + str(s) + "\t" + str(rms))
reps.append(rep) energies.append(energy) except: print(ani['molecule']) print(ani['species']) print(ani['coordinates']) print(ani['energy']) X = np.array(reps)[:5000] y = np.array(energies)[:5000] sigma = 2.5 K = get_local_kernels(X, X, [sigma], cut_distance=10.0)[0] K[np.diag_indices_from(K)] += 1e-8 alpha = cho_solve(K, y) np.save('/ihome/ghutchison/dlf57/ml-benchmark/alpha-sig25-5k.npz', alpha) data = [] for out in sorted(glob.iglob( '/ihome/ghutchison/dlf57/ml-benchmark/molecules/stretch/*/sdf/*.sdf'), key=numericalSort): name = out.split('stretch/')[1].split('/sdf')[0] pt = out.split('sdf/')[1].split('.')[0] if name != 'HF': mol = Molecule(out) coords = mol.xyz at_num = mol.at_num rep = generate_representation(coords, at_num, max_size=45) rep = np.array([rep])
def test_krr_fchl_local():
    """Regression test for kernel-ridge regression with local FCHL kernels.

    Passes every supported keyword argument to the local FCHL kernel
    functions, checks that the symmetric and the general kernel agree and
    contain no NaN, and asserts that the test-set MAE lies within 1.0 of the
    reference value 2.
    """
    # Test that all kernel arguments work
    kernel_args = {
        "cut_distance": 1e6,
        "cut_start": 0.5,
        "two_body_width": 0.1,
        "two_body_scaling": 2.0,
        "two_body_power": 6.0,
        "three_body_width": 3.0,
        "three_body_scaling": 2.0,
        "three_body_power": 3.0,
        "alchemy": "periodic-table",
        "alchemy_period_width": 1.0,
        "alchemy_group_width": 1.0,
        "fourier_order": 2,
    }

    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:100]:
        # Initialize the qml.Compound() object
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Generate the FCHL representation of the molecule
        mol.generate_fchl_representation(cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 2.5
    llambda = 1e-8

    K_symmetric = get_local_symmetric_kernels(X, [sigma], **kernel_args)[0]
    K = get_local_kernels(X, X, [sigma], **kernel_args)[0]

    assert np.allclose(K, K_symmetric), "Error in FCHL symmetric local kernels"
    # A single NaN entry must fail the test, hence any() rather than all()
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL local symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL local kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels(Xs, X, [sigma], **kernel_args)[0]
    assert not np.any(np.isnan(Ks)), "FCHL local testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL local kernel-ridge regression"
def test_krr_fchl_global():
    """Regression test for kernel-ridge regression with global FCHL kernels.

    Checks that the symmetric and the general global kernel agree and
    contain no NaN, and asserts that the test-set MAE lies within 1.0 of the
    reference value 2.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:100]:
        # Initialize the qml.Compound() object
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Build the representation from coordinates and nuclear charges
        mol.representation = generate_representation(
            mol.coordinates, mol.nuclear_charges, cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 100.0
    llambda = 1e-8

    K_symmetric = get_global_symmetric_kernels(X, [sigma])[0]
    K = get_global_kernels(X, X, [sigma])[0]

    assert np.allclose(K, K_symmetric), "Error in FCHL symmetric global kernels"
    # A single NaN entry must fail the test, hence any() rather than all()
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL global symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL global kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_global_kernels(Xs, X, [sigma])[0]
    assert not np.any(np.isnan(Ks)), "FCHL global testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL global kernel-ridge regression"
for mol in compounds: mol.generate_coulomb_matrix(size=23, sorting="row-norm") # mol.generate_bob(size=23, asize={"O":3, "C":7, "N":3, "H":16, "S":1}) # Make a big 2D array with all the representations X = np.array([mol.representation for mol in compounds]) # X = np.array([mol.bob for mol in compounds]) # Print all representations print("Representations:") print(X) # Assign 1000 first molecules to the training set X_training = X[:1000] Y_training = energy_pbe0[:1000] sigma = 4000.0 K = gaussian_kernel(X_training, X_training, sigma) print("Gaussian kernel:") print(K) # Add a small lambda to the diagonal of the kernel matrix K[np.diag_indices_from(K)] += 1e-8 # Use the built-in Cholesky-decomposition to solve alpha = cho_solve(K, Y_training) print("Alphas:") print(alpha)
# Scan the regularizers in lambda1: for each one, train and validate once per
# entry of `bins` and print the averaged validation MAE.
# NOTE(review): relies on names defined outside this fragment (lambda1, bins,
# N_train_val, xxx, train, nbins, Y_train_val, K, sig); the original
# indentation was lost, so the nesting below is a reconstruction.
for lamb in lambda1:
    maes = []
    # ibin itself is not used; every pass draws a new random permutation
    for ibin in bins:
        split = np.array(list(range(N_train_val)))
        random.shuffle(split)
        # First int(xxx * train) shuffled indices train the model,
        # the last int(train / nbins) shuffled indices validate it
        training_index = split[:int(xxx * train)]
        val_index = split[-int(train / nbins):]
        # Targets of the training subset
        Y_train = Y_train_val[training_index]
        # Targets of the validation subset
        Y_val = Y_train_val[val_index]
        C = deepcopy(K[training_index][:, training_index])
        # Add a small lambda to the diagonal of the kernel matrix
        C[np.diag_indices_from(C)] += lamb
        # Use the built-in Cholesky-decomposition to solve
        alpha = cho_solve(C, Y_train)
        # Validate
        Yss = np.dot((K[training_index][:, val_index]).T, alpha)
        diff = Yss - Y_val
        # Mean absolute error (MAE) of this pass
        mae = np.mean(np.abs(diff))
        maes.append(mae)
    # Standard error of the MAE, normalized by sqrt(nbins)
    s = np.std(maes) / np.sqrt(nbins)
    print(sig, lamb, train, sum(maes) / len(maes), s)
    # Disabled variants that append the same row to a data file:
    #print(sig, lamb, train, sum(maes)/len(maes), s, file=open("direkt_S1_TZVP_osc_no_neg.dat", "a"))
    #print(sig, lamb, train, sum(maes)/len(maes), s, file=open("direkt_S1_TZVP_no_neg.dat", "a"))
def get_alphas(kernel, properties):
    """Compute the regression coefficients by delegating to ``qml_math.cho_solve``."""
    return qml_math.cho_solve(kernel, properties)
test_filenames = filenames[500:750] # hyper parameters sigmas = [1.0, 10.0, 10.0**2, 10.0**3] cutoffs = [2.0, 3.0, 4.0] llambda = 1e-8 # doesn't usually need to be changed # try 3 different cutoffs for cutoff in cutoffs: train_x, train_y = get_descriptor_and_property(train_filenames, atype, cutoff) test_x, test_y = get_descriptor_and_property(test_filenames, atype, cutoff) # in this case try out 4 different values of sigma for sigma in sigmas: # Get the kernel between all descriptors in the training set K = laplacian_kernel(train_x, train_x, sigma) + llambda * np.identity(train_x.shape[0]) # get the KRR prefactors, i.e. this is the training of the network alpha = cho_solve(K, train_y) # get the kernel between all descriptors in the training set and all in the test set Ks = laplacian_kernel(test_x, train_x, sigma) # predict values of y y_pred = np.dot(Ks, alpha) print( "predicted MAE of %.4f for sigma: %.4g, cutoff: %.1f and %d training points" % (calc_mae(y_pred, test_y), sigma, cutoff, len(train_x)))
def test_krr_gaussian_local_cmat():
    """Regression test for the local Gaussian kernel on atomic Coulomb matrices.

    Compares the training and prediction kernels against stored reference
    files and against the ``get_atomic_kernels_gaussian`` wrapper, then
    asserts that the test-set MAE lies within 1.0 of the reference value
    19.0.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:1000]:
        # Initialize the qml.Compound() object
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Atomic Coulomb matrices sorted by row norm
        mol.generate_atomic_coulomb_matrix(size=23, sorting="row-norm")
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = 100
    n_train = 200

    training = mols[:n_train]
    test = mols[-n_test:]

    # Atomic representations stacked over all molecules, plus atom counts
    X = np.concatenate([mol.representation for mol in training])
    Xs = np.concatenate([mol.representation for mol in test])

    N = np.array([mol.natoms for mol in training])
    Ns = np.array([mol.natoms for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 724.0
    llambda = 10**(-6.5)

    K = get_local_kernels_gaussian(X, X, N, N, [sigma])[0]
    assert np.allclose(K, K.T), "Error in local Gaussian kernel symmetry"

    K_test = np.loadtxt(test_dir + "/data/K_local_gaussian.txt")
    assert np.allclose(
        K, K_test), "Error in local Gaussian kernel (vs. reference)"

    K_test = get_atomic_kernels_gaussian(training, training, [sigma])[0]
    assert np.allclose(K, K_test), "Error in local Gaussian kernel (vs. wrapper)"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels_gaussian(Xs, X, Ns, N, [sigma])[0]

    Ks_test = np.loadtxt(test_dir + "/data/Ks_local_gaussian.txt")

    # Sometimes a few Coulomb matrices differ because of parallel sorting and
    # numerical error. Allow up to 5 molecules (kernel rows) to differ from
    # the supplied reference. The absolute value is needed so that deviations
    # in either direction are counted.
    differing_rows = np.where(np.abs(Ks - Ks_test) > 1e-7)[0]
    differences_count = len(set(differing_rows))
    assert differences_count < 5, "Error in local Gaussian kernel (vs. reference)"

    Ks_test = get_atomic_kernels_gaussian(test, training, [sigma])[0]
    assert np.allclose(Ks, Ks_test), "Error in local Gaussian kernel (vs. wrapper)"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(19.0 - mae) < 1.0, "Error in local Gaussian kernel-ridge regression"
def __init__(self, wds, ia1, ia2, coeff=1.0, llambda=1.e-4):
    """
    Learn the density-matrix block between two atoms of the last molecule.

    wds -- passed as first argument to SLATM (presumably working
           directories; verify against SLATM)
    ia1, ia2 -- atomic index, starting from 0, of the two target atoms in
                the last molecule
    coeff -- scaling factor applied to the maximum distance when deriving
             the Gaussian kernel width
    llambda -- regularizer added to the diagonal of the training kernel

    Prints one line per training-set size with the number of training
    entries, the MAE and the RMSE of the prediction for the last entry.

    NOTE(review): several names used below are not defined inside this
    method (`kas2`, `X1`, `self.nm`); see the inline notes.
    """
    s1 = SLATM(wds, 'out', regexp='', properties='AE', M='slatm', \
               local=True, igroup=False, ow=False, nproc=1, istart=0, \
               slatm_params = { 'nbody':3, 'dgrids': [0.03,0.03], 'sigmas':[0.05,0.05],\
                                'rcut':4.8, 'rpower2':6, 'ws':[1.,1.,1.], \
                                'rpower3': 3, 'isf':0, 'kernel':'g', 'intc':3 }, \
               iY=False)
    fs = s1.fs
    coords = s1.coords
    # Cumulative atom counts: iast2 holds end offsets, iast1 start offsets
    iast2 = s1.nas.cumsum()
    # NOTE(review): `kas2` is not defined in this method; `iast2` was
    # presumably intended -- confirm
    iast1 = np.array([
        0,
    ] + list(kas2[:-1]))
    objs = []
    ds = []
    for i, f in enumerate(fs):
        obj = wfn(f)
        obj.get_dm()
        objs.append(obj)
        # Distances between the atoms of molecule i and those of molecule
        # self.nm - 1.
        # NOTE(review): self.nm is not assigned in this method, and no entry
        # is appended for i >= self.nm - 1 although ds[i] is read for every
        # i in the next loop -- confirm
        if i < self.nm - 1:
            ds.append(ssd.cdist(coords[i], coords[self.nm - 1]))
    ## specify target atom pairs!!
    #ia1, ia2 = 0, 1
    #coeff = 1.0; llambda = 1e-6
    # cia1 / cia2 are not used again in this method
    cia1 = coords[-1][ia1]
    cia2 = coords[-1][ia2]
    xs = []
    ys = []
    nhass = []
    for i, f in enumerate(fs):
        dsi = ds[i]
        jas = np.arange(dsi.shape[0])
        # Atoms of molecule i lying within 0.01 of target atom ia1 / ia2
        filt1 = (dsi[:, ia1] <= 0.01)
        filt2 = (dsi[:, ia2] <= 0.01)
        if np.any(filt1) and np.any(filt2):
            nhass.append(s1.nhass[i])
            obj = objs[i]
            ja1 = jas[filt1]
            ja2 = jas[filt2]
            # Row range p:q and column range r:s of obj.dm for the two atoms
            p, q, r, s = obj.ibs1[ja1], obj.ibs2[ja1], obj.ibs1[
                ja2], obj.ibs2[ja2]
            dmij = obj.dm[p:q, r:s].ravel()
            ys.append(dmij)
            # Rows of s1.X belonging to the two matched atoms
            iat1 = iast1[i] + ja1
            iat2 = iast1[i] + ja2
            x1 = s1.X[iat1]
            x2 = s1.X[iat2]
            xs.append(np.concatenate((x1, x2), axis=0))
            nprop = len(dmij)
    nt = len(nhass)
    nhass = np.array(nhass)
    tidxs = np.arange(nt)
    nhass_u = np.unique(nhass)
    nu = len(nhass_u)
    xs = np.array(xs)
    ys = np.array(ys)
    # The last collected entry is the prediction target
    xs2 = np.array([xs[-1]])
    ys2 = np.array([ys[-1]])
    for j in range(nu):
        # Train on all entries whose nhass value does not exceed nhass_u[j]
        jdxs = tidxs[nhass <= nhass_u[j]]
        xs1 = xs[jdxs, :]
        ys1 = ys[jdxs, :]
        # NOTE(review): `X1` is not defined in this method; `xs1` was
        # presumably intended -- confirm
        ds1 = qd.l2_distance(X1, X1)  # ssd.pdist(x1, metric='euclidean')
        dmax = max(ds1.ravel())
        # Kernel width derived from the largest distance, scaled by coeff
        sigma = coeff * dmax / np.sqrt(2.0 * np.log(2.0))
        K1 = qk.gaussian_kernel(xs1, xs1, sigma)
        assert np.allclose(K1, K1.T), "Error in local Gaussian kernel symmetry"
        K1[np.diag_indices_from(K1)] += llambda
        alpha = np.array([cho_solve(K1, ys1)]).T
        K2 = qk.gaussian_kernel(xs2, xs1, sigma)
        ys2_est = np.dot(K2,
                         alpha)
        error = np.squeeze(ys2_est) - ys2
        mae = np.sum(np.abs(error)) / nprop
        rmse = np.sqrt(np.sum(error**2) / nprop)
        # Columns: number of training entries, MAE, RMSE
        print('%4d %12.8f %12.8f' % (len(xs1), mae, rmse))
#copy relevant rows&columns from K for learning C = deepcopy(K[training_indices][:, training_indices]) #add slight alteration lambda C[np.diag_indices_from(C)] += lamda #further info m_c.x_training = X_list[rep][training_indices] m_c.x_test = X_list[rep][test_indices] m_c.y_training = Y_energy_list[training_indices] m_c.y_test = Y_energy_list[test_indices] #solve for alphas alphas = cho_solve(C, m_c.y_training) K_test = m_c.laplacian_kernel_matrix( x_training=m_c.x_training, x_test=m_c.x_test) m_c.test_predicted_results = np.dot(K_test, alphas) mae = m_c.calculate_mae() mae_nmodels += mae print("mae_nmodels ", mae_nmodels) avg_mae = mae_nmodels / float(nModels) print("avg mae:", avg_mae) m_c.mae = avg_mae learning_list.append(m_c)
d_power = 4.0 # Width for Gaussians in the spectrum d_width = 0.1 # Gaussian-kernel width sigma = 50.0 K = boss_kernel(X, X, n_train, n_train, sigma, d_width, d_power) Ks = boss_kernel(X, Xs, n_train, n_test, sigma, d_width, d_power) print K K[np.diag_indices_from(K)] += 1e-8 alpha = cho_solve(K,Y) # Calculate prediction kernel Yss = np.dot(Ks.transpose(), alpha) mae = np.mean(np.abs(Ys - Yss)) print "boss" print mae for mol in training: mol.generate_bob() for mol in test: mol.generate_bob()
def train_KRR_qml(X, y, sigma=1e3, llambda=1e-8):
    """Train a kernel-ridge regression model and return its coefficients.

    The kernel of ``X`` with itself is computed by ``compute_kernel_qml``
    using width ``sigma``; ``llambda`` is added to its diagonal before the
    system is handed to ``cho_solve`` together with ``y``.
    """
    kernel = compute_kernel_qml(X, X, sigma=sigma)
    diagonal = np.diag_indices_from(kernel)
    kernel[diagonal] += llambda
    return cho_solve(kernel, y)