def main(): """ Test if Ridge regression is working. Parameters ---------- fxyz: string giving location of xyz file prefix: string giving the filename prefix """ fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz') fmat = ['SOAP-n4-l3-c1.9-g0.23'] fy = 'dft_formation_energy_per_atom_in_eV' prefix = "test-skrr" test_ratio = 0.05 lc_points = 8 lc_repeats = 8 # try to read the xyz file asapxyz = ASAPXYZ(fxyz) desc, _ = asapxyz.get_descriptors(fmat, False) y_all = asapxyz.get_property(fy) # print(desc) dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio) # kernel, jitter, delta, sigma, sparse_mode="fps", n_sparse=None k_spec = { 'k0': { "type": "linear" } } # { 'k1': {"type": "polynomial", "d": power}} # if sigma is not set... sigma = 0.001 * np.std(y_all) krr = KRRSparse(0., None, sigma) skrr = SPARSE_KRR_Wrapper(k_spec, krr, sparse_mode="fps", n_sparse=-1) # fit the model dm.compute_fit(skrr, 'skrr', store_results=True, plot=True) # learning curve if lc_points > 1: dm.compute_learning_curve(skrr, 'ridge_regression', lc_points=lc_points, lc_repeats=lc_repeats, randomseed=42, verbose=False) dm.save_state(prefix) plt.show()
def kernelridge(ctx, sigma, kernel, kernel_parameter, sparse_mode, n_sparse):
    """Kernel Ridge Regression (with sparsification)"""
    from asaplib.fit import SPARSE_KRR_Wrapper, KRRSparse

    k_spec = {"first_kernel": {"type": kernel, "d": kernel_parameter}}
    krr = KRRSparse(0., None, sigma)
    skrr = SPARSE_KRR_Wrapper(k_spec, krr, sparse_mode=sparse_mode, n_sparse=n_sparse)

    # fit the model
    ctx.obj['dm'].compute_fit(skrr, 'skrr', store_results=True, plot=True)
    # learning curve: fit_options["learning_curve"] (number of points, <= 1
    # disables it) and fit_options["lc_points"] (sub-sampling repeats) are
    # passed as lc_points and lc_repeats
    if ctx.obj['fit_options']["learning_curve"] > 1:
        ctx.obj['dm'].compute_learning_curve(skrr, 'skrr',
                                             ctx.obj['fit_options']["learning_curve"],
                                             ctx.obj['fit_options']["lc_points"],
                                             randomseed=42, verbose=False)
    ctx.obj['dm'].save_state(ctx.obj['fit_options']['prefix'])

    from matplotlib import pyplot as plt
    plt.show()
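# `kernelridge` is a click-style subcommand that reads everything from
# ctx.obj. A hypothetical stand-alone driver, to show the wiring it expects
# (the fit_options keys are the ones the function reads above; the
# Design_Matrix setup mirrors the test at the top of this file set):
def run_kernelridge_standalone(desc, y_all):
    import click
    from asaplib.data import Design_Matrix

    ctx = click.Context(click.Command('kernelridge'))
    ctx.obj = {
        'dm': Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=0.05),
        'fit_options': {
            'learning_curve': 8,  # points on the learning curve, <= 1 disables it
            'lc_points': 8,       # sub-sampling repeats per point
            'prefix': 'my-skrr',
        },
    }
    kernelridge(ctx, sigma=0.001, kernel='linear', kernel_parameter=None,
                sparse_mode='fps', n_sparse=100)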
# (import paths below assume asaplib's standard module layout)
import numpy as np
from matplotlib import pyplot as plt

from asaplib.compressor import fps, kernel_random_split
from asaplib.compressor import exponential_split, LCSplit, ShuffleSplit
from asaplib.fit import KRRSparse, get_score
from asaplib.plot import plot_styles


def main(fmat, fy, prefix, test_ratio, jitter, n_sparse, sigma):
    """
    Parameters
    ----------
    fmat: location of the kernel matrix file
    fy: location of the property list (1D array of floats)
    prefix: filename prefix for the learning-curve figure
    test_ratio: fraction of samples held out as the test set
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples
    sigma: noise level in the kernel ridge regression

    Returns
    -------
    Fitting outcome & learning curve.
    """
    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError:
        raise Exception('fmat file could not be loaded. Please check the filename')
    print("loaded", fmat)
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except OSError:
        raise Exception('property vector file could not be loaded. Please check the filename')
    if len(y_all) != len(K_all):
        raise ValueError('Length of the property vector does not match the number of samples')
    n_sample = len(K_all)

    # train/test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # sparsification: pick n_sparse representative samples by
    # farthest-point sampling (FPS)
    if n_sparse >= n_train:
        print("the number of representative structures is too large, please select n <", n_train)
    elif n_sparse > 0:
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # delta scales the kernel so its mean diagonal matches the spread of y
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # predictions and score for the train set
    y_pred = krr.predict(K_NM)
    print("train score: ", get_score(y_pred, y_train))
    # predictions and score for the test set
    y_pred_test = krr.predict(K_TM)
    print("test score: ", get_score(y_pred_test, y_test))

    plot_styles.set_nice_font()
    fig = plt.figure(figsize=(8 * 2.1, 8))
    ax = fig.add_subplot(121)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    # learning curve: decide the train sizes
    lc_points = 10
    train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
    print("Learning curves using train sizes: ", train_sizes)
    # 12 sub-sampling repeats per learning-curve point
    lc_stats = 12 * np.ones(lc_points, dtype=int)
    lc = LCSplit(ShuffleSplit, n_repeats=lc_stats, train_sizes=train_sizes,
                 test_size=n_test, random_state=10)

    scores = {size: [] for size in train_sizes}
    for lctrain, lctest in lc.split(y_train):
        Ntrain = len(lctrain)
        lc_K_NM = K_NM[lctrain, :]
        lc_y_train = y_train[lctrain]
        # we always score against the same hold-out test set; one could
        # instead use `lc_K_test = K_NM[lctest, :]; lc_y_test = y_train[lctest]`
        lc_K_test = K_TM
        lc_y_test = y_test
        krr.fit(K_MM, lc_K_NM, lc_y_train)
        lc_y_pred = krr.predict(lc_K_test)
        scores[Ntrain].append(get_score(lc_y_pred, lc_y_test))

    # average the chosen score over the repeats at each train size
    sc_name = 'RMSE'
    Ntrains = []
    avg_scores = []
    avg_scores_error = []
    for Ntrain, score in scores.items():
        avg = 0.
        var = 0.
        for sc in score:
            avg += sc[sc_name]
            var += sc[sc_name] ** 2.
        avg /= len(score)
        var /= len(score)
        var -= avg ** 2.
        avg_scores.append(avg)
        avg_scores_error.append(np.sqrt(var))
        Ntrains.append(Ntrain)

    ax2 = fig.add_subplot(122)
    ax2.errorbar(Ntrains, avg_scores, yerr=avg_scores_error)
    ax2.set_title('Learning curve')
    ax2.set_xlabel('Number of training samples')
    ax2.set_ylabel('Test {}'.format(sc_name))
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
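# The learning-curve train sizes grow geometrically between n_sparse and
# n_train - n_test. A sketch of the spacing `exponential_split` presumably
# produces (asaplib ships the real implementation):
import numpy as np

def exponential_split_sketch(n_min, n_max, n_points):
    # n_points integers spaced evenly on a log scale between n_min and n_max
    return np.unique(np.geomspace(n_min, n_max, n_points).astype(int))

# e.g. exponential_split_sketch(10, 1000, 5) -> array([10, 31, 100, 316, 1000])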
# (import paths below assume asaplib's standard module layout)
import numpy as np
from matplotlib import pyplot as plt

from asaplib.data import ASAPXYZ
from asaplib.compressor import fps, kernel_random_split
from asaplib.compressor import exponential_split, LCSplit, ShuffleSplit
from asaplib.fit import KRRSparse, LC_SCOREBOARD
from asaplib.plot import plot_styles


def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma,
         lc_points, lc_repeats):
    """
    Parameters
    ----------
    fmat: location of the kernel matrix file
    fxyz: location of the xyz file (used to read the property if fy is not a file)
    fy: location of the property list (1D array of floats), or the name of a
        property stored in the xyz file
    prefix: filename prefix for the learning-curve figure
    test_ratio: fraction of samples held out as the test set
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
    sigma: noise level in the kernel ridge regression, default is 0.1% of the
        standard deviation of the data
    lc_points: number of points on the learning curve
    lc_repeats: number of sub-sampling repeats when computing the learning curve

    Returns
    -------
    Fitting outcome & learning curve.
    """
    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError:
        raise Exception('fmat file could not be loaded. Please check the filename')
    print("loaded", fmat)

    # read in the properties to be predicted
    y_all = []
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except (OSError, ValueError):
        # fy is not a plain-text file; try to read the property from the xyz file
        try:
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError:
            raise Exception('property vector file could not be loaded. Please check the filename')
    if len(y_all) != len(K_all):
        raise ValueError('Length of the property vector does not match the number of samples')
    n_sample = len(K_all)

    # train/test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set the default value of n_sparse to 5% of the training data
    if n_sparse == 0:
        n_sparse = n_train // 20

    # sparsification: pick n_sparse representative samples by
    # farthest-point sampling (FPS)
    if n_sparse >= n_train:
        print("the number of representative structures is too large, please select n <", n_train)
    elif n_sparse > 0:
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set, default to 0.1% of the standard deviation of y
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)
    # delta scales the kernel so its mean diagonal matches the spread of y
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the train/test predictions and errors
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(
        K_NM, y_train, K_TM, y_test, verbose=True, return_pred=True)
    # dump the errors to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # learning curve
    if lc_points > 1 and n_sparse > 0:
        # decide the train sizes
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit, n_repeats=lc_stats, train_sizes=train_sizes,
                     test_size=n_test, random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set; otherwise, one can do
            # `lc_K_test = K_NM[lctest, :]; lc_y_test = y_train[lctest]`
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train, K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  # one of MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output the learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()
    if lc_points > 1 and n_sparse > 0:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if lc_points > 1 and n_sparse > 0:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0], lc_results[:, 1], yerr=lc_results[:, 2],
                     linestyle='', uplims=True, lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
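# A minimal command-line entry point for the script above; the flag names are
# illustrative, the original script may expose different ones:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-fmat', type=str, required=True, help='kernel matrix file')
    parser.add_argument('-fxyz', type=str, default='none', help='xyz file holding the property')
    parser.add_argument('-fy', type=str, required=True, help='property file, or property name in the xyz file')
    parser.add_argument('--prefix', type=str, default='ASAP', help='filename prefix for the outputs')
    parser.add_argument('--test', type=float, default=0.05, help='test ratio')
    parser.add_argument('--jitter', type=float, default=1e-10, help='jitter level')
    parser.add_argument('--n', type=int, default=0, help='number of sparse points; 0 selects 5%% of the train set')
    parser.add_argument('--sigma', type=float, default=-1, help='noise level; negative selects 0.1%% of std(y)')
    parser.add_argument('--lcpoints', type=int, default=10, help='number of learning-curve points')
    parser.add_argument('--lcrepeats', type=int, default=8, help='sub-sampling repeats per point')
    args = parser.parse_args()
    main(args.fmat, args.fxyz, args.fy, args.prefix, args.test, args.jitter,
         args.n, args.sigma, args.lcpoints, args.lcrepeats)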