def sparsify(self, desc_or_ntotal, n_or_ratio, sparse_param=0):
    """
    Select a subset of sample indexes using the sparsifier named by
    ``self.sparse_mode`` ('fps', 'cur', 'random' or 'sequential').

    Parameters
    ----------
    desc_or_ntotal: np.matrix or int
        Either a design matrix [n_sample, n_desc], or simply the total
        number of samples. 'fps' and 'cur' require the design matrix.
    n_or_ratio: int or float
        Either the number or the fraction of sparsified points.
        A float (or the value 1) is read as a fraction of the total and
        converted to an integer count by rounding.
    sparse_param: int
        additional parameter that may be needed for the specific
        sparsifier used (only forwarded to fps here)

    Returns
    ----------
    sbs: list
        a list of the indexes for the sparsified points

    Raises
    ------
    ValueError
        If n_or_ratio is neither a float nor an integer, if a design matrix
        is required but only a sample count was given, or if the sparse
        mode is unknown.
    """
    import numbers

    # numbers.Integral also accepts numpy integer scalars (e.g. the result
    # of len()/shape arithmetic), which a plain `int` check rejects.
    if isinstance(desc_or_ntotal, numbers.Integral):
        n_total = int(desc_or_ntotal)
        desc = None
        input_desc = False
    else:
        desc = desc_or_ntotal
        n_total = len(desc_or_ntotal)
        input_desc = True

    if n_or_ratio == 1 or isinstance(n_or_ratio, float):
        # A fraction: the product is a float, but every sparsifier below
        # needs an integer count (range() rejects floats outright).
        n_sparse = int(round(n_total * n_or_ratio))
    elif isinstance(n_or_ratio, numbers.Integral):
        n_sparse = int(n_or_ratio)
    else:
        raise ValueError("the sparsification ratio/number should be a float or int.")

    self._check(n_sparse, n_total)

    if self.sparse_mode == 'fps':
        if not input_desc:
            raise ValueError("fps needs design matrix")
        sbs, _ = fps(desc, n_sparse, int(sparse_param))
    elif self.sparse_mode == 'cur':
        if not input_desc:
            raise ValueError("cur needs design matrix")
        import numpy as np
        desc_matrix = np.asmatrix(desc)
        cov = np.dot(desc_matrix, desc_matrix.T)
        sbs, _ = CUR_deterministic(cov, n_sparse)
    elif self.sparse_mode == 'random':
        _, sbs = random_split(n_total, n_sparse / n_total)
    elif self.sparse_mode == 'sequential':
        # materialize so the return type matches the documented list
        sbs = list(range(n_sparse))
    else:
        raise ValueError("sparse mode not right")
    return sbs
def main(fmat, fy, prefix, test_ratio, jitter, n_sparse, sigma):
    """
    Fit a sparse kernel ridge regression (KRR) model on a precomputed
    kernel matrix, print the train/test scores, and plot the fit next to
    a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio. If not positive, the whole data set is
        used both as train and as test set.
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples. A value <= 0 disables
        sparsification; it must be smaller than the number of train samples.
    sigma: noise level in kernel ridge regression

    Returns
    -------
    Fitting outcome & Learning curve (printed / plotted; the figure is
    saved as 'KRR_4_<prefix>.png'). Nothing is returned.

    Raises
    ------
    Exception
        If the kernel matrix or the property file cannot be opened.
    ValueError
        If the number of properties differs from the number of samples, or
        if n_sparse is not smaller than the number of training samples.
    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename') from err
    print("loaded", fmat)

    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except OSError as err:
        raise Exception(
            'property vector file could not be loaded. Please check the filename'
        ) from err

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # sparsification
    if n_sparse >= n_train:
        # Previously this only printed and then crashed with a NameError on
        # the undefined kernel blocks; fail here with the actual reason.
        raise ValueError(
            "the number of representative structure is too large, please select n < "
            + str(n_train))
    if n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # scale of the kernel: std of the targets over the mean diagonal of K_MM
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the predictions for train set
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    print("train score: ", get_score(y_pred, y_train))
    # get the predictions for test set
    y_pred_test = krr.predict(K_TM)
    # compute the CV score for the dataset
    print("test score: ", get_score(y_pred_test, y_test))

    plot_styles.set_nice_font()
    fig = plt.figure(figsize=(8 * 2.1, 8))
    ax = fig.add_subplot(121)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    # learning curve
    # decide train sizes
    lc_points = 10
    train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
    print("Learning curves using train sizes: ", train_sizes)
    lc_stats = 12 * np.ones(lc_points, dtype=int)
    lc = LCSplit(ShuffleSplit,
                 n_repeats=lc_stats,
                 train_sizes=train_sizes,
                 test_size=n_test,
                 random_state=10)

    scores = {size: [] for size in train_sizes}
    for lctrain, _ in lc.split(y_train):
        Ntrain = len(lctrain)
        lc_K_NM = K_NM[lctrain, :]
        lc_y_train = y_train[lctrain]
        # Every sub-sampled model is scored on the same held-out test set
        # (K_TM, y_test), not on the split's own test indices.
        krr.fit(K_MM, lc_K_NM, lc_y_train)
        lc_y_pred = krr.predict(K_TM)
        scores[Ntrain].append(get_score(lc_y_pred, y_test))

    sc_name = 'RMSE'
    Ntrains = []
    avg_scores = []
    avg_scores_error = []
    for Ntrain, score in scores.items():
        values = np.asarray([sc[sc_name] for sc in score], dtype=float)
        Ntrains.append(Ntrain)
        avg_scores.append(np.mean(values))
        # population std (ddof=0) == sqrt(<x^2> - <x>^2), but cannot go
        # negative under the square root from round-off
        avg_scores_error.append(np.std(values))

    ax2 = fig.add_subplot(122)
    ax2.errorbar(Ntrains, avg_scores, yerr=avg_scores_error)
    ax2.set_title('Learning curve')
    ax2.set_xlabel('Number of training samples')
    ax2.set_ylabel('Test {}'.format(sc_name))
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
def main(fxyz, fy, prefix, nkeep, algorithm, fmat, fkde, reweight_lambda):
    r"""
    Select frames from the supplied xyz file (fxyz) using one of the following algorithms:

    1. random: random selection
    2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat
    3. sortmin/sortmax: select the nkeep frames with the smallest/largest value. Need to supply the vector of properties using -fy
    4. cur: CUR decomposition of the covariance of the descriptor matrix
    5. reweight: select according to the re-weighted distribution exp(-f/\lambda),
       where exp(-f) is the precomputed kernel density estimation of the original samples.

    Parameters
    ----------
    fxyz: Path to xyz file.
    fy: Path to the list of properties (N floats) or name of the tags in ase xyz file.
        Use 'none' when no properties are supplied.
    prefix: Filename prefix, default is ASAP
    nkeep: The number of representative samples to select; 0 means all frames.
    algorithm: the algorithm for selecting frames
        ([random], [fps], [cur], [sortmin], [sortmax], [reweight])
    fmat: Location of descriptor or kernel matrix file. Needed if you select [fps] or [cur].
        You can use gen_kmat.py to compute it.
    fkde: Location of the file holding the (log of) kernel density of each
        sample in its second column. Needed if you select [reweight].
    reweight_lambda: select samples according to the re-weighted distribution exp(-f/\lambda),
        where exp(-f) is the kernel density estimation of the original samples.

    Outputs
    -------
    '<prefix>-<algorithm>-n-<nkeep>.index' holds a 0/1 flag per frame, the
    selected properties are written when fy is given, and the selected
    frames are written through the xyz object.

    Raises
    ------
    ValueError
        On mismatching lengths, missing required inputs or an unknown algorithm.
    IOError
        If the kernel density file cannot be parsed.
    """

    # read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    nframes = asapxyz.get_num_frames()

    if nkeep == 0:
        nkeep = nframes

    if fy != 'none':
        try:
            y_all = np.genfromtxt(fy, dtype=float)
        except (OSError, ValueError):
            # not a readable file: treat fy as the name of a tag in the xyz file
            y_all = asapxyz.get_property(fy)
        if len(y_all) != nframes:
            raise ValueError(
                'Length of the vector of properties is not the same as number of samples'
            )

    # label used in the output file names (extended for reweighting below)
    label = algorithm

    if algorithm in ('random', 'RANDOM'):
        sbs = np.random.choice(np.arange(nframes), nkeep, replace=False)

    elif algorithm in ('sortmax', 'sortmin'):
        if fy == 'none':
            raise ValueError(
                'must supply the vector of properties for sorting')
        # ascending order of the property; stable so ties keep frame order
        ascending = np.argsort(np.asarray(y_all), kind='stable')
        if algorithm == 'sortmax':
            # the nkeep largest values, largest first
            sbs = ascending[::-1][:nkeep].tolist()
        else:
            # the nkeep smallest values, smallest first
            sbs = ascending[:nkeep].tolist()

    elif algorithm in ('fps', 'FPS', 'cur', 'CUR'):
        # for both algo we read in the descriptor matrix
        desc, _ = asapxyz.get_descriptors(fmat)
        if os.path.isfile(fmat):
            try:
                desc = np.genfromtxt(fmat, dtype=float)
            except (OSError, ValueError) as err:
                raise ValueError('Cannot load the kernel matrix') from err
        print("shape of the descriptor matrix: ", np.shape(desc),
              "number of descriptors: ", np.shape(desc[0]))

        error_file = prefix + "-" + algorithm + "-n-" + str(nkeep) + '.error'
        if algorithm in ('fps', 'FPS'):
            # FPS
            sbs, dmax_remain = fps(desc, nkeep, 0)
            print("Making farthest point sampling selection")
            np.savetxt(error_file,
                       dmax_remain,
                       fmt='%4.8f',
                       header='the maximum remaining distance in FPS')
        else:
            # CUR decomposition
            desc = np.asmatrix(desc)
            cov = np.dot(desc, desc.T)
            print("Making CUR selection")
            print("shape of the covariance matrix:", np.shape(cov))
            sbs, rcov_error = CUR_deterministic(cov, nkeep)
            np.savetxt(error_file,
                       rcov_error,
                       fmt='%4.8f',
                       header='the remaining error of the covariance matrix')

    elif algorithm == 'reweight':
        if not os.path.isfile(fkde):
            raise ValueError(
                'must suply the (log of) kernel density for each sample')
        try:
            logkde = np.genfromtxt(fkde, dtype=float)[:, 1]
        except (OSError, ValueError, IndexError) as err:
            raise IOError(
                'Cannot load the (log of) kernel density for each sample') from err
        if len(logkde) != nframes:
            raise ValueError(
                'mismatch of number of frames and kernel densities')

        # exp(logkde / lambda) / exp(logkde), evaluated in log space so a
        # large |logkde| cannot overflow to inf / inf
        new_kde = np.exp(logkde / reweight_lambda - logkde)
        # compute the normalization factor so we expect to select n samples in the end
        new_kde *= nkeep / np.sum(new_kde)
        # keep frame i with probability new_kde[i]
        randomchoice = np.random.rand(nframes)
        sbs = np.flatnonzero(randomchoice < new_kde).tolist()
        label = algorithm + "-lambda-" + str(reweight_lambda)

    else:
        # previously an unknown name fell through and crashed with NameError
        raise ValueError('unknown frame selection algorithm: ' + str(algorithm))

    # save
    out_stem = prefix + "-" + label + "-n-" + str(nkeep)
    selection = np.zeros(nframes, dtype=int)
    selection[np.asarray(sbs, dtype=int)] = 1
    np.savetxt(out_stem + '.index', selection, fmt='%d')
    if fy != 'none':
        np.savetxt(out_stem + '-' + fy, np.asarray(y_all)[sbs], fmt='%4.8f')
    asapxyz.write(out_stem, sbs)
def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma, lc_points, lc_repeats):
    """
    Fit a sparse kernel ridge regression (KRR) model on a precomputed
    kernel matrix, dump the train/test errors to json, optionally compute a
    learning curve, and plot the results.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fxyz: Location of the xyz file; used to look up `fy` as a property name
        when `fy` cannot be read as a file. Use 'none' if not available.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio. If not positive, the whole data set is
        used both as train and as test set.
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
        (selected by passing 0). A negative value disables sparsification.
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data
        (selected by passing a negative value).
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------
    Fitting outcome & Learning curve, written to
    'KRR_train_test_errors_4<prefix>.json', 'KRR_learning_curve_4<prefix>.dat'
    and 'KRR_4_<prefix>.png'. Nothing is returned.

    Raises
    ------
    Exception
        If the kernel matrix or the xyz file cannot be opened.
    ValueError
        If the properties cannot be obtained, their number differs from the
        number of samples, or n_sparse is not smaller than the number of
        training samples.
    """
    import json

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename') from err
    print("loaded", fmat)

    # read in the properties to be predicted
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except (OSError, ValueError):
        if fxyz == 'none':
            # nothing left to try; report the real cause instead of the
            # length mismatch an empty property list would trigger below
            raise ValueError(
                'property vector file could not be loaded. Please check the filename'
            )
        try:
            # try to read the xyz file
            asapxyz = ASAPXYZ(fxyz)
            y_all = asapxyz.get_property(fy)
        except OSError as err:
            raise Exception(
                'property vector file could not be loaded. Please check the filename'
            ) from err

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification
    if n_sparse >= n_train:
        # Previously this only printed and then crashed with a NameError on
        # the undefined kernel blocks; fail here with the actual reason.
        raise ValueError(
            "the number of representative structure is too large, please select n < "
            + str(n_train))
    if n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)

    # scale of the kernel: std of the targets over the mean diagonal of K_MM
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # predictions and error metrics for the train and the test set
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM,
                                                              y_train,
                                                              K_TM,
                                                              y_test,
                                                              verbose=True,
                                                              return_pred=True)
    # dump to file
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # the learning curve needs at least two points and a sparse model
    do_learning_curve = lc_points > 1 and n_sparse > 0
    sc_name = 'RMSE'  # MAE, RMSE, SUP, R2, CORR

    # learning curve
    # decide train sizes
    if do_learning_curve:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit,
                     n_repeats=lc_stats,
                     train_sizes=train_sizes,
                     test_size=n_test,
                     random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            # NOTE(review): fit_predict_error presumably refits as well, which
            # would make this explicit fit redundant - confirm before removing.
            krr.fit(K_MM, lc_K_NM, lc_y_train)
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train,
                                                    K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()

    if do_learning_curve:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if do_learning_curve:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0],
                     lc_results[:, 1],
                     yerr=lc_results[:, 2],
                     linestyle='',
                     uplims=True,
                     lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, sparse_mode, n_sparse, power, kpca_d, pc1, pc2, projectatomic, plotatomic, adjusttext):
    """
    Build a sparse kernel PCA map of the samples from their descriptors,
    save the low-dimensional coordinates and plot the projection.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file.
        You can use gen_descriptors.py to compute it.
        The first entry (fmat[0]) is tried as a standalone matrix file.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the (k)PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file.
        It should contain properties for all samples (N floats) used to color the scatterplot
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
        Forced to 'matrix' when no xyz file is given.
    peratom: Whether to output per atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False).
        Not used by this function.
    sparse_mode: Sparsification method used to pick the representative samples ([fps], [cur]).
    n_sparse: number of representative samples, default is 5% of the data
        (at least 10; selected by passing 0). A negative value disables sparsification.
    power: use polynomial kernel function of degree n (only applied to the
        separate projection of the atomic descriptors).
    kpca_d: Number of the principle components to keep
    pc1: Plot the projection along which principle axes
    pc2: Plot the projection along which principle axes
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adjusttext: Whether to adjust the texts (True/False)

    Returns
    -------
    Nothing. Coordinates are written to '<prefix>-kpca-d<kpca_d>.coord' or
    to the xyz file of the same stem; the figure is produced by the plotter.

    Raises
    ------
    ValueError
        If no descriptors could be obtained, the sparsification mode is
        unknown, or atomic descriptors are needed but unavailable.
    """

    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # common stem of all coordinate output files
    foutput = prefix + "-kpca-d" + str(kpca_d)

    # start empty so the sanity check below can report missing descriptors
    desc = []
    desc_atomic = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'

    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except (OSError, ValueError) as err:
            raise ValueError('Cannot load the descriptor matrix from file') from err

    # sanity check
    if len(desc) == 0:
        raise ValueError(
            'Please supply descriptor in a xyz file or a standlone descriptor matrix'
        )
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # sparsification
    n_sample = len(desc)
    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = max(10, n_sample // 20)
    if 0 < n_sparse < n_sample:
        if sparse_mode in ('fps', 'FPS'):
            ifps, _ = fps(desc, n_sparse, 0)
        elif sparse_mode in ('cur', 'CUR'):
            desc_matrix = np.asmatrix(desc)
            cov = np.dot(desc_matrix, desc_matrix.T)
            ifps, _ = CUR_deterministic(cov, n_sparse)
        else:
            raise ValueError('Cannot find the specified sparsification mode')
    else:
        if n_sparse >= n_sample:
            # Also reached by the default of 10 on tiny data sets; fall back
            # to all samples rather than leaving the selection undefined.
            print(
                "the number of representative structure is too large, please select n < ",
                n_sample)
        print("Not using any sparsification")
        ifps = np.arange(n_sample)

    k_spec = {
        'k0': {
            "type": "cosine"
        }
    }  # { 'k1': {"type": "polynomial", "d": power}}
    k_transform = Descriptors_to_Kernels(k_spec)

    kNN = k_transform.compute(desc[ifps])
    kMN = k_transform.compute(desc, desc[ifps])
    print("Shape of the kNN matrix: ", np.shape(kNN),
          ", and shape of the kMN matrix:", np.shape(kMN))

    # main thing
    kpca = KernelPCA(kpca_d)
    kpca.fit(kNN)
    proj = kpca.transform(kMN)

    # Parenthesized exactly as Python already evaluated the original
    # `peratom or plotatomic and not projectatomic`.
    # NOTE(review): `(peratom or plotatomic) and not projectatomic` may have
    # been intended - confirm.
    separate_atomic_proj = peratom or (plotatomic and not projectatomic)
    if separate_atomic_proj:
        if desc_atomic is None or len(desc_atomic) == 0:
            raise ValueError(
                'Atomic descriptors are needed for the per-atom projection; '
                'please supply them in a xyz file')
        # NOTE(review): this uses a polynomial kernel of degree `power`, while
        # the map above was fitted on the cosine kernel - confirm intended.
        kNT = np.power(np.dot(desc_atomic[:], desc[ifps].T), power)
        proj_atomic_all = kpca.transform(kNT)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        # keep a backup of an existing output file instead of overwriting it
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    if plotatomic:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {
                "type": 'scatter',
                'clabel': colorlabel
            },
            "second_p": {
                "type": 'annotate',
                'adtext': adjusttext
            }
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if separate_atomic_proj:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])

    plt.show()