Example #1
0
    def sparsify(self, desc_or_ntotal, n_or_ratio, sparse_param=0):
        """
        Select the indexes of a sparse subset of samples using ``self.sparse_mode``.

        Parameters
        ----------
        desc_or_ntotal: np.matrix or int
                        Either a design matrix [n_sample, n_desc],
                        or simply the total number of samples
        n_or_ratio: int or float
                  Either the number or the fraction of sparsified points.
                  A float (or the value 1) is treated as a fraction of the
                  total number of samples, any other integer as a count.
        sparse_param: int
                additional parameter forwarded (cast to int) to ``fps``;
                the other modes ignore it

        Returns
        ----------
        sbs: list
        the indexes for the sparsified points (for 'fps', 'cur' and 'random'
        this is whatever index container the underlying helper returns)

        Raises
        ------
        ValueError
            if ``n_or_ratio`` is neither a float nor an integer, if 'fps' or
            'cur' is requested without a design matrix, or if
            ``self.sparse_mode`` is not one of the supported modes.
        """
        import numbers

        # numbers.Integral also accepts NumPy integer scalars, not only `int`
        if isinstance(desc_or_ntotal, numbers.Integral):
            desc = None
            n_total = int(desc_or_ntotal)
        else:
            desc = desc_or_ntotal
            n_total = len(desc)

        if n_or_ratio == 1 or isinstance(n_or_ratio, float):
            # a ratio: convert to an integer count, otherwise range()/fps()
            # would receive a float; round() guards against e.g. 28.999999
            n_sparse = int(round(n_total * n_or_ratio))
        elif isinstance(n_or_ratio, numbers.Integral):
            n_sparse = int(n_or_ratio)
        else:
            raise ValueError("the sparsification ratio/number should be a float or int.")

        self._check(n_sparse, n_total)

        mode = self.sparse_mode
        if mode == 'fps':
            if desc is None:
                raise ValueError("fps needs design matrix")
            sbs, _ = fps(desc, n_sparse, int(sparse_param))
        elif mode == 'cur':
            if desc is None:
                raise ValueError("cur needs design matrix")
            import numpy as np
            # CUR is applied to the sample-sample covariance matrix
            desc_mat = np.asmatrix(desc)
            cov = np.dot(desc_mat, desc_mat.T)
            sbs, _ = CUR_deterministic(cov, n_sparse)
        elif mode == 'random':
            _, sbs = random_split(n_total, n_sparse / n_total)
        elif mode == 'sequential':
            # simply the first n_sparse samples
            sbs = list(range(n_sparse))
        else:
            raise ValueError("sparse mode not right")

        return sbs
Example #2
0
def main(fmat, fy, prefix, test_ratio, jitter, n_sparse, sigma):
    """
    Fit a sparse kernel ridge regression on a precomputed kernel matrix,
    print train/test scores and plot the fit together with a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio; if not positive, the full data set is used
                both as train and as test set
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples (selected with fps);
              if not positive, no sparsification is used
    sigma: noise level in kernel ridge regression

    Returns
    -------

    Fitting outcome & Learning curve.
    The figure is shown and saved as 'KRR_4_<prefix>.png'.

    Raises
    ------
    Exception
        if the kernel matrix or the property file cannot be opened.
    ValueError
        if the number of properties differs from the number of samples,
        or if n_sparse is not smaller than the number of training samples.

    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename'
        ) from err
    print("loaded", fmat)
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except OSError as err:
        raise Exception(
            'property vector file could not be loaded. Please check the filename'
        ) from err
    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # sparsification
    if n_sparse >= n_train:
        # without this the code below would fail with a NameError on K_MM
        raise ValueError(
            "the number of representative structure is too large, "
            "please select n < {}".format(n_train))
    if n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        # restrict the kernel to the selected representative columns/rows
        K_NM = K_train[:, ifps]
        K_MM = K_NM[ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the predictions for train set
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    print("train score: ", get_score(y_pred, y_train))
    # get the predictions for test set
    y_pred_test = krr.predict(K_TM)
    # compute the CV score for the dataset
    print("test score: ", get_score(y_pred_test, y_test))

    plot_styles.set_nice_font()
    fig = plt.figure(figsize=(8 * 2.1, 8))
    ax = fig.add_subplot(121)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    # learning curve
    # decide train sizes
    lc_points = 10
    train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
    print("Learning curves using train sizes: ", train_sizes)
    # 12 repeats for every point of the learning curve
    lc_stats = 12 * np.ones(lc_points, dtype=int)
    lc = LCSplit(ShuffleSplit,
                 n_repeats=lc_stats,
                 train_sizes=train_sizes,
                 test_size=n_test,
                 random_state=10)

    scores = {size: [] for size in train_sizes}
    # the test indexes produced by the splitter are ignored: every
    # sub-sampled model is scored on the same held-out test set
    for lctrain, _ in lc.split(y_train):
        Ntrain = len(lctrain)
        krr.fit(K_MM, K_NM[lctrain, :], y_train[lctrain])
        lc_y_pred = krr.predict(K_TM)
        scores[Ntrain].append(get_score(lc_y_pred, y_test))

    sc_name = 'RMSE'
    Ntrains = []
    avg_scores = []
    avg_scores_error = []
    for Ntrain, score in scores.items():
        values = np.asarray([sc[sc_name] for sc in score], dtype=float)
        Ntrains.append(Ntrain)
        avg_scores.append(values.mean())
        # population standard deviation; unlike sqrt(<x^2> - <x>^2) it can
        # not turn into NaN through a slightly negative rounding error
        avg_scores_error.append(values.std())

    ax2 = fig.add_subplot(122)
    ax2.errorbar(Ntrains, avg_scores, yerr=avg_scores_error)
    ax2.set_title('Learning curve')
    ax2.set_xlabel('Number of training samples')
    ax2.set_ylabel('Test {}'.format(sc_name))
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
Example #3
0
def main(fxyz, fy, prefix, nkeep, algorithm, fmat, fkde, reweight_lambda):
    r"""
    Select frames from the supplied xyz file (fxyz) using one of the following algorithms:

    1. random: random selection
    2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat
    3. sortmin/sortmax: select the frames with the largest/smallest value. Need to supply the vector of properties using
       -fy
    4. CUR decomposition
    5. Reweight according to the re-weighted distribution exp(-f/\lambda),
       where exp(-f) is the precomputed kernel density estimation of the original samples.

    Parameters
    ----------
    fxyz: Path to xyz file.
    fy: Path to the list of properties (N floats) or name of the tags in ase xyz file
    prefix: Filename prefix, default is ASAP
    nkeep: The number of representative samples to select; 0 means all frames
    algorithm: 'the algorithm for selecting frames ([random], [fps], [cur], [sortmin], [sortmax], [reweight])')
    fmat: Location of descriptor or kernel matrix file. Needed if you select [fps].
    You can use gen_kmat.py to compute it.
    fkde: Location of the file with the (log of) kernel density of each sample
          (second column is used). Needed if you select [reweight].
    reweight_lambda: select samples according to the re-weighted distribution exp(-f/\lambda),
              where exp(-f) is the kernel density estimation of the original samples.

    Output files (all named '<prefix>-<algorithm>-n-<nkeep>...'): a 0/1
    selection mask ('.index'), the selected properties when fy is given,
    the selected frames, and for fps/cur an '.error' file.

    Raises
    ------
    ValueError
        on inconsistent input sizes, missing required input, or an
        unknown algorithm.
    IOError
        if the kernel density file cannot be parsed.
    """

    # read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    nframes = asapxyz.get_num_frames()

    if nkeep == 0:
        nkeep = nframes

    if fy != 'none':
        try:
            y_all = np.genfromtxt(fy, dtype=float)
        except Exception:
            # fy is not a readable file: treat it as a tag name in the xyz file
            y_all = asapxyz.get_property(fy)
        if len(y_all) != nframes:
            raise ValueError(
                'Length of the vector of properties is not the same as number of samples'
            )

    if algorithm in ('random', 'RANDOM'):
        sbs = np.random.choice(np.arange(nframes), nkeep, replace=False)

    elif algorithm in ('sortmax', 'sortmin'):
        if fy == 'none':
            raise ValueError(
                'must supply the vector of properties for sorting')

        # frame indexes ordered by ascending property value
        ascending = [x for _, x in sorted(zip(y_all, np.arange(nframes)))]
        # NOTE(review): the direction follows the existing code and docstring
        # (sortmax -> smallest values, sortmin -> largest values); confirm
        # that this naming is intended.
        if algorithm == 'sortmax':
            sbs = ascending[:nkeep]
        else:
            # take exactly nkeep frames from the upper end
            # (the former `[nkeep:]` returned nframes - nkeep frames)
            sbs = ascending[-nkeep:]

    elif algorithm in ('fps', 'FPS', 'cur', 'CUR'):
        # for both algo we read in the descriptor matrix
        desc, _ = asapxyz.get_descriptors(fmat)
        # a standalone matrix file, when present, takes precedence
        if os.path.isfile(fmat):
            try:
                desc = np.genfromtxt(fmat, dtype=float)
            except Exception as err:
                raise ValueError('Cannot load the kernel matrix') from err
        print("shape of the descriptor matrix: ", np.shape(desc),
              "number of descriptors: ", np.shape(desc[0]))

        error_file = prefix + "-" + algorithm + "-n-" + str(nkeep) + '.error'
        if algorithm in ('fps', 'FPS'):
            # FPS
            sbs, dmax_remain = fps(desc, nkeep, 0)
            print("Making farthest point sampling selection")
            np.savetxt(error_file,
                       dmax_remain,
                       fmt='%4.8f',
                       header='the maximum remaining distance in FPS')
        else:
            # CUR decomposition
            desc = np.asmatrix(desc)
            cov = np.dot(desc, desc.T)
            print("Making CUR selection")
            print("shape of the covariance matrix:", np.shape(cov))
            sbs, rcov_error = CUR_deterministic(cov, nkeep)
            np.savetxt(error_file,
                       rcov_error,
                       fmt='%4.8f',
                       header='the remaining error of the covariance matrix')

    elif algorithm == 'reweight':
        if not os.path.isfile(fkde):
            raise ValueError(
                'must suply the (log of) kernel density for each sample')
        try:
            logkde = np.genfromtxt(fkde, dtype=float)[:, 1]
        except Exception as err:
            raise IOError(
                'Cannot load the (log of) kernel density for each sample'
            ) from err
        if len(logkde) != nframes:
            raise ValueError(
                'mismatch of number of frames and kernel densities')

        # acceptance weight of each frame: rho^(1/lambda) / rho
        new_kde = np.exp(logkde / reweight_lambda) / np.exp(logkde)
        # compute the normalization factor so we expect to select n samples in the end
        new_kde *= nkeep / np.sum(new_kde)
        # keep frame i with probability new_kde[i]
        randomchoice = np.random.rand(nframes)
        sbs = [i for i in range(nframes) if randomchoice[i] < new_kde[i]]
        algorithm = algorithm + "-lambda-" + str(reweight_lambda)

    else:
        # previously this fell through and failed with a NameError on `sbs`
        raise ValueError(
            'unknown frame selection algorithm: ' + str(algorithm))

    # save
    out_root = prefix + "-" + algorithm + "-n-" + str(nkeep)
    # 0/1 mask over all frames marking the selected ones
    selection = np.zeros(nframes, dtype=int)
    for i in sbs:
        selection[i] = 1
    np.savetxt(out_root + '.index', selection, fmt='%d')
    if fy != 'none':
        np.savetxt(out_root + '-' + fy,
                   np.asarray(y_all)[sbs],
                   fmt='%4.8f')
    asapxyz.write(out_root, sbs)
Example #4
0
def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma,
         lc_points, lc_repeats):
    """
    Fit a sparse kernel ridge regression on a precomputed kernel matrix,
    dump the train/test errors, and optionally compute a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fxyz: Location of the xyz file; used to read the property when fy is not
          a readable file ('none' to disable)
    fy: Location of property list (1D-array of floats), or the name of the
        property in the xyz file
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio; if not positive, the full data set is used
                both as train and as test set
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data.
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------

    Fitting outcome & Learning curve.
    Writes 'KRR_train_test_errors_4<prefix>.json', 'KRR_4_<prefix>.png' and,
    when a learning curve is computed, 'KRR_learning_curve_4<prefix>.dat'.

    Raises
    ------
    Exception
        if the kernel matrix or the xyz file cannot be opened.
    ValueError
        if the number of properties differs from the number of samples,
        or if n_sparse is not smaller than the number of training samples.

    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename'
        ) from err
    print("loaded", fmat)

    # read in the properties to be predicted
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except Exception:
        # fy is not a readable file; stays empty unless the xyz file has it
        y_all = []
        try:
            # try to read the xyz file
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError as err:
            raise Exception(
                'property vector file could not be loaded. Please check the filename'
            ) from err

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification
    if n_sparse >= n_train:
        # without this the code below would fail with a NameError on K_MM
        raise ValueError(
            "the number of representative structure is too large, "
            "please select n < {}".format(n_train))
    if n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        # restrict the kernel to the selected representative columns/rows
        K_NM = K_train[:, ifps]
        K_MM = K_NM[ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)

    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # predictions and errors for both the train and the test set
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM,
                                                              y_train,
                                                              K_TM,
                                                              y_test,
                                                              verbose=True,
                                                              return_pred=True)
    # dump to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # the learning curve needs more than one point and a sparsified kernel
    do_lc = lc_points > 1 and n_sparse > 0

    # learning curve
    # decide train sizes
    if do_lc:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit,
                     n_repeats=lc_stats,
                     train_sizes=train_sizes,
                     test_size=n_test,
                     random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # NOTE(review): fit_predict_error presumably fits as well, which
            # would make this explicit fit redundant -- confirm before removing
            krr.fit(K_MM, lc_K_NM, lc_y_train)
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train,
                                                    K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  #     MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()

    # two panels when a learning curve is available, one otherwise
    if do_lc:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if do_lc:
        ax2 = fig.add_subplot(122)
        # columns of lc_results are used as: x values, y values, y error bars
        ax2.errorbar(lc_results[:, 0],
                     lc_results[:, 1],
                     yerr=lc_results[:, 2],
                     linestyle='',
                     uplims=True,
                     lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom,
         keepraw, sparse_mode, n_sparse, power, kpca_d, pc1, pc2,
         projectatomic, plotatomic, adjusttext):
    """
    Build a sparse kernel PCA map of the descriptors, save the low
    dimensional coordinates and plot two of the principal components.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
          (indexed as fmat[0] when checked as a standalone file)
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the (k)PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot'
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False).
             Not used inside this function.
    sparse_mode: Sparsification method for the representative samples ([fps], [cur])
    n_sparse: number of representative samples, default is 5% of the data
              (at least 10); a negative value disables sparsification
    power: use polynomial kernel function of degree n.
           Not used at present: the kernel is fixed to 'cosine' below.
    kpca_d: Number of the principle components to keep
    pc1: Plot the projection along which principle axes
    pc2: Plot the projection along which principle axes
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adjusttext: Whether to adjust the texts (True/False)

    Returns
    -------

    Raises
    ------
    ValueError
        if no descriptors are available, the descriptor file cannot be
        parsed, or the sparsification mode is unknown.
    """

    # common root of the output file names
    foutput = prefix + "-kpca-d" + str(kpca_d)

    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # start empty so that the sanity check below can report missing input
    desc, desc_atomic = [], []
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except Exception as err:
            raise ValueError(
                'Cannot load the descriptor matrix from file') from err
    # sanity check
    if len(desc) == 0:
        raise ValueError(
            'Please supply descriptor in a xyz file or a standlone descriptor matrix'
        )

    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # sparsification
    n_sample = len(desc)
    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = max(10, n_sample // 20)
    if n_sparse >= n_sample:
        # the default of at least 10 samples lands here for small data
        # sets, so fall back to all samples instead of failing later
        print(
            "the number of representative structure is too large, please select n < ",
            n_sample)
        print("Not using any sparsification")
        ifps = np.arange(n_sample)
    elif n_sparse > 0:
        if sparse_mode in ('fps', 'FPS'):
            ifps, _ = fps(desc, n_sparse, 0)
        elif sparse_mode in ('cur', 'CUR'):
            desc_mat = np.asmatrix(desc)
            cov = np.dot(desc_mat, desc_mat.T)
            ifps, _ = CUR_deterministic(cov, n_sparse)
        else:
            raise ValueError('Cannot find the specified sparsification mode')
    else:
        print("Not using any sparsification")
        ifps = np.arange(n_sample)

    k_spec = {
        'k0': {
            "type": "cosine"
        }
    }  #{ 'k1': {"type": "polynomial", "d": power}}
    k_transform = Descriptors_to_Kernels(k_spec)

    # kernel between the representative samples, and between all samples
    # and the representative ones
    kNN = k_transform.compute(desc[ifps])
    kMN = k_transform.compute(desc, desc[ifps])
    print("Shape of the kNN matrix: ", np.shape(kNN),
          ", and shape of the kMN matrix:", np.shape(kMN))
    # main thing
    kpca = KernelPCA(kpca_d)
    kpca.fit(kNN)
    proj = kpca.transform(kMN)

    # NOTE(review): `and` binds tighter than `or`, so this is
    # peratom or (plotatomic and not projectatomic); the parentheses only
    # make the existing behavior explicit -- confirm it is the intended one.
    project_atoms = peratom or (plotatomic and not projectatomic)
    if project_atoms:
        # use the same kernel the KPCA was fitted with
        kNT = k_transform.compute(desc_atomic, desc[ifps])
        proj_atomic_all = kpca.transform(kNT)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        # keep a backup of an already existing output file
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    outfile = 'KPCA_4_' + prefix + '-c-' + fcolor
    if plotatomic:
        outfile += '-plotatomic.png'
    else:
        outfile += '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {
                "type": 'scatter',
                'clabel': colorlabel
            },
            "second_p": {
                "type": 'annotate',
                'adtext': adjusttext
            }
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    # samples are passed in reversed order
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if project_atoms:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])
    plt.show()