Python fps Examples

Programming Language: Python

Namespace/Package Name: asaplib.compressor

Method/Function: fps

Examples at hotexamples.com: 5

Python fps - 5 examples found. These are the top rated real world Python examples of asaplib.compressor.fps extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: sparsifier.py Project: mastricker/ASAP

    def sparsify(self, desc_or_ntotal, n_or_ratio, sparse_param=0):
        """
        Select a subset of sample indexes using the configured sparse mode.

        Parameters
        ----------
        desc_or_ntotal: np.matrix or int
                        Either a design matrix [n_sample, n_desc],
                        or simply the total number of samples
        n_or_ratio: int or float
                  Either the number or the fraction of sparsified points.
                  A float, or the value 1, is interpreted as a fraction of
                  the total number of samples.
        sparse_param: int
                additional parameter forwarded to `fps` (cast to int)

        Returns
        ----------
        sbs: list
        the indexes of the sparsified points. The 'sequential' mode returns
        a list; the other modes return whatever the selection helper returns.

        Raises
        ----------
        ValueError
            if `n_or_ratio` is neither a float nor an int, if 'fps' or 'cur'
            is requested without a design matrix, or if the sparse mode is
            not one of 'fps', 'cur', 'random', 'sequential'.
        """
        input_desc = not isinstance(desc_or_ntotal, int)
        if input_desc:
            desc = desc_or_ntotal
            n_total = len(desc_or_ntotal)
        else:
            n_total = desc_or_ntotal

        if n_or_ratio == 1 or isinstance(n_or_ratio, float):
            # A ratio yields a float product; the number of selected points
            # must be an integer for range() and for the selection helpers.
            n_sparse = int(round(n_total * n_or_ratio))
        elif isinstance(n_or_ratio, int):
            n_sparse = n_or_ratio
        else:
            raise ValueError("the sparsification ratio/number should be a float or int.")

        self._check(n_sparse, n_total)

        mode = self.sparse_mode
        if mode == 'fps':
            if not input_desc:
                raise ValueError("fps needs design matrix")
            sbs, _ = fps(desc, n_sparse, int(sparse_param))
        elif mode == 'cur':
            if not input_desc:
                raise ValueError("cur needs design matrix")
            import numpy as np
            desc_matrix = np.asmatrix(desc)
            cov = np.dot(desc_matrix, desc_matrix.T)
            sbs, _ = CUR_deterministic(cov, n_sparse)
        elif mode == 'random':
            _, sbs = random_split(n_total, n_sparse / n_total)
        elif mode == 'sequential':
            # simply keep the first n_sparse samples
            sbs = list(range(n_sparse))
        else:
            raise ValueError("sparse mode not right")

        return sbs

Example #2

Show file

File: krr.py Project: FelixFaber/ASAP

def main(fmat, fy, prefix, test_ratio, jitter, n_sparse, sigma):
    """
    Fit a sparse kernel ridge regression (KRR) model and plot a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio. If it is not positive, the whole data set
                is used both as the train set and as the test set.
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples. Must be smaller than the
              number of training samples; if it is not positive, no
              sparsification is applied.
    sigma: noise level in kernel ridge regression

    Returns
    -------

    Nothing. Prints the fitting outcome, shows the parity plot and the
    learning curve, and saves the figure to 'KRR_4_<prefix>.png'.

    Raises
    ------
    Exception: if the kernel matrix or the property file cannot be opened.
    ValueError: if the number of properties differs from the number of
                samples, or if n_sparse is not smaller than the number of
                training samples.
    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename'
        ) from err
    print("loaded", fmat)
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except OSError as err:
        raise Exception(
            'property vector file could not be loaded. Please check the filename'
        ) from err
    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # sparsification
    if n_sparse >= n_train:
        # Without this guard the kernel blocks below would stay undefined
        # and the function would fail later with an unrelated NameError.
        raise ValueError(
            "the number of representative structure is too large, "
            "please select n < {}".format(n_train))
    elif n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the predictions for train set
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    print("train score: ", get_score(y_pred, y_train))
    # get the predictions for test set
    y_pred_test = krr.predict(K_TM)
    # compute the CV score for the dataset
    print("test score: ", get_score(y_pred_test, y_test))

    plot_styles.set_nice_font()
    fig = plt.figure(figsize=(8 * 2.1, 8))
    ax = fig.add_subplot(121)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    # learning curve
    # decide train sizes
    lc_points = 10
    train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
    print("Learning curves using train sizes: ", train_sizes)
    lc_stats = 12 * np.ones(lc_points, dtype=int)
    lc = LCSplit(ShuffleSplit,
                 n_repeats=lc_stats,
                 train_sizes=train_sizes,
                 test_size=n_test,
                 random_state=10)

    scores = {size: [] for size in train_sizes}
    for lctrain, _ in lc.split(y_train):
        Ntrain = len(lctrain)
        lc_K_NM = K_NM[lctrain, :]
        lc_y_train = y_train[lctrain]
        # the learning curve is always scored on the same held-out test set
        krr.fit(K_MM, lc_K_NM, lc_y_train)
        lc_y_pred = krr.predict(K_TM)
        scores[Ntrain].append(get_score(lc_y_pred, y_test))

    sc_name = 'RMSE'
    Ntrains = []
    avg_scores = []
    avg_scores_error = []
    for Ntrain, score in scores.items():
        values = [sc[sc_name] for sc in score]
        avg = sum(values) / len(values)
        var = sum(v**2. for v in values) / len(values) - avg**2.
        avg_scores.append(avg)
        # round-off can push the variance slightly below zero; clamp it so
        # the error bar does not become NaN
        avg_scores_error.append(np.sqrt(max(var, 0.)))
        Ntrains.append(Ntrain)

    ax2 = fig.add_subplot(122)
    ax2.errorbar(Ntrains, avg_scores, yerr=avg_scores_error)
    ax2.set_title('Learning curve')
    ax2.set_xlabel('Number of training samples')
    ax2.set_ylabel('Test {}'.format(sc_name))
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')

Example #3

Show file

def main(fxyz, fy, prefix, nkeep, algorithm, fmat, fkde, reweight_lambda):
    r"""
    Select frames from the supplied xyz file (fxyz) using one of the following algorithms:

    1. random: random selection
    2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat
    3. sortmin/sortmax: select the frames with the smallest/largest value. Need to supply the vector of properties using
       -fy
    4. CUR decomposition
    5. Reweight according to the re-weighted distribution exp(-f/\lambda),
       where exp(-f) is the precomputed kernel density estimation of the original samples.

    The selection is written to '<prefix>-<algorithm>-n-<nkeep>.index'
    (one 0/1 flag per frame), the selected properties (if fy is given) to a
    file with the same stem, and the selected frames through `asapxyz.write`.

    Parameters
    ----------
    fxyz: Path to xyz file.
    fy: Path to the list of properties (N floats) or name of the tags in ase xyz file
    prefix: Filename prefix, default is ASAP
    nkeep: The number of representative samples to select. 0 means all frames.
    algorithm: 'the algorithm for selecting frames ([random], [fps], [cur], [sortmin], [sortmax], [reweight])')
    fmat: Location of descriptor or kernel matrix file. Needed if you select [fps].
    You can use gen_kmat.py to compute it.
    fkde: Location of the file whose second column holds the (log of) kernel
          density for each sample. Needed if you select [reweight].
    reweight_lambda: select samples according to the re-weighted distribution exp(-f/\lambda),
              where exp(-f) is the kernel density estimation of the original samples.

    Raises
    ------
    ValueError: on inconsistent input sizes, missing required inputs, or an
                unknown algorithm name.
    IOError: if the kernel density file exists but cannot be parsed.
    """

    # read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    nframes = asapxyz.get_num_frames()

    if nkeep == 0:
        nkeep = nframes

    if fy != 'none':
        try:
            y_all = np.genfromtxt(fy, dtype=float)
        except Exception:
            # fy is not a readable file: treat it as a property tag of the xyz file
            y_all = asapxyz.get_property(fy)
        if len(y_all) != nframes:
            raise ValueError(
                'Length of the vector of properties is not the same as number of samples'
            )

    # comparisons are case-insensitive; the name as given is kept for file names
    algo = algorithm.lower()

    if algo == 'random':
        idx = np.asarray(range(nframes))
        sbs = np.random.choice(idx, nkeep, replace=False)

    elif algo in ('sortmax', 'sortmin'):
        if fy == 'none':
            raise ValueError(
                'must supply the vector of properties for sorting')

        # ascending order of the property values
        order = np.argsort(y_all, kind='stable')
        if algo == 'sortmax':
            # largest values first
            sbs = list(order[::-1][:nkeep])
        else:
            sbs = list(order[:nkeep])

    elif algo in ('fps', 'cur'):
        # for both algo we read in the descriptor matrix
        desc, _ = asapxyz.get_descriptors(fmat)
        if os.path.isfile(fmat):
            try:
                desc = np.genfromtxt(fmat, dtype=float)
            except Exception as err:
                raise ValueError('Cannot load the kernel matrix') from err
        print("shape of the descriptor matrix: ", np.shape(desc),
              "number of descriptors: ", np.shape(desc[0]))
        error_file = prefix + "-" + algorithm + "-n-" + str(nkeep) + '.error'

        if algo == 'fps':
            sbs, dmax_remain = fps(desc, nkeep, 0)
            print("Making farthest point sampling selection")
            np.savetxt(error_file,
                       dmax_remain,
                       fmt='%4.8f',
                       header='the maximum remaining distance in FPS')
        else:
            # CUR decomposition
            desc = np.asmatrix(desc)
            cov = np.dot(desc, desc.T)
            print("Making CUR selection")
            print("shape of the covariance matrix:", np.shape(cov))
            sbs, rcov_error = CUR_deterministic(cov, nkeep)
            np.savetxt(error_file,
                       rcov_error,
                       fmt='%4.8f',
                       header='the remaining error of the covariance matrix')

    elif algo == 'reweight':
        if not os.path.isfile(fkde):
            raise ValueError(
                'must suply the (log of) kernel density for each sample')
        try:
            logkde = np.genfromtxt(fkde, dtype=float)[:, 1]
        except Exception as err:
            raise IOError(
                'Cannot load the (log of) kernel density for each sample'
            ) from err
        if len(logkde) != nframes:
            raise ValueError(
                'mismatch of number of frames and kernel densities')

        # exp(logkde / lambda) / exp(logkde), evaluated in log space
        new_kde = np.exp(logkde / reweight_lambda - logkde)
        # compute the normalization factor so we expect to select n samples in the end
        new_kde *= nkeep / np.sum(new_kde)
        randomchoice = np.random.rand(nframes)
        sbs = [i for i in range(nframes) if randomchoice[i] < new_kde[i]]
        algorithm = algorithm + "-lambda-" + str(reweight_lambda)

    else:
        raise ValueError('Cannot find the specified selection algorithm: ' +
                         str(algorithm))

    # save
    out_stem = prefix + "-" + algorithm + "-n-" + str(nkeep)
    selection = np.zeros(nframes, dtype=int)
    selection[np.asarray(sbs, dtype=int)] = 1
    np.savetxt(out_stem + '.index', selection, fmt='%d')
    if fy != 'none':
        np.savetxt(out_stem + '-' + fy,
                   np.asarray(y_all)[sbs],
                   fmt='%4.8f')
    asapxyz.write(out_stem, sbs)

Example #4

Show file

File: krr.py Project: mastricker/ASAP

def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma,
         lc_points, lc_repeats):
    """
    Fit a sparse kernel ridge regression (KRR) model, dump the train/test
    errors to a json file and optionally compute and plot a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fxyz: Location of the xyz file; used to read the property `fy` when `fy`
          cannot be loaded as a file. Use 'none' to skip.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio. If it is not positive, the whole data set
                is used both as the train set and as the test set.
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data.
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------

    Nothing. Writes 'KRR_train_test_errors_4<prefix>.json', optionally
    'KRR_learning_curve_4<prefix>.dat', and 'KRR_4_<prefix>.png'.

    Raises
    ------
    Exception: if the kernel matrix or the xyz file cannot be opened.
    ValueError: if the number of properties differs from the number of
                samples, or if n_sparse is not smaller than the number of
                training samples.
    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename'
        ) from err
    print("loaded", fmat)

    # read in the properties to be predicted
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except Exception:
        # fy is not a readable file: fall back to a property of the xyz file
        y_all = []
        try:
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError as err:
            raise Exception(
                'property vector file could not be loaded. Please check the filename'
            ) from err

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification
    if n_sparse >= n_train:
        # Without this guard the kernel blocks below would stay undefined
        # and the function would fail later with an unrelated NameError.
        raise ValueError(
            "the number of representative structure is too large, "
            "please select n < {}".format(n_train))
    elif n_sparse > 0:
        ifps, _ = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)

    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # predictions and error metrics for the train and the test set
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM,
                                                              y_train,
                                                              K_TM,
                                                              y_test,
                                                              verbose=True,
                                                              return_pred=True)
    # dump to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # the learning curve is only computed when both conditions hold
    do_learning_curve = lc_points > 1 and n_sparse > 0

    # learning curve
    # decide train sizes
    if do_learning_curve:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit,
                     n_repeats=lc_stats,
                     train_sizes=train_sizes,
                     test_size=n_test,
                     random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            # NOTE(review): this fit may be redundant if fit_predict_error
            # refits internally -- confirm against KRRSparse.
            krr.fit(K_MM, lc_K_NM, lc_y_train)
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train,
                                                    K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  #     MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()

    if do_learning_curve:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if do_learning_curve:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0],
                     lc_results[:, 1],
                     yerr=lc_results[:, 2],
                     linestyle='',
                     uplims=True,
                     lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')

Example #5

Show file

File: kpca_sparse_deprecated.py Project: yingli2009/ASAP

def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom,
         keepraw, sparse_mode, n_sparse, power, kpca_d, pc1, pc2,
         projectatomic, plotatomic, adjusttext):
    """
    Build a sparse kernel PCA map from a descriptor matrix, save the low
    dimensional coordinates and plot the projection.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the (k)PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot'
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    sparse_mode: Sparsification method used to pick the representative samples ([fps], [cur])
    n_sparse: number of representative samples, default is 5% of the data
              (at least 10). If it is not smaller than the number of samples,
              or negative, all samples are used.
    power: use polynomial kernel function of degree n.
    kpca_d: Number of the principle components to keep
    pc1: Plot the projection along which principle axes
    pc2: Plot the projection along which principle axes
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adjusttext: Whether to adjust the texts (True/False)

    Returns
    -------
    Nothing. Writes the coordinates (matrix or xyz output) and the plot.

    Raises
    ------
    ValueError: if no descriptor matrix is available, if the matrix file
                cannot be parsed, or if the sparsification mode is unknown.
    """

    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # start empty so the sanity check below reports a missing descriptor
    # instead of failing on an unbound name
    desc = []
    desc_atomic = []
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            raise ValueError(
                'Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)
    # sanity check
    if len(desc) == 0:
        raise ValueError(
            'Please supply descriptor in a xyz file or a standlone descriptor matrix'
        )

    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # sparsification
    n_sample = len(desc)
    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = max(10, n_sample // 20)
    if n_sparse >= n_sample:
        print(
            "the number of representative structure is too large, please select n < ",
            n_sample)
        # fall back to the full set so that the projection can still be built
        print("Not using any sparsification")
        ifps = np.arange(n_sample)
    elif n_sparse > 0:
        if sparse_mode == 'fps' or sparse_mode == 'FPS':
            ifps, _ = fps(desc, n_sparse, 0)
        elif sparse_mode == 'cur' or sparse_mode == 'CUR':
            desc_matrix = np.asmatrix(desc)
            cov = np.dot(desc_matrix, desc_matrix.T)
            ifps, _ = CUR_deterministic(cov, n_sparse)
        else:
            raise ValueError('Cannot find the specified sparsification mode')
    else:
        print("Not using any sparsification")
        ifps = np.arange(n_sample)

    k_spec = {
        'k0': {
            "type": "cosine"
        }
    }  #{ 'k1': {"type": "polynomial", "d": power}}
    k_transform = Descriptors_to_Kernels(k_spec)

    kNN = k_transform.compute(desc[ifps])
    kMN = k_transform.compute(desc, desc[ifps])
    print("Shape of the kNN matrix: ", np.shape(kNN),
          ", and shape of the kMN matrix:", np.shape(kMN))
    # main thing
    kpca = KernelPCA(kpca_d)
    kpca.fit(kNN)
    proj = kpca.transform(kMN)

    # NOTE(review): `and` binds tighter than `or`, so this is true whenever
    # peratom is set, even with projectatomic -- confirm this is intended.
    need_atomic_proj = peratom or (plotatomic and not projectatomic)
    if need_atomic_proj:
        # NOTE(review): this uses a polynomial kernel while kNN/kMN above use
        # the cosine kernel from k_spec -- confirm the two are meant to differ.
        kNT = np.power(np.dot(desc_atomic[:], desc[ifps].T), power)
        proj_atomic_all = kpca.transform(kNT)

    # save
    foutput = prefix + "-kpca-d" + str(kpca_d)
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        # keep a backup of an existing output file instead of overwriting it
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    if plotatomic:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {
                "type": 'scatter',
                'clabel': colorlabel
            },
            "second_p": {
                "type": 'annotate',
                'adtext': adjusttext
            }
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if need_atomic_proj:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])
    plt.show()