Example 1
def main():
    """
    Smoke-test that sparse kernel ridge regression (sparse KRR) fitting works.

    Takes no parameters: all inputs are hard-coded below. Loads the bundled
    'small_molecules-SOAP.xyz' file (located next to this script), reads the
    precomputed SOAP descriptor matrix and the
    'dft_formation_energy_per_atom_in_eV' property, fits a sparse KRR model
    with a linear kernel, computes a learning curve, saves the fit state with
    prefix "test-skrr", and shows the plots.
    """
    # Hard-coded test configuration (descriptor file, target property, split).
    fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz')
    fmat = ['SOAP-n4-l3-c1.9-g0.23']
    fy = 'dft_formation_energy_per_atom_in_eV'
    prefix = "test-skrr"
    test_ratio = 0.05
    lc_points = 8
    lc_repeats = 8

    # try to read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    desc, _ = asapxyz.get_descriptors(fmat, False)
    y_all = asapxyz.get_property(fy)
    # print(desc)

    # Design matrix with whitening and a held-out test fraction.
    dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio)

    # kernel, jitter, delta, sigma, sparse_mode="fps", n_sparse=None
    k_spec = {
        'k0': {
            "type": "linear"
        }
    }  # { 'k1': {"type": "polynomial", "d": power}}

    # if sigma is not set...
    # Regularization noise level: 0.1% of the spread of the target property.
    sigma = 0.001 * np.std(y_all)
    krr = KRRSparse(0., None, sigma)
    # n_sparse=-1: sparsification setting passed through to the wrapper —
    # presumably "use all points"; confirm against SPARSE_KRR_Wrapper docs.
    skrr = SPARSE_KRR_Wrapper(k_spec, krr, sparse_mode="fps", n_sparse=-1)

    # fit the model
    dm.compute_fit(skrr, 'skrr', store_results=True, plot=True)

    # learning curve
    if lc_points > 1:
        # NOTE(review): the tag 'ridge_regression' differs from the fit tag
        # 'skrr' used above — confirm whether this inconsistency is intended.
        dm.compute_learning_curve(skrr,
                                  'ridge_regression',
                                  lc_points=lc_points,
                                  lc_repeats=lc_repeats,
                                  randomseed=42,
                                  verbose=False)

    # Persist fit results under the given prefix, then display the figures.
    dm.save_state(prefix)
    plt.show()
Example 2
def kernelridge(ctx, sigma, kernel, kernel_parameter, sparse_mode, n_sparse):
    """Kernel Ridge Regression (with sparsification)"""
    from asaplib.fit import SPARSE_KRR_Wrapper, KRRSparse

    # Shared state stored on the CLI context by the parent command.
    dm = ctx.obj['dm']
    fit_options = ctx.obj['fit_options']

    # Single-kernel specification assembled from the CLI options.
    k_spec = {"first_kernel": {"type": kernel, "d": kernel_parameter}}
    model = SPARSE_KRR_Wrapper(k_spec,
                               KRRSparse(0., None, sigma),
                               sparse_mode=sparse_mode,
                               n_sparse=n_sparse)

    # fit the model
    dm.compute_fit(model, 'skrr', store_results=True, plot=True)

    # Optionally compute a learning curve when more than one point is requested.
    lc_points = fit_options["learning_curve"]
    if lc_points > 1:
        dm.compute_learning_curve(model, 'skrr', lc_points,
                                  fit_options["lc_points"],
                                  randomseed=42, verbose=False)

    dm.save_state(fit_options['prefix'])
    from matplotlib import pyplot as plt
    plt.show()
Example 3
def main(fmat, fy, prefix, test_ratio, jitter, n_sparse, sigma):
    """
    Fit a sparse kernel ridge regression model from a precomputed kernel
    matrix, print train/test scores, and plot the fit and a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples
    sigma: noise level in kernel ridge regression

    Returns
    -------

    Fitting outcome & Learning curve.

    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError:
        raise Exception(
            'fmat file could not be loaded. Please check the filename')
    print("loaded", fmat)
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except OSError:
        raise Exception(
            'property vector file could not be loaded. Please check the filename'
        )
    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split (test_ratio <= 0 reuses the full set for both)
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # sparsification: pick n_sparse representative samples by farthest point
    # sampling and slice out the M x M / N x M / T x M kernel blocks
    if n_sparse >= n_train:
        # BUG FIX: this case previously only printed a warning and then
        # crashed with a NameError because K_MM was never assigned; fail
        # loudly with a clear message instead.
        raise ValueError(
            "the number of representative structure is too large, please select n < "
            + str(n_train))
    elif n_sparse > 0:
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # heuristic kernel scale: property spread over mean kernel diagonal
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the predictions for train set
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    print("train score: ", get_score(y_pred, y_train))
    # get the predictions for test set
    y_pred_test = krr.predict(K_TM)
    # compute the CV score for the dataset
    print("test score: ", get_score(y_pred_test, y_test))

    # scatter plot of predicted vs. actual property, train and test sets
    plot_styles.set_nice_font()
    fig = plt.figure(figsize=(8 * 2.1, 8))
    ax = fig.add_subplot(121)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    # learning curve
    # decide train sizes
    lc_points = 10
    # NOTE(review): exponential_split gets n_sparse as the smallest train
    # size; for n_sparse <= 0 this is presumably invalid — confirm upstream.
    train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
    print("Learning curves using train sizes: ", train_sizes)
    lc_stats = 12 * np.ones(lc_points, dtype=int)
    lc = LCSplit(ShuffleSplit,
                 n_repeats=lc_stats,
                 train_sizes=train_sizes,
                 test_size=n_test,
                 random_state=10)

    scores = {size: [] for size in train_sizes}
    for lctrain, lctest in lc.split(y_train):
        Ntrain = len(lctrain)
        lc_K_NM = K_NM[lctrain, :]
        lc_y_train = y_train[lctrain]
        # here we always use the same test set
        # (alternative: lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest])
        lc_K_test = K_TM
        lc_y_test = y_test
        krr.fit(K_MM, lc_K_NM, lc_y_train)
        lc_y_pred = krr.predict(lc_K_test)
        scores[Ntrain].append(get_score(lc_y_pred, lc_y_test))

    # aggregate mean and standard deviation of the score per train size
    sc_name = 'RMSE'
    Ntrains = []
    avg_scores = []
    avg_scores_error = []
    for Ntrain, score in scores.items():
        n_rep = len(score)
        avg = sum(sc[sc_name] for sc in score) / n_rep
        var = sum(sc[sc_name] ** 2. for sc in score) / n_rep - avg ** 2.
        avg_scores.append(avg)
        # BUG FIX: clamp tiny negative variances caused by floating-point
        # cancellation (e.g. when all repeats score identically) so that
        # np.sqrt does not produce NaN error bars.
        avg_scores_error.append(np.sqrt(max(var, 0.)))
        Ntrains.append(Ntrain)

    # learning-curve panel on log-log axes
    ax2 = fig.add_subplot(122)
    ax2.errorbar(Ntrains, avg_scores, yerr=avg_scores_error)
    ax2.set_title('Learning curve')
    ax2.set_xlabel('Number of training samples')
    ax2.set_ylabel('Test {}'.format(sc_name))
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
Example 4
def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma,
         lc_points, lc_repeats):
    """
    Fit a sparse kernel ridge regression model from a precomputed kernel
    matrix, dump train/test errors to JSON, and optionally compute and plot
    a learning curve.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fxyz: Location of an xyz file ('none' to skip); used as a fallback
          source for the property when fy is not a loadable text file.
    fy: Location of property list (1D-array of floats)
    prefix: filename prefix for learning curve figure
    test_ratio: train/test ratio
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples, default is 5% of the data
    sigma: noise level in kernel ridge regression, default is 0.1% of the standard deviation of the data.
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------

    Fitting outcome & Learning curve.

    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError:
        raise Exception(
            'fmat file could not be loaded. Please check the filename')
    print("loaded", fmat)

    # read in the properties to be predicted
    y_all = []
    # BUG FIX: the first except was bare, which also swallowed
    # KeyboardInterrupt/SystemExit; catch concrete load failures only.
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except (OSError, ValueError):
        try:
            # fall back to reading the property from the xyz file
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError:
            raise Exception(
                'property vector file could not be loaded. Please check the filename'
            )

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split (test_ratio <= 0 reuses the full set for both)
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification: pick n_sparse representative samples by farthest point
    # sampling and slice out the M x M / N x M / T x M kernel blocks
    if n_sparse >= n_train:
        # BUG FIX: this case previously only printed a warning and then
        # crashed with a NameError because K_MM was never assigned; fail
        # loudly with a clear message instead.
        raise ValueError(
            "the number of representative structure is too large, please select n < "
            + str(n_train))
    elif n_sparse > 0:
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)

    # heuristic kernel scale: property spread over mean kernel diagonal
    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # compute train and test predictions and errors in one call
    # (removed a dead `y_pred = krr.predict(K_NM)` whose result was
    # immediately overwritten, and a dead `fit_error = {}` initializer)
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM,
                                                              y_train,
                                                              K_TM,
                                                              y_test,
                                                              verbose=True,
                                                              return_pred=True)
    # dump to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # learning curve
    # decide train sizes
    if lc_points > 1 and n_sparse > 0:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit,
                     n_repeats=lc_stats,
                     train_sizes=train_sizes,
                     test_size=n_test,
                     random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            # (removed a redundant krr.fit(...) on identical data right before
            # fit_predict_error, which refits internally)
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train,
                                                    K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  #     MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()

    # one panel for the fit, plus a second panel when a learning curve exists
    if lc_points > 1 and n_sparse > 0:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if lc_points > 1 and n_sparse > 0:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0],
                     lc_results[:, 1],
                     yerr=lc_results[:, 2],
                     linestyle='',
                     uplims=True,
                     lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')