import numpy as np


def analytic_sobol_indices_from_gaussian_process(
        gp, variable, interaction_terms, ngp_realizations=1,
        stat_functions=(np.mean, np.median, np.min, np.max),
        ninterpolation_samples=500, nvalidation_samples=100,
        ncandidate_samples=1000, nquad_samples=50, use_cholesky=True, alpha=0):
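    """
    Compute Sobol indices of a Gaussian process analytically (via quadrature;
    see nquad_samples) rather than by sampling.

    Statistics (given by stat_functions) of the indices are taken over
    ngp_realizations random realizations of the GP; if ngp_realizations == 0
    only the mean of the GP is used. The remaining parameters and the
    structure of the returned dictionary are the same as for
    sampling_based_sobol_indices_from_gaussian_process.
    """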

    x_train, y_train, K_inv, lscale, kernel_var, transform_quad_rules = \
        extract_gaussian_process_attributes_for_integration(gp)

    if ngp_realizations > 0:
        gp_realizations = generate_gp_realizations(
            gp, ngp_realizations, ninterpolation_samples, nvalidation_samples,
            ncandidate_samples, variable, use_cholesky, alpha)

        # Check how accurate the realizations are
        validation_samples = generate_independent_random_samples(
            variable, 1000)
        mean_vals, std = gp(validation_samples, return_std=True)
        realization_vals = gp_realizations(validation_samples)
        print('mean of the GP mean at the validation samples',
              mean_vals[:, 0].mean())
        print('std of realizations error',
              np.linalg.norm(std-realization_vals.std(axis=1))/np.linalg.norm(
                  std))
        print('var of realizations error',
              np.linalg.norm(std**2-realization_vals.var(axis=1)) /
              np.linalg.norm(std**2))

        print('mean interpolation error',
              np.linalg.norm((mean_vals[:, 0]-realization_vals[:, -1])) /
              np.linalg.norm(mean_vals[:, 0]))

        x_train = gp_realizations.selected_canonical_samples
        # gp_realizations.train_vals is normalized, so unnormalize it here
        y_train = gp._y_train_std*gp_realizations.train_vals
        # kernel_var has already been adjusted by the call to
        # extract_gaussian_process_attributes_for_integration
        K_inv = np.linalg.inv(gp_realizations.L.dot(gp_realizations.L.T))
        K_inv /= gp._y_train_std**2

    sobol_values, total_values, means, variances = \
        _compute_expected_sobol_indices(
            gp, variable, interaction_terms, nquad_samples,
            x_train, y_train, K_inv, lscale, kernel_var,
            transform_quad_rules, gp._y_train_mean)
    sobol_values = sobol_values.T
    total_values = total_values.T

    result = dict()
    data = [sobol_values, total_values, variances, means]
    data_names = ['sobol_indices', 'total_effects', 'variance', 'mean']
    for item, name in zip(data, data_names):
        subdict = dict()
        for sfun in stat_functions:
            subdict[sfun.__name__] = sfun(item, axis=0)
        subdict['values'] = item
        result[name] = subdict
    return result
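

# A minimal usage sketch (illustrative, not part of the original module): it
# only shows how the result dictionary built above is laid out. The helper
# name is hypothetical and the printed statistic keys assume the default
# stat_functions (np.mean, np.median, np.min, np.max).
def _summarize_sobol_result(result):
    # Each entry of result holds one value per statistic, keyed by
    # fun.__name__ (e.g. 'mean', 'median', 'amin', 'amax'), plus the raw
    # per-realization values under 'values'.
    for name in ('sobol_indices', 'total_effects', 'variance', 'mean'):
        for stat_name, stat_vals in result[name].items():
            print(name, stat_name, np.asarray(stat_vals).shape)

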
def sampling_based_sobol_indices_from_gaussian_process(
        gp,
        variables,
        interaction_terms,
        nsamples,
        sampling_method='sobol',
        ngp_realizations=1,
        normalize=True,
        nsobol_realizations=1,
        stat_functions=(np.mean, np.median, np.min, np.max),
        ninterpolation_samples=500,
        nvalidation_samples=100,
        ncandidate_samples=1000,
        use_cholesky=True,
        alpha=0):
    """
    Compute sobol indices from Gaussian process using sampling. 
    This function returns the mean and variance of these values with 
    respect to the variability in the GP (i.e. its function error)

    Following Kennedy and O'hagan we evaluate random realizations of each
    GP at a discrete set of points. To predict at larger sample sizes we 
    interpolate these points and use the resulting approximation to make any 
    subsequent predictions. This introduces an error but the error can be 
    made arbitrarily small by setting ninterpolation_samples large enough.
    The geometry of the interpolation samples can effect accuracy of the
    interpolants. Consequently we use Pivoted cholesky algorithm in 
    Harbrecht et al for choosing the interpolation samples.

    Parameters
    ----------
    ngp_realizations : integer
        The number of random realizations of the Gaussian process.
        If ngp_realizations == 0 then the sensitivity indices will
        only be computed using the mean of the GP.

    nsobol_realizations : integer
        The number of random realizations of the random samples used to
        compute the Sobol indices. This number should be similar to
        ngp_realizations, as the statistics are taken over both sources of
        randomness.

    stat_functions : list
        List of callable functions with signature fun(np.ndarray),
        e.g. np.mean. If fun has arguments then we must wrap it with partial
        and set a meaningful __name__, e.g.
        fun = partial(np.quantile, q=0.5); fun.__name__ = 'quantile-0.5'.
        Note: the names of np.min and np.max are amin and amax.

    ninterpolation_samples : integer
        The number of samples used to interpolate the discrete random
        realizations of the Gaussian process.

    nvalidation_samples : integer
        The number of samples used to assess the accuracy of the interpolants
        of the random realizations.

    ncandidate_samples : integer
        The number of candidate samples from which the interpolation samples
        are selected when building the interpolants of the random
        realizations.

    Returns
    -------
    result : dictionary
        Result containing the numpy functions in stat_functions applied
        to the mean, variance, sobol_indices and total_effects of the
        Gaussian process. To access the data associated with a function fun
        in stat_functions use the key fun.__name__. For example, if the
        stat_function is np.mean the mean Sobol indices are accessed via
        result['sobol_indices']['mean']. The raw values from all realizations
        are stored in result['sobol_indices']['values'].
    """
    assert nsobol_realizations > 0

    if ngp_realizations > 0:
        assert ncandidate_samples > ninterpolation_samples
        gp_realizations = generate_gp_realizations(
            gp, ngp_realizations, ninterpolation_samples, nvalidation_samples,
            ncandidate_samples, variables, use_cholesky, alpha)
        fun = gp_realizations
    else:
        fun = gp

    sobol_values, total_values, variances, means = \
        repeat_sampling_based_sobol_indices(
            fun, variables, interaction_terms, nsamples,
            sampling_method, nsobol_realizations)

    result = dict()
    data = [sobol_values, total_values, variances, means]
    data_names = ['sobol_indices', 'total_effects', 'variance', 'mean']
    for item, name in zip(data, data_names):
        subdict = dict()
        for sfun in stat_functions:
            # axis=(0, -1) averages over both the sobol and GP realizations:
            # this corresponds to axis=(0, 1) for mean and variance and to
            # axis=(0, 2) for sobol_indices and total_effects
            subdict[sfun.__name__] = sfun(item, axis=(0, -1))
        subdict['values'] = item
        result[name] = subdict
    return result
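

# Sketch of the stat_functions mechanism described in the docstring above
# (illustrative, not part of the original module): a statistic that needs
# extra arguments, such as np.quantile, is wrapped with functools.partial and
# given a __name__ so the result dictionary gets a sensible key. The helper
# name and the quantile level mentioned below are arbitrary choices.
def _quantile_stat(q):
    from functools import partial
    fun = partial(np.quantile, q=q)
    # partial objects have no __name__ by default, so set one explicitly
    fun.__name__ = 'quantile-' + str(q)
    return fun
# For example, stat_functions=(np.mean, _quantile_stat(0.5)) would add an
# entry under result['sobol_indices']['quantile-0.5'].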