Exemple #1
0
 def obj(y0):
     """Return the negated ratio of joint to marginal density at ``y0``.

     ``self`` and ``x0z0`` are taken from the enclosing scope. The joint
     estimator covers (endog, exog, external); the marginal estimator
     covers (exog, external). Both use the 'normal_reference' bandwidth.
     NOTE(review): the sign flip presumably lets a minimizer maximise the
     ratio -- confirm against the caller.
     """
     # Full evaluation point: the candidate y0 followed by the fixed x0z0.
     joint_point = np.hstack([y0, x0z0])

     # Density over all three variable groups (numerator).
     joint_kde = kernel_density.KDEMultivariate(
         data=np.hstack([self.endog, self.exog, self.external]),
         var_type=self._endogtype + self._exogtype + self._externaltype,
         bw='normal_reference')

     # Density over the exog and external groups only (denominator).
     marginal_kde = kernel_density.KDEMultivariate(
         data=np.hstack([self.exog, self.external]),
         var_type=self._exogtype + self._externaltype,
         bw='normal_reference')

     joint_density = joint_kde.pdf(joint_point)
     marginal_density = marginal_kde.pdf(x0z0)
     return -joint_density / marginal_density
Exemple #2
0
def calc_next_params(domain, trials):
    """Propose the next parameter set to evaluate.

    While ``len(trials)`` is at most ``DEFAULT_N_MIN + 2`` a random point
    from ``domain.random()`` is returned. Otherwise the trials are ordered
    by ascending ``train_y``, a kernel density estimate is fitted to the
    lowest-valued rows and another to the highest-valued rows, and the
    point returned by ``minimize`` over ``objective_function`` is used.

    Parameters
    ----------
    domain
        Search-space object. This function uses ``random()``,
        ``fieldnames``, ``n_params`` and ``bounds``.
    trials
        Trial history. This function uses ``len(trials)`` and
        ``get_train_data()``, which must return ``(train_x, train_y)``.

    Returns
    -------
    dict
        Maps each name in ``domain.fieldnames`` to its proposed value.
    """
    gamma = DEFAULT_GAMMA
    bw = DEFAULT_BW
    n_min = DEFAULT_N_MIN
    bw_weight = DEFAULT_BW_WEIGHT
    sampling_num = DEFAULT_SAMPLING_NUM

    n_trials = len(trials)

    # Too few trials to fit the density estimates: fall back to a random point.
    if n_trials <= n_min + 2:
        random_result = domain.random()
        return {fieldname: random_result[index]
                for index, fieldname in enumerate(domain.fieldnames)}

    train_x, train_y = trials.get_train_data()
    order = np.argsort(train_y)

    # Split sizes: each group holds at least n_min rows.
    l_len = max(n_min, int(n_trials * gamma))
    g_len = max(n_min, n_trials - l_len)
    x_l = train_x[order[:l_len], :]    # rows with the smallest train_y
    x_g = train_x[order[-g_len:], :]   # rows with the largest train_y

    # I want to get the types of params from domain
    # v_types = "ccccuuuuuooooo"
    # ref. http://www.statsmodels.org/dev/generated/
    #      statsmodels.nonparametric.kernel_density.KDEMultivariate.html
    # Until then every parameter is treated as continuous.
    v_types = 'c' * domain.n_params

    l_est = kde.KDEMultivariate(x_l, v_types, bw=bw)
    g_est = kde.KDEMultivariate(x_g, v_types, bw=bw)

    # Widen the low-group bandwidths by bw_weight and apply a floor of
    # 1e-3 * bw_weight to every component in one vectorized step.
    wide_bw = np.maximum(l_est.bw * bw_weight, 1e-3 * bw_weight)

    minimize_result = minimize(fun=objective_function,
                               x=x_l,
                               sampling_num=sampling_num,
                               bw=wide_bw,
                               bounds=domain.bounds,
                               args=(l_est, g_est))

    return {fieldname: minimize_result[index]
            for index, fieldname in enumerate(domain.fieldnames)}
Exemple #3
0
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     """Return the log density of ``targets``, optionally given constraints.

     ``constraints`` is first passed through ``self.populate_constraints``
     together with ``rowid`` and ``targets``. With no resulting
     constraints an unconditional ``KDEMultivariate`` is evaluated at the
     target values; otherwise a ``KDEMultivariateConditional`` is
     evaluated at the target values given the constraint values.

     :param rowid: forwarded to ``self.populate_constraints``.
     :param targets: mapping from output variable to the value to score.
     :param constraints: optional mapping from variable to conditioned value.
     :param inputs: must be empty or ``None``.
     :returns: ``np.log`` of the evaluated density.
     :raises ValueError: if there are no observations, if ``inputs`` is
         given, if ``targets`` is empty, contains NaN, names a variable
         not in ``self.outputs``, or overlaps with the constraints.
     """
     if self.N == 0:
         raise ValueError('KDE requires at least one observation.')
     constraints = self.populate_constraints(rowid, targets, constraints)
     if inputs:
         raise ValueError('Prohibited inputs: %s' % (inputs, ))
     if not targets:
         raise ValueError('No targets: %s' % (targets, ))
     if any(np.isnan(v) for v in targets.values()):
         raise ValueError('Invalid nan values in targets: %s' % (targets, ))
     if any(q not in self.outputs for q in targets):
         raise ValueError('Unknown targets: %s' % (targets, ))
     if any(q in constraints for q in targets):
         raise ValueError('Duplicate variable: %s, %s' % (
             targets,
             constraints,
         ))
     # Materialize values as a real list: on Python 3 ``dict.values()`` is
     # a view that NumPy does not interpret as a sequence of numbers.
     target_values = list(targets.values())
     if not constraints:
         model = kernel_density.KDEMultivariate(
             self._dataset(targets),
             self._stattypes(targets),
             bw=self._bw(targets),
         )
         pdf = model.pdf(target_values)
     else:
         # Dict key views cannot be concatenated with ``+`` on Python 3,
         # so build explicit lists; targets come first, then constraints.
         columns = list(targets.keys()) + list(constraints.keys())
         full_members = self._dataset(columns)
         model = kernel_density.KDEMultivariateConditional(
             full_members[:, :len(targets)],
             full_members[:, len(targets):],
             self._stattypes(targets),
             self._stattypes(constraints),
             bw=np.concatenate((self._bw(targets), self._bw(constraints))),
         )
         pdf = model.pdf(target_values, list(constraints.values()))
     return np.log(pdf)
Exemple #4
0
 def transition(self, N=None):
     """Re-learn the kernel bandwidths from the current observations.

     Does nothing unless ``self.N`` is greater than zero. Otherwise a
     ``KDEMultivariate`` is fitted over ``self.outputs`` with the 'cv_ml'
     bandwidth method and its bandwidths are stored, as a plain list, in
     ``self.bw``. The ``N`` argument is accepted but not used here.
     """
     if not self.N > 0:
         return
     observations = self._dataset(self.outputs)
     variable_types = self._stattypes(self.outputs)
     # Fitting with bw='cv_ml' is what computes the bandwidths.
     fitted = kernel_density.KDEMultivariate(
         observations, variable_types, bw='cv_ml')
     self.bw = fitted.bw.tolist()
Exemple #5
0
    def execute(self):
        """Execute the link.

        Reads the column-index lists and the NaN-free data from the
        DataStore, joins the unordered-categorical, ordered-categorical and
        normalized (or PCA-transformed) columns into one array, and writes
        either the unordered-categorical KDE weights or the
        normal-reference KDE bandwidths to ``ds[self.store_key]``.

        :returns: status code of execution
        :rtype: StatusCode
        """
        self.logger.debug('Now executing link: {link}.', link=self.name)

        ds = process_manager.service(DataStore)

        unordered_categorical_i = ds['unordered_categorical_i']
        ordered_categorical_i = ds['ordered_categorical_i']
        continuous_i = ds['continuous_i']
        data_no_nans = ds[self.data_no_nans_read_key]

        n_obs = len(data_no_nans)

        def _pad_if_empty(block):
            # A zero-size block is swapped for an (n_obs, 0) array so that
            # concatenating along axis 1 still works.
            return np.empty(shape=(n_obs, 0)) if block.size == 0 else block

        # Categorical columns are taken from the original (non-normalized) data.
        unordered_block = _pad_if_empty(data_no_nans[:, unordered_categorical_i])
        ordered_block = _pad_if_empty(data_no_nans[:, ordered_categorical_i])

        if self.do_pca:
            # The PCA-transformed data is used as stored, without padding.
            continuous_block = ds[self.data_normalized_pca_read_key]
        else:
            continuous_block = _pad_if_empty(ds[self.data_normalized_read_key])

        d = np.concatenate((unordered_block, ordered_block, continuous_block),
                           axis=1)

        # One type character per column, in the same order as the columns of d.
        var_type = ''.join((
            'u' * len(unordered_categorical_i),
            'o' * len(ordered_categorical_i),
            'c' * len(continuous_i),
        ))

        # NB: statsmodels uses normal reference for unordered categorical variables as well!
        # NB: the bandwidths are determined on the normalized continuous data and on the
        # original categorical data
        only_unordered = len(continuous_i) == 0 and len(ordered_categorical_i) == 0
        if only_unordered:
            ds[self.store_key] = ut.kde_only_unordered_categorical(d)
        else:
            estimator = kernel_density.KDEMultivariate(d,
                                                       var_type=var_type,
                                                       bw='normal_reference')
            ds[self.store_key] = estimator.bw

        return StatusCode.Success
Exemple #6
0
def kde_statsmodels_m(x, x_grid, **kwargs):
    """Evaluate the CDF of a ``KDEMultivariate`` fitted to ``x`` on ``x_grid``.

    The estimator is built with ``var_type='c'`` and the
    'normal_reference' bandwidth. Extra keyword arguments are accepted
    but not used.
    """
    estimator = kde.KDEMultivariate(x, var_type='c', bw='normal_reference')
    cdf_values = estimator.cdf(x_grid)
    return cdf_values
Exemple #7
0
# Gaussian KPDF estimates at three bandwidths. Each density is reshaped to
# the shape of ``x``, scaled so that its peak is 1, and plotted with the l2
# norm of its difference from ``p`` in the title.
# The titles are raw strings so the backslash in the LaTeX ``\ell`` is kept
# literally instead of being parsed as an (invalid) escape sequence.
for axis_index, bandwidth, title in (
        (3, bw_kpdf / 2, r'KPDF bw:kpdf/2 ($\ell_2$ norm: %.3f)'),
        (4, bw_scott, r'KPDF bw:scott ($\ell_2$ norm: %.3f)'),
        (5, bw_scott / 2, r'KPDF bw:scott/2 ($\ell_2$ norm: %.3f)'),
):
    w = KPDF.MPDFGaussian(rvs, grid, bandwidth)
    w = w.reshape(x.shape) / w.max()
    plot(axes[axis_index], w, title % np.linalg.norm((p - w).flat))

# statsmodels estimates with cross-validated bandwidths (maximum likelihood
# and least squares); the selected bandwidth is printed for each method.
for axis_index, bw_method, title in (
        (6, 'cv_ml', r'SM bw:CVML ($\ell_2$ norm: %.3f)'),
        (7, 'cv_ls', r'SM bw:CVLS ($\ell_2$ norm: %.3f)'),
):
    dens = smkde.KDEMultivariate(rvs, 'cc', bw=bw_method)
    print("SM bandwidth (%s): " % bw_method + repr(dens.bw))
    w = dens.pdf(grid)
    w = w.reshape(x.shape) / w.max()
    plot(axes[axis_index], w, title % np.linalg.norm((p - w).flat))

plt.savefig('fig3.png')
plt.show()
Exemple #8
0
def stats_kde(x, **kwargs):
    """Fit a ``KDEMultivariate`` to ``x`` and evaluate it on a unit-step grid.

    The grid starts at the NaN-ignoring minimum of ``x`` and advances in
    steps of 1 up to, but not including, the NaN-ignoring maximum. The
    estimator uses ``var_type='c'`` and the 'normal_reference' bandwidth.
    Extra keyword arguments are accepted but not used.

    Returns the tuple ``(grid, cdf_on_grid, pdf_on_grid)``.
    """
    lower, upper = np.nanmin(x), np.nanmax(x)
    grid = np.arange(lower, upper)
    estimator = kde.KDEMultivariate(x, var_type='c', bw='normal_reference')
    cdf_values = estimator.cdf(grid)
    pdf_values = estimator.pdf(grid)
    return grid, cdf_values, pdf_values
print('\n')

#%%

# A multivariate alternative to scikit-learn's estimator is needed here;
# statsmodels provides one:
# https://www.statsmodels.org/stable/generated/
#     statsmodels.nonparametric.kernel_density.KDEMultivariate.html
import statsmodels.nonparametric.kernel_density as statmKDE

# Longitude and latitude columns of the asteroid data frame, as one array
AST_LONG_LAT = ast_2020_jx1_df[['ECLIP_LONG_RAD', 'ECLIP_LAT_RAD']].values

# 2D KDE over both columns, each treated as continuous ('cc'), with the
# normal-reference bandwidth
DENS_MODEL = statmKDE.KDEMultivariate(
    data=AST_LONG_LAT,
    var_type='cc',
    bw='normal_reference',
)

#%%

# Report the bandwidth found for each of the two dimensions
print(f'Bandwidth longitude in radians (normal ref.): {DENS_MODEL.bw[0]}')
print(f'Bandwidth latitude in radians (normal ref.): {DENS_MODEL.bw[1]}')
print('\n')

#%%

# Same data, but with the cross-validation maximum-likelihood bandwidth,
# to see whether the result differs from the normal-reference one
DENS_MODEL_TEMP = statmKDE.KDEMultivariate(
    data=AST_LONG_LAT,
    var_type='cc',
    bw='cv_ml',
)