Example #1
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
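A minimal, self-contained sketch of the pattern this test exercises (synthetic data; assumes a recent scikit-learn where bandwidth is passed as a keyword, as in Example #2 below):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.randn(200, 2)

# Fit a Gaussian KDE and draw reproducible samples from it.
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
new_points = kde.sample(n_samples=5, random_state=0)
print(new_points.shape)  # (5, 2): draws have the same dimensionality as X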
Example #2
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert X.shape == samp.shape

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
Example #3
def npd_function_ica(X, n_sample=50, k=10, s=2000, verbose=False):

    N_samp = np.shape(X)[0]
    N_bins = np.shape(X)[1]

    d = 1
    scotts_b = N_samp**(-1. / (d + 4))

    ica = FastICA(whiten=False, max_iter=1000, tol=1e-2)
    S_ = ica.fit_transform(X)
    A_ = ica.mixing_

    Y_ref_ica_unmixed = np.zeros((n_sample, N_samp - 1, N_bins))
    Y_ref_ica = np.zeros((n_sample, N_samp - 1, N_bins))

    for i in range(n_sample):
        for j in range(N_bins):
            X_ica_ind = S_[:, j].reshape(-1, 1)  #shape is now (2048,1)
            kde = KernelDensity(bandwidth=scotts_b,
                                kernel='gaussian').fit(X_ica_ind)
            samps = kde.sample(N_samp - 1)  #2047 samples for the jth k bin
            Y_ref_ica_unmixed[i, :, j] = np.ndarray.flatten(samps)
        Y_ref_ica[i] = np.dot(
            Y_ref_ica_unmixed[i],
            A_.T)  #applying the mixing matrix to undo the ICA transformation

    X_ref_ica_unmixed = np.zeros((N_samp, N_bins))
    X_ref_ica = np.zeros((N_samp, N_bins))

    for j in range(N_bins):
        X_ica_ind = S_[:, j].reshape(-1, 1)  #shape is now (2048,1)
        kde = KernelDensity(bandwidth=scotts_b,
                            kernel='gaussian').fit(X_ica_ind)
        samps = kde.sample(N_samp)  #N_samp samples for the jth k bin
        X_ref_ica_unmixed[:, j] = np.ndarray.flatten(samps)

    X_ref_ica = np.dot(
        X_ref_ica_unmixed,
        A_.T)  #applying the mixing matrix to undo the ICA transformation

    kl_ref, kl_data = [], []

    for i in range(n_sample):

        if verbose:
            print(i)
        Y_ref_ica_samp = Y_ref_ica[i]
        kl_ref.append(
            kNNdiv_general(X_ref_ica,
                           Y_ref_ica_samp,
                           Knn=k,
                           alpha=None,
                           div_func='kl'))
        kl_data.append(
            kNNdiv_general(X, Y_ref_ica_samp, Knn=k, alpha=None,
                           div_func='kl'))

    return kl_ref, kl_data
Example #4
    def get_probability(self, team1, team2):
        home_dist = np.array(self.teams[team1].get_sot_list() +
                             self.teams[team2].get_sota_list()).reshape(-1, 1)

        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            {'bandwidth': self.bandwidths},
                            cv=LeaveOneOut())
        grid.fit(home_dist)
        bandwidth = grid.best_params_["bandwidth"]

        home_kernel = KernelDensity(bandwidth=bandwidth, kernel="gaussian")
        home_kernel.fit(home_dist)

        away_dist = np.array(self.teams[team1].get_sota_list() +
                             self.teams[team2].get_sot_list()).reshape(-1, 1)

        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            {'bandwidth': self.bandwidths},
                            cv=LeaveOneOut())
        grid.fit(away_dist)
        bandwidth = grid.best_params_["bandwidth"]

        away_kernel = KernelDensity(bandwidth=bandwidth, kernel="gaussian")
        away_kernel.fit(away_dist)

        draw = 0
        home = 0
        away = 0

        for i in range(self.iterations):
            home_shots = home_kernel.sample()[0][0]
            away_shots = away_kernel.sample()[0][0]

            home_goals = np.round(home_shots *
                                  self.teams[team1].get_shot_conversion())
            away_goals = np.round(away_shots *
                                  self.teams[team2].get_shot_conversion())

            if home_goals == away_goals:
                draw += 1
            elif home_goals > away_goals:
                home += 1
            else:
                away += 1

        #print(home, draw, away)

        return home / self.iterations, draw / self.iterations, away / self.iterations
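For reference, the leave-one-out bandwidth selection used above can be reproduced in isolation; a minimal sketch with synthetic 1-D data and arbitrarily chosen candidate bandwidths:

import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
shots = rng.normal(loc=10.0, scale=2.0, size=50).reshape(-1, 1)

# Pick the bandwidth that maximizes the leave-one-out log-likelihood.
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': np.linspace(0.1, 2.0, 20)},
                    cv=LeaveOneOut())
grid.fit(shots)
best_bw = grid.best_params_['bandwidth']

# Refit with the selected bandwidth and draw a single value, as in get_probability.
kde = KernelDensity(kernel='gaussian', bandwidth=best_bw).fit(shots)
one_draw = kde.sample()[0][0]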
Example #5
def pval_calibrated_bandwidth(data,
                              alpha_cal,
                              null,
                              I='auto',
                              N_bootstrap=1000,
                              comm=MPI.COMM_WORLD,
                              calibration_file=None):
    '''
        NB!: Test is only calibrated to correct level for alpha_cal.
    '''
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        lambda_alpha = load_lambda('bw_ad', null, alpha_cal, calibration_file)
    except KeyError:
        lambda_alpha = load_lambda('bw', null, alpha_cal, calibration_file)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: is_unimodal_kde(
        h_crit * lambda_alpha,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data
                                                       ), I)
    smaller_equal_crit_bandwidth = bootstrap(resamp_fun,
                                             N_bootstrap,
                                             dtype=np.bool_,
                                             comm=comm)
    return np.mean(~smaller_equal_crit_bandwidth)
Example #6
def test_silverman_adaptive_resampling(data,
                                       alpha,
                                       I='auto',
                                       N_bootstrap_max=10000,
                                       comm=MPI.COMM_WORLD):
    data = comm.bcast(data)
    I = get_I(data, I)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data
                                                       ), I)
    try:
        return float(
            probability_above(resamp_fun,
                              alpha,
                              max_samp=N_bootstrap_max,
                              comm=comm,
                              batch=100,
                              bound_significance=0.05,
                              exception_at_max_samp=True,
                              printing=False))
    except MaxSampExceededException:
        return alpha
Example #7
def test_calibrated_bandwidth_adaptive_resampling(data,
                                                  alpha,
                                                  null,
                                                  I='auto',
                                                  N_bootstrap_max=10000,
                                                  comm=MPI.COMM_WORLD,
                                                  calibration_file=None):
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        lambda_alpha = load_lambda('bw_ad', null, alpha, calibration_file)
        # loading lambda computed with adaptive probabilistic bisection search
    except KeyError:
        lambda_alpha = load_lambda('bw', null, alpha, calibration_file)
        # loading lambda computed with probabilistic bisection search
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian',
                               bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit * lambda_alpha,
        KDE_h_crit.sample(len(data)).ravel() / np.sqrt(1 + h_crit**2 / var_data
                                                       ), I)
    try:
        return float(
            probability_above(resamp_fun,
                              alpha,
                              max_samp=N_bootstrap_max,
                              comm=comm,
                              batch=100,
                              bound_significance=0.05,
                              exception_at_max_samp=True,
                              printing=False))
    except MaxSampExceededException:
        return alpha
Example #8
class XSampleBW(XSample):
    def __init__(self, N, sampfun, comm=MPI.COMM_WORLD):
        super(XSampleBW, self).__init__(N, sampfun, comm)
        self.I = (-1.5, 1.5)  # avoiding spurious bumps in the tails
        self.h_crit = critical_bandwidth(self.data, self.I)
        #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
        self.var = np.var(self.data)
        self.kde_h_crit = KernelDensity(kernel='gaussian',
                                        bandwidth=self.h_crit).fit(
                                            self.data.reshape(-1, 1))

    @property
    def statistic(self):
        return self.h_crit

    def resampled_statistic_below_scaled_statistic(self, lambda_scale):
        '''
            P( h_{crit}^* <= \lambda*h_{crit})
                = P(KDE(X^*, \lambda* h_{crit}) is unimodal)
        '''
        return self.is_unimodal_resample(lambda_scale)

    def is_unimodal_resample(self, lambda_val):
        data = self.kde_h_crit.sample(
            self.N).reshape(-1) / np.sqrt(1 + self.h_crit**2 / self.var)
        #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
        return is_unimodal_kde(self.h_crit * lambda_val, data, self.I)

    def probability_of_unimodal_above(self, lambda_val, gamma):
        return self.prob_resampled_statistic_below_bound_above_gamma(
            lambda_val, gamma)
Example #9
class XSampleBW(XSample):

    def __init__(self, N, sampfun, comm=MPI.COMM_WORLD):
        super(XSampleBW, self).__init__(N, sampfun, comm)
        self.I = (-1.5, 1.5)  # avoiding spurious bumps in the tails
        self.h_crit = critical_bandwidth(self.data, self.I)
        #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
        self.var = np.var(self.data)
        self.kde_h_crit = KernelDensity(kernel='gaussian', bandwidth=self.h_crit).fit(self.data.reshape(-1, 1))

    @property
    def statistic(self):
        return self.h_crit

    def resampled_statistic_below_scaled_statistic(self, lambda_scale):
        '''
            P( h_{crit}^* <= \lambda*h_{crit})
                = P(KDE(X^*, \lambda* h_{crit}) is unimodal)
        '''
        return self.is_unimodal_resample(lambda_scale)

    def is_unimodal_resample(self, lambda_val):
        data = self.kde_h_crit.sample(self.N).reshape(-1)/np.sqrt(1+self.h_crit**2/self.var)
        #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
        return is_unimodal_kde(self.h_crit*lambda_val, data, self.I)

    def probability_of_unimodal_above(self, lambda_val, gamma):
        return self.prob_resampled_statistic_below_bound_above_gamma(lambda_val, gamma)
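The division by np.sqrt(1 + h_crit**2 / var) that recurs in these resampling examples rescales the smoothed-bootstrap draws so their variance matches the data variance (sampling from a Gaussian KDE adds the kernel variance h**2). A minimal numerical check with synthetic data and an arbitrary bandwidth:

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
data = rng.randn(2000)
h = 0.5
var_data = np.var(data)

kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(data.reshape(-1, 1))
resample = kde.sample(len(data), random_state=0).ravel()
rescaled = resample / np.sqrt(1 + h**2 / var_data)

# Var(resample) is roughly var_data + h**2; the rescaling removes that inflation.
print(np.var(resample), var_data + h**2, np.var(rescaled), var_data)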
Example #10
def augment_x_df(x_train,
                 df,
                 repeats=2,
                 fit_col='Assets',
                 seed=22,
                 cutoff=2.5,
                 col_num=200):
    #augments data with a random multiplicative constant on all present value columns
    #the distribution of this constant is that of e.g. the Assets column (which is assets growth)

    aug_mask = list(map(lambda x: x[0] != 'p', df.columns[0:col_num]))

    ker_fit_data = df[fit_col].values
    ker_fit_data = ker_fit_data[(ker_fit_data > 0.5) * (ker_fit_data < 1.5)]
    ker_fit_data = ker_fit_data.reshape(-1, 1)

    kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(ker_fit_data)
    x_train_aug = np.repeat(x_train, repeats, axis=0)

    number = x_train_aug.shape[0]
    scale_rand = kde.sample(number, random_state=seed)

    x_train_aug[:, aug_mask] = x_train_aug[:, aug_mask] * scale_rand
    x_train_aug[np.abs(x_train_aug) > cutoff] = 0
    return x_train_aug
Example #11
def copula_generate(X, generator=None, n=None):
    """ Generate using copula trick.

        :param generator: Model to fit and sample from. KDE by default.
        :param n: Number of examples to generate. By default it is the number of observations in X.
    """
    indexes = X.indexes
    columns = X.columns
    if generator is None:
        generator = KernelDensity()
    if n is None:
        n = X.shape[0]
    X_real = np.array(X)
    # X marginals to uniforms
    X = matrix_to_rank(X)
    # X uniforms to inverse gaussian CDF
    X = rank_matrix_to_inverse(X)
    # Fit generator
    generator.fit(X)
    # Generating artificial data \n Sampling from generator
    X_artif = generator.sample(n)
    # Marginal retrofitting
    result = autopandas.AutoData(marginal_retrofit(X_artif, X_real))
    # Restore data frame index
    result.indexes = indexes
    result.columns = columns
    return result
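The helpers matrix_to_rank, rank_matrix_to_inverse and marginal_retrofit come from the surrounding autopandas code and are not shown here; a rough sketch of what the first two steps are assumed to do (column-wise rank transform followed by the standard normal quantile function):

import numpy as np
from scipy.stats import norm, rankdata

def to_inverse_gaussian(X):
    # Map each column to ranks scaled into (0, 1), then through the normal PPF.
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    U = np.apply_along_axis(rankdata, 0, X) / (n + 1)
    return norm.ppf(U)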
Example #12
    def expand__(self):
        """
        -Expand. An algorithm for expanding the bounds of an APR to improve its generalization ability.
        The objective is to estimate a kernel density for the chosen instances with rel_features.
        After we have our density function, we sample n samples and we get the epsilon percentiles.
        
        Taking the percentiles from a sampled distribution.
        """
        self.mn_ = self.mn__pred.copy()
        self.mx_ = self.mx__pred.copy()

        for i in range(len(self.rel_features_)):

            kd = KernelDensity()
            kd.fit(self.chosen[:, i].reshape(-1, 1))
            u = kd.sample(100, random_state=0)
            mn, mx = np.percentile(
                u, [self.epsilon * 100, (1 - self.epsilon) * 100])

            # if the bounds are outside the APR, we update the APR bounds.
            if mn < self.mn_[self.rel_features_[i]]:
                self.mn_[self.rel_features_[i]] = mn

            if mx > self.mx_[self.rel_features_[i]]:
                self.mx_[self.rel_features_[i]] = mx
Example #13
def ICA_loglikes_samples(arr, lst):

    S_ = lst[0]
    A_ = lst[1]

    N_samp = np.shape(arr)[0]
    N_bins = np.shape(arr)[1]

    d = 1
    scotts_b = N_samp**(-1. / (d + 4))

    X_ref_ica_unmixed = np.zeros((2048, N_bins))
    X_ref_ica = np.zeros((2048, N_bins))

    loglike_ica = np.zeros(np.shape(arr)[0])

    for j in range(N_bins):

        X_ica_ind = S_[:, j].reshape(-1, 1)
        kde = KernelDensity(bandwidth=scotts_b,
                            kernel='gaussian').fit(X_ica_ind)

        samps = kde.sample(2048)
        X_ref_ica_unmixed[:, j] = np.ndarray.flatten(samps)

    X_ref_ica = np.dot(X_ref_ica_unmixed, A_.T)

    return loglike_ica, X_ref_ica
Example #14
def test_KernelDensity_sampling(n_samples=100, n_features=3):
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))

    bandwidth = 0.2

    for kernel in ["gaussian", "tophat"]:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == "tophat":
            assert np.all(dist < bandwidth)
        elif kernel == "gaussian":
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)
Example #15
def test_KernelDensity_sampling(n_samples=100, n_features=3):
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)
Example #16
def KDE_resample(x, y, N, bandwidth=0.5):
    """
    Resample features based on Kernel Density approximation

    Parameters
    ----------
    X : numpy.ndarray
        Feature array
    y : numpy.ndarray
        Label array
    N : int
        Total samples to simulate (to be added to original sample)

    Returns
    -------
    newX : numpy.ndarray
        New Feature array
    newY : numpy.ndarray
        New label array
    """
    uys = np.unique(y)
    newX = np.zeros((int(N * len(uys)), np.size(x, axis=1)))
    newy = np.zeros((int(N * len(uys)), ))
    for i, uy in enumerate(uys):
        gind = np.where(y == uy)
        newX[i * N:i * N + len(gind[0]), :] = x[gind[0], :]
        newy[i * N:(i + 1) * N] = uy
        cx = x[gind[0], :]
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(cx)
        newX[i * N + len(gind[0]):(i + 1) * N] = kde.sample(n_samples=N -
                                                            len(gind[0]))
    return newX, newy
Example #17
def ICA_loglikes_samples_fitSA(arr):

    N_samp = np.shape(arr)[0]
    N_bins = np.shape(arr)[1]

    d = 1
    scotts_b = N_samp**(-1. / (d + 4))
    print('scotts_b = %.2f' % scotts_b)

    ica = FastICA(whiten=False, max_iter=1000, tol=1e-3)
    S_ = ica.fit_transform(arr)
    A_ = ica.mixing_
    #W_ica = ica.components_

    X_ref_ica_unmixed = np.zeros((N_samp, N_bins))
    X_ref_ica = np.zeros((N_samp, N_bins))

    loglike_ica = np.zeros(np.shape(arr)[0])

    for j in range(N_bins):

        X_ica_ind = S_[:, j].reshape(-1, 1)
        kde = KernelDensity(bandwidth=scotts_b,
                            kernel='gaussian').fit(X_ica_ind)

        samps = kde.sample(N_samp)  #samples for the jth bin
        X_ref_ica_unmixed[:, j] = np.ndarray.flatten(samps)

        log_dens = kde.score_samples(X_ica_ind)  #.reshape(len(X_ica_ind),1))
        loglike_ica += log_dens

    X_ref_ica = np.dot(X_ref_ica_unmixed,
                       A_.T)  #should be akin to samples X^mock.

    return loglike_ica, X_ref_ica
Example #18
def kde_estimator(X, y, random_state=None, kernel='gaussian'):
    n_classes = len(np.unique(y))
    lst_1 = [len(np.where(y==clase)[0]) for clase in range(n_classes)]
    lst_2 = [max(lst_1)-x for x in lst_1]
    X_res = np.array([]).reshape(0, X.shape[-1])
    y_res = np.array([])
    for i in range(n_classes):
        if lst_2[i]==0:
            X_res = np.concatenate([
                X_res,
                X[np.where(y==i)], 
            ])
            y_res = np.concatenate([
                y_res,
                y[np.where(y==i)],
            ])             
        else:
            print("CLASS:", i)
            kde = KernelDensity(kernel=kernel, bandwidth=0.2).fit(X[np.where(y==i)])
            X_res = np.concatenate([
                X_res,
                X[np.where(y==i)], 
                kde.sample(n_samples=lst_2[i], random_state=random_state),
            ])
            y_res = np.concatenate([
                y_res,
                y[np.where(y==i)],
                np.array([i for _ in range(lst_2[i])]),
            ])
    return X_res, y_res
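Assuming the kde_estimator defined above is in scope, a usage sketch on a toy imbalanced dataset might look like this (class 1 is oversampled with KDE draws until both classes have 30 examples):

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(30, 4), rng.randn(10, 4) + 3.0])
y = np.array([0] * 30 + [1] * 10)

X_res, y_res = kde_estimator(X, y, random_state=0)
print(X_res.shape, np.bincount(y_res.astype(int)))  # (60, 4) and [30 30]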
Example #19
class KDE():
    def __init__(self, **kwargs):
        """ Kernel Density Estimation (parzen windows).
        """
        self.model = KernelDensity(**kwargs)
        self.columns = None
        self.indexes = None

    def fit(self, data, **kwargs):
        """ Train the generator with data.

            :param data: The training data.
        """
        self.columns = data.columns
        self.indexes = data.indexes
        self.model.fit(data, **kwargs)

    def sample(self, n=1, **kwargs):
        """ Sample from trained KDE.

            :param n: Number of examples to sample.
        """
        if self.indexes is None:
            raise Exception(
                'You first need to train the KDE before sampling. Please use the fit method.'
            )
        else:
            gen_data = self.model.sample(n, **kwargs)
            return autopandas.AutoData(gen_data,
                                       columns=self.columns,
                                       indexes=self.indexes)
Example #20
def kde_fit_quantiles(rtquants, nsamples=1000, bw=.1):
    """ takes quantile estimates and fits cumulative density function
    returns samples to pass to sns.kdeplot()
    """
    kdefit = KernelDensity(kernel='gaussian', bandwidth=bw).fit(rtquants)
    samples = kdefit.sample(n_samples=nsamples).flatten()
    return samples
Example #21
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan',
                           'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights have a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
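A minimal sketch of the weighted fit-and-sample pattern this test covers (sample_weight support assumed available, i.e. scikit-learn >= 0.20):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.rand(100, 2)
w = 1.0 + rng.rand(100)  # arbitrary positive weights

# Draws are made in proportion to sample_weight.
kde = KernelDensity(bandwidth=0.3).fit(X, sample_weight=w)
draws = kde.sample(10, random_state=0)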
Example #22
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan',
                           'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights have a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
Example #23
def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T
    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)
    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    percentile = np.sum(log_pdf < kde.score(data_point))/10000.
    return (percentile)
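For orientation, the percentile computed above is the fraction of KDE draws whose log-density falls below that of the query point; a self-contained sketch of the same idea (synthetic 3-D data, arbitrary bandwidth):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
pts = rng.randn(500, 3)
query = np.array([[0.1, 0.2, -0.3]])

kde = KernelDensity(bandwidth=0.3).fit(pts)
draws = kde.sample(10000, random_state=0)
frac_below = np.mean(kde.score_samples(draws) < kde.score_samples(query))
print(frac_below)  # values near 1.0 mean the query sits in a high-density region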
Example #24
    def GetDensity(self, action='generate', samples=100, draws=None):
        """
        TODO: Check density calculations for multiple dimensions.
        Generate a density estimation of the positions at each time or sample positions
        from the generated density at a specified time.
        
        Parameters
        ----------
        action : (string) - Options: 'generate', 'sample'.
            'generate' : Generate a density estimation using kernel density estimation and save it.
            'sample' : Generate a density estimation at the final time and both draw and return samples
                from it equal to the number of points in position.
        samples : (int) - The number of sample points in each dimension at which to measure the density.
            Total number of points is samples ** dimensions.
        draws : (int) - The number of points to draw from the density distribution, if None,
            draw a number of points equal to the number of points in Positions.
            
        Returns
        -------
        'generate'        
            DensitySamples : (np.array) - An array of the positions of the points used to sample the density.
            Density : (np.array) - The value of the density evaluated at each points in DensitySamples.
        'sample'
            samples : (np.array) - An array of the samples drawn from the density generated from the positions
                at the final time.
        """
        if action == 'generate':
            #           A list of sample arrays ranging from the min value to the max value in each dimension.
            minmax = [
                np.linspace(np.amin(self.Positions[:, i, :]),
                            np.amax(self.Positions[:, i, :]), samples)
                for i in range(self.Positions.shape[1])
            ]
            self.DensitySamples = np.array(list(product(*minmax)))
            self.Density = np.zeros(
                (self.DensitySamples.shape[0], self.Times.shape[0]))

            for i in range(self.Positions.shape[2]):
                bandwidth = 0.2 * np.mean(pdist(self.Positions[:, :, i]))
                KDE = KernelDensity(bandwidth=bandwidth,
                                    kernel='gaussian',
                                    metric='euclidean')
                KDE.fit(self.Positions[:, :, i])
                self.Density[:, i] = np.exp(
                    KDE.score_samples(self.DensitySamples))

        elif action == 'sample':
            bandwidth = min(pdist(self.Positions[-1, :][:, np.newaxis]))
            KDE = KernelDensity(bandwidth=bandwidth,
                                kernel='gaussian',
                                metric='euclidean')
            KDE.fit(self.Positions[-1, :][:, np.newaxis])
            if draws is None:
                draws = self.Positions.shape[1]
            return KDE.sample(draws)
Example #25
def generate_fit(x, y, n=100000, bandwidth=0.1, nbins=15, xmin=-1, xmax=2.5):
    data = np.vstack([x, y]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data)
    sample = kde.sample(n)

    x_fit, median, mean, std = calculate_median_profile(sample[:, 0],
                                                        sample[:, 1],
                                                        xmin=xmin,
                                                        xmax=xmax,
                                                        nbins=nbins)
    return sample, x_fit, median, mean, std
Example #26
    def downsample(self, X, n):
        # we've already fit()ted, but we're worried that our X is so
        # large our classifier will be too slow in practice. we can
        # downsample by running a kde on X and sampling from it (this
        # will be slow, but happens only once), and then using those
        # points as the new X.
        if len(X) < n:
            return X
        kde = KernelDensity()
        kde.fit(X)
        return kde.sample(n)
Example #27
def pval_silverman(data, I='auto', N_bootstrap=1000, comm=MPI.COMM_WORLD):
    I = get_I(data, I)
    data = comm.bcast(data)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: is_unimodal_kde(
        h_crit, KDE_h_crit.sample(len(data)).ravel()/np.sqrt(1+h_crit**2/var_data), I)
    smaller_equal_crit_bandwidth = bootstrap(resamp_fun, N_bootstrap, dtype=np.bool_,
                                             comm=comm)
    return np.mean(~smaller_equal_crit_bandwidth)
Example #28
    def downsample(self, X, n):
        # we've already fit()ted, but we're worried that our X is so
        # large our classifier will be too slow in practice. we can
        # downsample by running a kde on X and sampling from it (this
        # will be slow, but happens only once), and then using those
        # points as the new X.
        if len(X) < n:
            return X
        kde = KernelDensity()
        kde.fit(X)
        return kde.sample(n)
Example #29
def copula_generate(X):
    print('X marginals to uniforms...')
    X = matrix_to_rank(X)
    print('X uniforms to inverse gaussian cdf...')
    X = rank_matrix_to_inverse(X)
    print('Gaussian Kernel Density Estimation...')
    kernel = KernelDensity().fit(X)
    print('Generating artificial data \n Sampling from KDE distribution...')
    X_artif = kernel.sample(X.shape[0])
    print('Marginal retrofitting...')
    return marginal_retrofit(X_artif, X)
Example #30
class KDEDist(object):
    def __init__(self, bw, kernel='gaussian'):
        self._bw = bw
        self._kernel = kernel
        self._kd = KernelDensity(bandwidth=bw, kernel=kernel)
        self._samples = None

    @staticmethod
    def bw_range(x, n=3):
        max_pwr = 2
        h_opt = np.std(x) * (4. / (3. * len(x)))**0.2
        pwrs = np.concatenate([
            np.linspace(-max_pwr, 0, n + 1),
            np.linspace(0, max_pwr, n + 1)[1:]
        ])
        return h_opt * 2**pwrs

    @property
    def name(self):
        return 'KDE({}, {:.5f})'.format(self._kernel, self._bw)

    @property
    def samples(self):
        if self._samples is None:
            self._samples = self.rvs(100000)
        return self._samples

    def dist(self):
        return self

    def fit(self, x):
        self._kd.fit(np.reshape(x, (len(x), 1)))
        return self

    def logpdf(self, x):
        return self._kd.score_samples(np.reshape(x, (len(x), 1)))

    def rvs(self, n):
        return self._kd.sample(n).reshape(n)

    def stats(self, moments='mv'):
        out = []
        if 'm' in moments:
            out.append(np.array([np.mean(self.samples)]))
        if 'v' in moments:
            out.append(np.array([np.var(self.samples)]))
        if 's' in moments:
            out.append(np.array([skew(self.samples)]))
        if 'k' in moments:
            out.append(np.array([kurtosis(self.samples)]))
        return tuple(out)

    def ppf(self, q):
        return np.percentile(self.samples, q)
Example #31
def generate_samples(X, size=100):
    '''
    Generate new sample from the same distribution of original data
    :param X: the original data
    :param size: size of new samples
    :return: data: new sampled data
    '''
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01) # kernel density estimation (0.01: bandwidth of the kernel)
    kde.fit(X)  # fit the kernel density model on the data

    data = kde.sample(size) # generate new random samples from the model
    return kde, data
Example #32
class colorKDE(object):
	def __init__(self,data=np.array([])):
		self.data = data
	
		
	def runKDE(self,bandwidth=0.2,use_opt=False):
		'''
		Generate the KDE and run with the given bandwidth

		If use_opt is specified, runCVSearch must have been run already
		'''
		if use_opt:
			self.kde = KernelDensity(bandwidth=self.optimal_bandwidth)
		else:
			self.kde = KernelDensity(bandwidth=bandwidth)
		
		self.kde.fit(self.data)
		
	def runCVSearch(self,search_range=np.linspace(0.01,1.0,50),folds=20):
		self.grid = GridSearchCV(KernelDensity(),{'bandwidth':search_range},\
			cv=folds)
		self.grid.fit(self.data)
		self.optimal_bandwidth=self.grid.best_params_['bandwidth']
		print('Optimal bandwidth: ' + str(self.optimal_bandwidth))
		
	def score_samples(self,x):
		'''
		Replicate score_samples functionality so both saves
		can be treated the same
		'''
		return self.kde.score_samples(x)
		
	def sample(self,n_samples):
		'''
		Replicate samples functionality so both saves
		can be treated the same
		'''
		return self.kde.sample(n_samples=n_samples)
		
	
	def save(self,filename,full=True):
		'''
		Save current state of the object
		
		If full is false, only save self.kde
		'''
		if full:
			#save the entire object, including data
			pickle.dump(self,open(filename,'wb'),protocol=-1)
			
		else:
			#only save the .kde object
			pickle.dump(self.kde,open(filename,'wb'),protocol=-1)
Example #33
def montecarlo():
    """
    Run Monte Carlo simulation. Reads parameters from the Excel sheet Params and performs price simulations
    over one year. Finally, plots the simulation results and adds the plot to the sheet.
    :return: None
    """
    # Get the Excel work book
    wb = xw.Book.caller()

    # Get params
    ticker, start_date, end_date = get_params(book=wb, sheet_name="Params")

    # Get adj closes
    closes = get_adj_closes(ticker, start_date, end_date)

    # Calculate simple daily returns
    ret = closes.pct_change().dropna()
    # Estimate density with Gaussian kernels
    kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(ret)
    # Returns simulation
    n_days, n_sim = 252, 100000
    d_range = pd.date_range(start=closes.index[-1] + pd.Timedelta(days=1),
                            periods=n_days)
    ret_sim = pd.DataFrame(data=kde.sample(n_samples=n_days * n_sim).reshape(
        (n_days, n_sim)),
                           index=d_range)
    # To prices
    closes_sim = (closes.iloc[-1].values[0]) * (1 + ret_sim).cumprod()

    # Get 5% - 95% percentile bands
    band_5 = pd.DataFrame(
        data={'5% band': np.percentile(closes_sim, 5, axis=1)}, index=d_range)
    band_95 = pd.DataFrame(
        data={'95% band': np.percentile(closes_sim, 95, axis=1)},
        index=d_range)

    # Plot past prices, bands and prices scenarios
    fig = plt.figure(figsize=(6, 4))
    plt.plot(closes.iloc[-100:], label='Historical Adj Close')
    plt.plot(band_5, label='5% Percentile Band')
    plt.plot(band_95, label='95% Percentile Band')
    plt.plot(closes_sim.sample(10, axis=1), label='Price Scenarios')
    plt.xlabel('Time')
    plt.ylabel('Price')
    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))

    # Add the plot to the sheet
    sheet = wb.sheets['Params']
    sheet.pictures.add(fig,
                       name='Montecarlo Simulation',
                       update=True,
                       left=sheet.range('B9').left,
                       top=sheet.range('B9').top)
Example #34
def get_numerical_signature(values, S):
    '''
    Learns a distribution of the values
    Then generates a sample of size S
    '''
    # Transform data to numpy array
    Xnumpy = np.asarray(values)
    X = Xnumpy.reshape(-1, 1)
    # Learn kernel
    kde = KernelDensity(kernel=C.kd["kernel"],
                        bandwidth=C.kd["bandwidth"]).fit(X)
    sig_v = [kde.sample()[0][0] for x in range(S)]
    return sig_v
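A note on the comprehension above: all S draws can also come from a single vectorized call, which is usually faster; a sketch with placeholder kernel and bandwidth values standing in for the C.kd configuration:

import numpy as np
from sklearn.neighbors import KernelDensity

values = np.random.RandomState(0).exponential(size=500)
X = np.asarray(values).reshape(-1, 1)
S = 20

kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
# One call replaces the S separate kde.sample()[0][0] lookups.
sig_v = kde.sample(n_samples=S, random_state=0).ravel().tolist()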
Example #35
class colorKDE(object):
    def __init__(self, data=np.array([])):
        self.data = data

    def runKDE(self, bandwidth=0.2, use_opt=False):
        '''
		Generate the KDE and run with the given bandwidth

		If use_opt is specified, runCVSearch must have been run already
		'''
        if use_opt:
            self.kde = KernelDensity(bandwidth=self.optimal_bandwidth)
        else:
            self.kde = KernelDensity(bandwidth=bandwidth)

        self.kde.fit(self.data)

    def runCVSearch(self, search_range=np.linspace(0.01, 1.0, 50), folds=20):
        self.grid = GridSearchCV(KernelDensity(),{'bandwidth':search_range},\
         cv=folds)
        self.grid.fit(self.data)
        self.optimal_bandwidth = self.grid.best_params_['bandwidth']
        print('Optimal bandwidth: ' + str(self.optimal_bandwidth))

    def score_samples(self, x):
        '''
		Replicate score_samples functionality so both saves
		can be treated the same
		'''
        return self.kde.score_samples(x)

    def sample(self, n_samples):
        '''
		Replicate samples functionality so both saves
		can be treated the same
		'''
        return self.kde.sample(n_samples=n_samples)

    def save(self, filename, full=True):
        '''
		Save current state of the object
		
		If full is false, only save self.kde
		'''
        if full:
            #save the entire object, including data
            pickle.dump(self, open(filename, 'wb'), protocol=-1)

        else:
            #only save the .kde object
            pickle.dump(self.kde, open(filename, 'wb'), protocol=-1)
Example #36
def generate_data_ae(generator, encoder, org_data, n):
    encoded = encoder.predict(org_data)
    input_vectors_list = []

    kde = KernelDensity().fit(encoded)
    new_col = kde.sample(n)
    input_vectors_list.append(new_col)

    input_data = np.column_stack(input_vectors_list)

    generated_data = generator.predict(input_data)
    new_data = pd.DataFrame(data=generated_data,
                            columns=list(org_data.columns))
    return new_data
Example #37
    class XSampleFMBW(XSampleBW):
        def __init__(self, N, comm=MPI.COMM_SELF):
            self.comm = comm
            self.rank = self.comm.Get_rank()
            self.I = (-1.5, a + 1)  # CHECK: Is appropriate bound? OK.
            self.lamtol = 0
            self.mtol = mtol
            self.N = N
            if self.rank == 0:
                N1 = binom.rvs(N, 2.0 / 3)
                #print "N1 = {}".format(N1)
                N2 = N - N1
                data = np.hstack(
                    [np.random.randn(N1),
                     np.random.randn(N2) + a])
            else:
                data = None
            data = self.comm.bcast(data)
            self.data = data
            self.var = np.var(data)
            self.h_crit = fisher_marron_critical_bandwidth(
                data, self.lamtol, self.mtol, self.I)
            #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
            self.kde_h_crit = KernelDensity(kernel='gaussian',
                                            bandwidth=self.h_crit).fit(
                                                data.reshape(-1, 1))

        def is_unimodal_resample(self, lambda_val):
            data = self.kde_h_crit.sample(
                self.N).reshape(-1) / np.sqrt(1 + self.h_crit**2 / self.var)
            #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
            return is_unimodal_kde_fm(self.h_crit * lambda_val, data,
                                      self.lamtol, self.mtol, self.I)

        def probability_of_unimodal_above(self, lambda_val, gamma):
            '''
                G_n(\lambda) = P(\hat h_{crit}^*/\hat h_{crit} <= \lambda)
                             = P(\hat h_{crit}^* <= \lambda*\hat h_{crit})
                             = P(KDE(X^*, \lambda*\hat h_{crit}) is unimodal)
            '''
            # print "bootstrapping 1000 samples at rank {}:".format(self.rank)
            # smaller_equal_crit_bandwidth = bootstrap(lambda: self.is_unimodal_resample(lambda_val), 1000, dtype=np.bool_)
            # pval = np.mean(~smaller_equal_crit_bandwidth)
            # print "result at rank {}: pval = {}".format(self.rank, pval)+"\n"+"-"*20
            return probability_above(
                lambda: self.is_unimodal_resample(lambda_val),
                gamma,
                max_samp=20000,
                comm=self.comm,
                batch=20)
Example #38
class ManoDatasetC(Dataset):
    def __init__(self, base_path, transform, train_indices):
        self.transform = transform

        mano_path = os.path.join(base_path, '%s_mano.json' % 'training')
        mano_list = json_load(mano_path)
        mano_array = np.array(mano_list).squeeze(1)
        mano_poses = mano_array[..., :51]

        mano_poses = mano_poses[train_indices]

        self.kde = KernelDensity(bandwidth=0.15, kernel='gaussian')
        self.kde.fit(mano_poses)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.mano_layer = ManoLayer(
            mano_root='mano/models', use_pca=False, ncomps=45, flat_hand_mean=False)

        self.mano_layer.to(self.device)

    def __len__(self):
        return 32560

    def __getitem__(self, idx):
        sample = self.kde.sample()
        pose = sample[..., :48]
        shape_start = sample[..., 48:]
        shape = np.ones([1, 10])
        shape[..., :3] = shape_start

        x = {
            'p': pose,
            's': shape
        }
        x = self.transform(x)

        hand_verts, hand_joints = self.mano_layer(x['p'], x['s'])
        batch_size = hand_joints.shape[0]
        hand_joints = hand_joints.reshape([batch_size, 63])

        sample = {
            'hand_joints': torch.squeeze(hand_joints),
            'hand_verts': torch.squeeze(hand_verts),
            'poses': torch.squeeze(x['p']),
            'shapes': torch.squeeze(x['s'])
        }

        return sample
Example #39
def test_silverman_adaptive_resampling(data, alpha, I='auto',
                                       N_bootstrap_max=10000, comm=MPI.COMM_WORLD):
    data = comm.bcast(data)
    I = get_I(data, I)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit, KDE_h_crit.sample(len(data)).ravel()/np.sqrt(1+h_crit**2/var_data), I)
    try:
        return float(probability_above(resamp_fun, alpha, max_samp=N_bootstrap_max, comm=comm,
                     batch=100, bound_significance=0.05, exception_at_max_samp=True,
                     printing=False))
    except MaxSampExceededException:
        return alpha
Example #40
def pval_calibrated_bandwidth(data, alpha_cal, null, I='auto',
                              N_bootstrap=1000, comm=MPI.COMM_WORLD,
                              calibration_file=None):
    '''
        NB!: Test is only calibrated to correct level for alpha_cal.
    '''
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        lambda_alpha = load_lambda('bw_ad', null, alpha_cal, calibration_file)
    except KeyError:
        lambda_alpha = load_lambda('bw', null, alpha_cal, calibration_file)
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: is_unimodal_kde(
        h_crit*lambda_alpha, KDE_h_crit.sample(len(data)).ravel()/np.sqrt(1+h_crit**2/var_data), I)
    smaller_equal_crit_bandwidth = bootstrap(resamp_fun, N_bootstrap, dtype=np.bool_, comm=comm)
    return np.mean(~smaller_equal_crit_bandwidth)
Example #41
    class XSampleFMBW(XSampleBW):

        def __init__(self, N, comm=MPI.COMM_SELF):
            self.comm = comm
            self.rank = self.comm.Get_rank()
            self.I = (-1.5, a+1)  # CHECK: Is appropriate bound? OK.
            self.lamtol = 0
            self.mtol = mtol
            self.N = N
            if self.rank == 0:
                N1 = binom.rvs(N, 2.0/3)
                #print "N1 = {}".format(N1)
                N2 = N - N1
                data = np.hstack([np.random.randn(N1), np.random.randn(N2)+a])
            else:
                data = None
            data = self.comm.bcast(data)
            self.data = data
            self.var = np.var(data)
            self.h_crit = fisher_marron_critical_bandwidth(data, self.lamtol, self.mtol, self.I)
            #print_all_ranks(self.comm, "self.h_crit = {}".format(self.h_crit))
            self.kde_h_crit = KernelDensity(kernel='gaussian', bandwidth=self.h_crit).fit(data.reshape(-1, 1))

        def is_unimodal_resample(self, lambda_val):
            data = self.kde_h_crit.sample(self.N).reshape(-1)/np.sqrt(1+self.h_crit**2/self.var)
            #print "np.var(data)/self.var = {}".format(np.var(data)/self.var)
            return is_unimodal_kde_fm(self.h_crit*lambda_val, data, self.lamtol, self.mtol, self.I)

        def probability_of_unimodal_above(self, lambda_val, gamma):
            '''
                G_n(\lambda) = P(\hat h_{crit}^*/\hat h_{crit} <= \lambda)
                             = P(\hat h_{crit}^* <= \lambda*\hat h_{crit})
                             = P(KDE(X^*, \lambda*\hat h_{crit}) is unimodal)
            '''
            # print "bootstrapping 1000 samples at rank {}:".format(self.rank)
            # smaller_equal_crit_bandwidth = bootstrap(lambda: self.is_unimodal_resample(lambda_val), 1000, dtype=np.bool_)
            # pval = np.mean(~smaller_equal_crit_bandwidth)
            # print "result at rank {}: pval = {}".format(self.rank, pval)+"\n"+"-"*20
            return probability_above(lambda: self.is_unimodal_resample(lambda_val),
                                     gamma, max_samp=20000, comm=self.comm, batch=20)
Example #42
def test_calibrated_bandwidth_adaptive_resampling(data, alpha, null, I='auto',
                                                  N_bootstrap_max=10000, comm=MPI.COMM_WORLD,
                                                  calibration_file=None):
    data = comm.bcast(data)
    I = get_I(data, I)
    try:
        lambda_alpha = load_lambda('bw_ad', null, alpha, calibration_file)
        # loading lambda computed with adaptive probabilistic bisection search
    except KeyError:
        lambda_alpha = load_lambda('bw', null, alpha, calibration_file)
           # loading lambda computed with probabilistic bisection search
    h_crit = critical_bandwidth(data, I)
    var_data = np.var(data)
    KDE_h_crit = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1))
    resamp_fun = lambda: not is_unimodal_kde(
        h_crit*lambda_alpha, KDE_h_crit.sample(len(data)).ravel()/np.sqrt(1+h_crit**2/var_data), I)
    try:
        return float(probability_above(resamp_fun, alpha, max_samp=N_bootstrap_max, comm=comm,
                     batch=100, bound_significance=0.05, exception_at_max_samp=True,
                     printing=False))
    except MaxSampExceededException:
        return alpha
Example #43
result, stats = get_standart_deviation(delta, PHAT_targets_valid[:, 0], method="full")
print(result)

full_set = np.hstack((PHAT_features_train, PHAT_targets_train))

# bring all magnitudes to redshift range
rescaled_set = np.copy(full_set)
rescaled_set[:, 0:-1] = rescaled_set[:, 0:-1]  # *feature_av
rescaled_set[:, -1] = rescaled_set[:, -1]

# Draw a sample set every time
kde = KernelDensity(bandwidth=0.001)
kde.fit(rescaled_set)
for i in range(500, 9000, 2000):
    aug_data = kde.sample(i)
    # aug_data = np.vstack((aug_data, full_set))

    # initialize predictor
    tree_para = {"min_samples_leaf": 5}
    clf = AdaBoostRegressor(DecisionTreeRegressor(**tree_para), loss="exponential", n_estimators=20)

    # fit predictor
    clf.fit(aug_data[:, 0:-1], aug_data[:, -1])
    predicted_aug = clf.predict(PHAT_features_valid)

    # collect stats
    delta_aug = predicted_aug - PHAT_targets_valid[:, 0]
    feature_imp_aug = clf.feature_importances_

    result_aug, stats_aug = get_standart_deviation(delta_aug, PHAT_targets_valid[:, 0], method="full")
Example #44
def sklearn_log_density(sample_points, evaluation_points):
    """
    Estimate the log probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation points.

    *sample_points* is an [n x m] matrix.

    *evaluation_points* is the set of points at which to evaluate the kde.

    Note: if any dimension has all points equal then the entire distribution
    is treated as a dirac distribution with infinite density at each point.
    This makes the entropy calculation better behaved (narrowing the
    distribution increases the entropy) but is not so useful in other contexts.
    Other packages will (correctly) ignore dimensions of width zero.
    """
    # Ugly hack warning: if *evaluation_points* is an integer, then sample
    # that many points from the kde and return the log density at each
    # sampled point.  Since the code that uses this is looking only at
    # the mean log density, it doesn't need the sample points themselves.
    # This interface should be considered internal to the entropy module
    # and not used by outside functions.  If you need it externally, then
    # restructure the api so that the function always returns both the
    # points and the density, as well as any other function (such as the
    # density function and the sister function scipy_stats_density) so
    # that all share the new interface.

    from sklearn.neighbors import KernelDensity

    # Standardize data so we can use spherical kernels and uniform bandwidth
    data, mu, sigma = standardize(sample_points)

    # Note that sigma will be zero for dimensions w_o where all points are equal.
    # With P(w) = P(w, w_o) / P(w_o | w) and P(w_o) = 1 for all points in
    # the set, then P(w) = P(w, w_o) and we can ignore the zero dimensions.
    # However, as another ugly hack, we want the differential entropy to go
    # to -inf as the distribution narrows, so pretend that P = 0 everywhere.
    # Uncomment the following line to return the sample probability instead.
    ## sigma[sigma == 0.] = 1.

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    kde.fit(data)

    if isinstance(evaluation_points, int):
        # For generated points, they already follow the distribution
        points = kde.sample(n)
    elif evaluation_points is not None:
        # Standardized evaluation points to match sample distribution
        # Note: for dimensions where all sample points are equal, sigma
        # has been artificially set equal to one.  This means that the
        # evaluation points which do not match the sample value will
        # use the simple differences for the z-score rather than
        # pushing them out to plus/minus infinity.
        points = (evaluation_points - mu)/(sigma + (sigma == 0.))
    else:
        points = sample_points

    # Evaluate pdf, scaling the resulting density by sigma to correct the area.
    # If sigma is zero, return entropy as -inf;  this seems to not be the
    # case for discrete distributions (consider Bernoulli with p=1, q=0,
    #  => H = -p log p - q log q = 0), so need to do something else, both
    # for the kde and for the entropy calculation.
    with np.errstate(divide='ignore'):
        log_pdf = kde.score_samples(points) - np.sum(np.log(sigma))

    return log_pdf
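The bandwidth rule hard-coded above, and the Scott rule used in the ICA examples (#3, #13, #17), can be computed on their own; a short sketch:

def silverman_bandwidth(n, d):
    # Rule used in sklearn_log_density above: (n * (d + 2) / 4) ** (-1 / (d + 4)).
    return (n * (d + 2) / 4.0) ** (-1.0 / (d + 4))

def scott_bandwidth(n, d=1):
    # Scott's rule as used with the ICA components: n ** (-1 / (d + 4)).
    return n ** (-1.0 / (d + 4))

print(silverman_bandwidth(2048, 1), scott_bandwidth(2048, 1))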