Example 1

import copy

import numpy as np
import openturns as ot
from scipy.stats import truncnorm
from sklearn.neighbors import KernelDensity

class KdeSampler:
    """Kernel Density Estimation based sampler."""
    def __init__(self, sample=None, dim=2, n_sample_bin=1000, bw=1):
        """Sampler creation.

        A large bin of samples is used to draw new samples from via KDE.

        :param array_like sample: Sample to start from,
          shape (n_samples, n_features).
        :param int dim: Dimension of the parameter space.
        :param int n_sample_bin: Number of samples in the bin.
        :param float bw: Bandwidth of the KDE.
        """
        self.dim = dim
        if sample is None:
            self.space = [np.random.random_sample(self.dim)]
        else:
            self.space = sample

        self.n_samples = len(self.space)
        self.bw = bw
        self.bounds = np.array([[0] * self.dim, [1] * self.dim])

        self.kde = KernelDensity(kernel='gaussian',
                                 bandwidth=self.bw,
                                 metric='pyfunc',
                                 rtol=1e-4,
                                 metric_params={'func': self.metric_func})
        self.kde.fit(self.space)
        self.kde_ = copy.deepcopy(self.kde)

        # dists = [ot.Uniform(0, 1) for _ in range(self.dim)]
        # dists = ot.ComposedDistribution(dists)
        # lhs = ot.LHSExperiment(dists, n_sample_bin, True, True)
        # self.space_bin = np.array(lhs.generate())
        # # self.space_bin = np.array(ot.LowDiscrepancySequence(ot.SobolSequence(self.dim)).generate(n_sample_bin))
        # self.idx = list(range(n_sample_bin))

    def metric_func(self, x, other):
        """Inverse of Minkowsky with p=0.5."""
        p = 0.5

        # Bounds exclusion: a zero distance makes out-of-bounds points look
        # maximally dense to the KDE, so the rescaled PDF (see pdf()) vanishes
        # there and the sampler never keeps them.
        mask = np.logical_and(x >= self.bounds[0], x <= self.bounds[1])
        if not np.all(mask):
            return 0

        # Non-rectangular domain
        # if not 0.5 < np.sum(x) < 1:
        #     return 0

        # Minkowski
        dist = np.sum(abs(x - other)**p)**(1. / p)

        # Euclidean
        # dist = np.linalg.norm(x - other)

        # background = np.linalg.norm(x[1] - 0.8)
        # dist = 0
        # dist *= 1 / (background ) * 0.05

        # LHS constraint
        # if np.linalg.norm(x - other, -np.inf) <= 0.03 / (self.n_samples + 1):
        #     return 0

        # LHS + Limit influence
        # if (np.linalg.norm(x - other, -np.inf) <= 0.03 / (self.n_samples + 1)) and \
        #     (np.linalg.norm(x - other) <= 0.5 / (self.n_samples + 1) ** (1 / len(self.bounds[0]))):
        #     return 0

        return dist

    def pdf(self, x, kernel='gaussian'):
        """Scale PDF between 0 and 1."""
        pdf_base = np.exp(self.kde.score_samples(x))
        sigma_fin = self.bw / self.n_samples**(1 / self.dim)
        pdf = 1 - ((2 * np.pi)**(self.dim / 2) * sigma_fin**self.dim
                   * pdf_base * self.n_samples)  # gaussian
        # pdf = 1 - np.pi * sigma_fin ** 2 * pdf_base * self.n_samples  # tophat
        pdf[pdf < 0] = 0

        return pdf

    def sample_kde(self, n_samples=1):
        """Generate random samples from the model.

        :param int n_samples: Number of samples to generate.
        :return: List of samples.
        :rtype: array_like, shape (n_samples, n_features)
        """

        # proba = np.exp(self.kde.score_samples(self.space_bin))
        # proba = self.pdf(self.space_bin)
        # proba /= np.sum(proba)
        # idx = np.random.choice(self.idx, size=n_samples, p=proba)
        # return np.atleast_2d(self.space_bin[idx])

        def metropolis_accept(old, new):
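            # Standard Metropolis log-acceptance test. The truncated-normal
            # proposal below is not symmetric near the bounds, so a full
            # Metropolis-Hastings rule would also include the proposal ratio.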
            return np.log(np.random.uniform()) < new - old

        def proposal(x):
            lower, upper = -0.1, 1.1
            sigma = 0.3

            return np.array([
                truncnorm.rvs((lower - xi) / sigma, (upper - xi) / sigma,
                              loc=xi,
                              scale=sigma) for xi in x
            ]).reshape(1, -1)

        def metropolis(logp, n_samples, init):
            old = proposal(init)
            samples = []
            while len(samples) < n_samples:
                new = proposal(old)
                logp_old = logp(old)
                logp_new = logp(new)
                if metropolis_accept(logp_old, logp_new):
                    old = new
                    logp_old = logp_new

                if np.exp(logp_old) > 0:
                    samples.append(old)

            samples = np.atleast_2d(samples)[:n_samples].reshape(n_samples, -1)
            return samples

        # Restart
        # samples = np.random.random(self.bounds.shape[1]).reshape(1, -1)
        # while len(samples) < n_samples:
        #     samples_ = metropolis(self.kde.score_samples, n_samples // 1,
        #                           np.random.random(self.bounds.shape[1]))
        #     samples = np.concatenate([samples, samples_])

        # samples = metropolis(self.kde.score_samples, n_samples,
        #                      np.random.random(self.bounds.shape[1]))
        with np.errstate(divide='ignore', invalid='ignore'):
            samples = metropolis(lambda x: np.log(self.pdf(x)), n_samples,
                                 np.random.random(self.bounds.shape[1]))

        return samples

    def generate(self, n_samples=2):
        """Generate samples.

        Using the KDE, generate new samples following its PDF.
        The candidate giving the best improvement in terms of discrepancy
        is kept.

        Update the KDE after each new sample is added to the design.

        :param int n_samples: Number of samples to generate.
        :return: Sample.
        :rtype: array_like, shape (n_samples, n_features)
        """
        self.kde = copy.deepcopy(self.kde_)
        sample = list(copy.deepcopy(self.space))
        self.n_samples = len(sample)

        for _ in range(n_samples - 1):
            sample_ = self.sample_kde(500)

            self.sample_ = sample_
            self.kde_prev = copy.deepcopy(self.kde)

            # Normal strategy
            # disc = [ot.SpaceFillingPhiP(1000).evaluate(np.vstack([sample, s]))
            #         for s in sample_]

            # disc = [Space.discrepancy(np.vstack([sample, s]), method='WD')
            #         for s in sample_]

            disc = [
                ot.SpaceFillingC2().evaluate(np.vstack([sample, s]))
                for s in sample_
            ]

            # Subprojections
            # disc = [discrepancy_2D(np.vstack([sample, s]))
            #         for s in sample_]

            # Sobol consideration
            # disc = [ot.SpaceFillingC2().evaluate(np.concatenate([np.array(sample)[:, 0].reshape(-1, 1), np.array(s)[0].reshape(1, 1)]))
            #         for s in sample_]

            sample.append(sample_[np.argmin(disc)])

            # For constraint
            # disc = [ot.SpaceFillingMinDist().evaluate(np.vstack([sample, s]))
            #         for s in sample_]

            # Max probability point
            # disc = self.kde_.score_samples(sample_)

            # sample.append(sample_[np.argmax(disc)])

            self.n_samples = len(sample)
            self.kde.set_params(bandwidth=self.bw / self.n_samples**(1 / 2),
                                metric_params={'func': self.metric_func})
            self.kde.fit(sample)

        return np.array(sample)
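
A minimal usage sketch (not part of the original example; assumes the imports
above and uses small sizes to keep the pyfunc-metric KDE affordable):

if __name__ == '__main__':
    seed = np.random.random((5, 2))          # 5 seed points in [0, 1]^2
    sampler = KdeSampler(sample=seed, dim=2, bw=1)
    design = sampler.generate(n_samples=8)   # adds n_samples - 1 = 7 points
    print(design.shape)                      # (12, 2): 5 seed + 7 new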

Example 2

import numpy as np
from sklearn.neighbors import KernelDensity

class KDEestimator:
    """
    An interface for generating random numbers according
    to a given Kernel Density Estimation (KDE) parametrization based on the 
    data. 
    """
    def __init__(self, bandwidth=1.0):
        self.bandwidth = bandwidth
        self.model = KernelDensity(bandwidth=self.bandwidth)

    def _botev_fixed_point(self, t, M, I, a2):
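        """Fixed-point equation from Botev, Grotowski & Kroese (2010).

        brentq in botev_bandwidth searches for a root t* of this function;
        the bandwidth is then recovered as sqrt(t*) times the data range.
        """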
        # Find the largest float available for this numpy
        if hasattr(np, 'float128'):
            large_float = np.float128
        elif hasattr(np, 'float96'):
            large_float = np.float96
        else:
            large_float = np.float64

        ell = 7
        I = large_float(I)
        M = large_float(M)
        a2 = large_float(a2)
        f = 2 * np.pi**(2 * ell) * \
            np.sum(I**ell * a2 * np.exp(-I * np.pi**2 * t))
        for s in range(ell, 1, -1):
            K0 = np.prod(np.arange(1, 2 * s, 2)) / np.sqrt(2 * np.pi)
            const = (1 + (1 / 2)**(s + 1 / 2)) / 3
            time = (2 * const * K0 / M / f)**(2 / (3 + 2 * s))
            f = 2 * np.pi ** (2 * s) * \
                np.sum(I ** s * a2 * np.exp(-I * np.pi ** 2 * time))
        return t - (2 * M * np.sqrt(np.pi) * f)**(-2 / 5)

    def finite(self, val):
        """ Checks if a value is finite or not """
        return val is not None and np.isfinite(val)

    def botev_bandwidth(self, data):
        """Implementation of the KDE bandwidth selection method outlined in:

        Z. I. Botev, J. F. Grotowski, and D. P. Kroese. *Kernel density
        estimation via diffusion.* The Annals of Statistics,
        38(5):2916-2957, 2010.

        Based on the implementation of Daniel B. Smith, PhD. The object is a
        callable returning the bandwidth for a 1D kernel.

        Forked from the package
        `PyQT_fit <https://code.google.com/archive/p/pyqt-fit/>`_.

        :param data: 1D array containing the data to model with a 1D KDE.
        :type data: numpy.ndarray
        :returns: Optimal bandwidth according to the data.
        """
        from scipy import fftpack, optimize
        N = 2**10
        minimum = np.min(data)
        maximum = np.max(data)
        span = maximum - minimum
        lower = minimum - span / 10
        upper = maximum + span / 10
        # Range of the data
        span = upper - lower

        # Histogram of the data to get a crude approximation of the density
        weights = None
        M = len(data)
        DataHist, bins = np.histogram(data,
                                      bins=N,
                                      range=(lower, upper),
                                      weights=weights)
        DataHist = DataHist / M
        DCTData = fftpack.dct(DataHist, norm=None)

        I = np.arange(1, N, dtype=int)**2
        SqDCTData = (DCTData[1:] / 2)**2
        guess = 0.1

        try:
            t_star = optimize.brentq(self._botev_fixed_point,
                                     0,
                                     guess,
                                     args=(M, I, SqDCTData))
        except ValueError:
            t_star = .28 * N**(-.4)

        return np.sqrt(t_star) * span

    def fit(self, x):
        """Select the bandwidth with botev_bandwidth and fit the 1D KDE."""
        self.bandwidth = self.botev_bandwidth(x.flatten())
        self.model.set_params(bandwidth=self.bandwidth)
        self.model.fit(x.reshape(-1, 1))

    def sample(self, n_samples=1):
        """Draw n_samples points from the fitted KDE."""
        return self.model.sample(n_samples)

    def pdf(self, x):
        """Evaluate the KDE density at x (score_samples returns the log)."""
        return np.exp(self.model.score_samples(np.asarray(x).reshape(-1, 1)))
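
A minimal usage sketch (not part of the original example; 1D data only):

if __name__ == '__main__':
    data = np.random.normal(size=500)    # 1D sample to model
    kde = KDEestimator()
    kde.fit(data)                        # Botev bandwidth, then KDE fit
    draws = kde.sample(n_samples=10)     # array of shape (10, 1)
    dens = kde.pdf(draws)                # density values at the draws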