Beispiel #1
0
    def __init__(self, data, bin_num=None, shift_num=50, normed=True, force_scott = False, rule = 'scott'):
        """Store the data and derive bins plus a KDE curve for it.

        Parameters
        ----------
        data : sequence of numbers
            Samples to summarise.  The rule-of-thumb branch calls
            ``data.std()``, so that branch needs an object with a ``std``
            method (presumably a numpy array -- confirm with callers).
        bin_num : int or None, optional
            Explicit number of bins, forwarded to ``set_bins``.  When None
            the bandwidth, and from it the bins, are chosen automatically.
        shift_num : int, optional
            Stored on the instance as ``shift_num``; not used in this method.
        normed : bool, optional
            Stored on the instance as ``normed``; not used in this method.
        force_scott : bool, optional
            When true, skip the ``kde``-based bandwidth and use ``rule``.
        rule : str, optional
            Bandwidth rule handed to ``gaussian_kde.set_bandwidth`` in the
            rule-of-thumb branch.
        """
        self.data_min = min(data)
        self.data_max = max(data)
        self.shift_num = shift_num
        self.data = data
        self.data_len = len(self.data)
        self.normed = normed

        # If None use KDE to autobin
        if bin_num is None:
            # Only run the automatic KDE when its result can actually be
            # used; with too few samples or force_scott it was discarded.
            use_auto_kde = self.data_len >= 40 and not force_scott
            kde_result = kde(self.data) if use_auto_kde else None
            if kde_result:
                self.bw, self.kde_mesh, self.kde_den = kde_result
                self.bins_from_bw()
                # Re-evaluate the KDE restricted to the range of the mesh
                # that bins_from_bw() is expected to have set as ash_mesh.
                self.bw2, self.kde_mesh, self.kde_den = kde(
                    self.data, None, self.ash_mesh.min(), self.ash_mesh.max())
            else:
                print("Using Scott's rule")
                kernel = stats.gaussian_kde(self.data)
                kernel.set_bandwidth(rule)
                # kde factor is bandwidth scaled by sigma
                self.bw = kernel.factor * self.data.std()
                self.bins_from_bw()
                self.kde_mesh = self.ash_mesh
                self.kde_den = kernel(self.kde_mesh)
        else:
            print("Using bin number: ", bin_num)
            # set_bins() is expected to define self.bw and self.ash_mesh,
            # both of which are read below -- confirm in set_bins.
            self.set_bins(bin_num)

            kernel = stats.gaussian_kde(self.data)
            kernel.set_bandwidth(self.bw)
            self.kde_mesh = self.ash_mesh
            self.kde_den = kernel(self.kde_mesh)
Beispiel #2
0
def requirement1():
    """Draw the three "Requirement 1" figures: histogram, KDE and kNN.

    For each sample size in 100, 500, 1000 and 10000 a fresh data set is
    drawn with ``get_data`` and its density estimate is overlaid on the
    curve produced by ``plot_true_distribution``.  The first two figures are
    saved as ``req1-1`` and ``req1-2``, the third as ``req1-3``; each one is
    also shown on screen.

    Reads the module-level ``min_range`` and ``max_range`` (never assigns
    them) to build the evaluation grid.
    """
    sample_sizes = [100, 500, 1000, 10000]
    bin_count = 100
    bandwidth = 0.1
    neighbours = 10

    grid = np.linspace(min_range, max_range, 200)

    # --- Histogram estimate -------------------------------------------
    labels = []
    # Result is not used; the call is retained so that any state it
    # advances (e.g. a random generator) is left exactly as before.
    get_data(200)
    plot_true_distribution(1000)
    labels.append('True distribution')
    for size in sample_sizes:
        plt.hist(get_data(size), density=True, bins=bin_count, alpha=0.4)
        labels.append('#bin = {}, #data = {}'.format(bin_count, size))
    plt.legend(labels)
    plt.title('Requirement 1-1')
    plt.savefig('req1-1', dpi=300)
    plt.show()

    # --- Kernel density estimate --------------------------------------
    plt.figure()
    labels = []
    warmup = get_data(200)
    plot_true_distribution(1000)
    labels.append('True distribution')
    # Estimator built from the warm-up sample is discarded, as before.
    kde(warmup)
    for size in sample_sizes:
        estimator = kde(get_data(size))
        estimator.set_bandwidth(bandwidth)
        plt.plot(grid, estimator(grid))
        labels.append('h = {}, #data = {}'.format(bandwidth, size))
    plt.legend(labels)
    plt.title('Requirement 1-2')
    plt.savefig('req1-2', dpi=300)
    plt.show()

    # --- k-nearest-neighbour estimate ---------------------------------
    plt.figure()
    labels = []
    get_data(200)
    plot_true_distribution(1000)
    labels.append('True distribution')
    for size in sample_sizes:
        estimator = knn(get_data(size), neighbours)
        plt.plot(grid, estimator(grid))
        labels.append('k = {}, #data = {}'.format(neighbours, size))
    plt.legend(labels)
    plt.ylim([0, 0.4])
    plt.title('Requirement 1-3')
    plt.savefig('req1-3', dpi=300)
    plt.show()
Beispiel #3
0
    def __init__(self,
                 data,
                 bin_num=None,
                 shift_num=50,
                 density=True,
                 force_scott=False,
                 rule='scott',
                 weights=None):
        """Record the inputs, then choose bins and evaluate a KDE curve.

        With an explicit ``bin_num`` the bins come from ``set_bins``.
        Otherwise the bandwidth is taken from ``kde`` (needs at least 50
        samples, ``force_scott`` false and a truthy ``kde`` result), from
        the Freedman-Diaconis bin width when ``rule == 'fd'``, or from
        ``gaussian_kde`` with ``rule`` as its bandwidth method.

        ``shift_num``, ``density`` and ``weights`` are only stored on the
        instance here.
        """
        self.data_min = min(data)
        self.data_max = max(data)
        self.shift_num = shift_num
        self.data = data
        self.data_len = len(self.data)
        self.density = density
        self.weights = weights

        # Explicit bin count: no automatic bandwidth selection needed.
        if bin_num is not None:
            print("Using bin number: ", bin_num)
            self.set_bins(bin_num)

            kernel = stats.gaussian_kde(self.data)
            kernel.set_bandwidth(self.bw)
            self.kde_mesh = self.ash_mesh
            self.kde_den = kernel(self.kde_mesh)
            return

        # Automatic binning, first choice: bandwidth estimated by kde().
        kde_result = kde(self.data)
        if len(self.data) >= 50 and not force_scott and kde_result:
            self.bw, self.kde_mesh, self.kde_den = kde_result
            self._bins_from_bw()
            mesh_low = self.ash_mesh.min()
            mesh_high = self.ash_mesh.max()
            self.bw2, self.kde_mesh, self.kde_den = kde(
                self.data, None, mesh_low, mesh_high)
            return

        # Fallback: a rule-of-thumb bandwidth fed to scipy's gaussian_kde.
        if rule == 'fd':
            print("Using FD rule")
            kernel = stats.gaussian_kde(self.data)
            sample_count = len(self.data)
            self.bin_width = 2 * (stats.iqr(self.data) /
                                  (sample_count**(1 / 3)))
            self.bw_from_bin_width()
            kernel.set_bandwidth(self.bw)
        else:
            print("Using Scott's rule")
            kernel = stats.gaussian_kde(self.data)
            kernel.set_bandwidth(rule)
            # kde factor is bandwidth scaled by sigma
            self.bw = kernel.factor * self.data.std()

        # Both rule-of-thumb branches finish the same way.
        self._bins_from_bw()
        self.kde_mesh = self.ash_mesh
        self.kde_den = kernel(self.kde_mesh)
Beispiel #4
0
def requirement3():
    """Plot one KDE of a 200-point sample at bandwidths 0.1, 1 and 2.

    A single estimator is built with ``kde`` and its bandwidth is reset
    before each curve is drawn on top of the output of
    ``plot_true_distribution``.  The evaluation grid spans the module-level
    ``min_range`` .. ``max_range`` (read only).
    """
    sample = get_data(200)
    bandwidths = [0.1, 1, 2]

    grid = np.linspace(min_range, max_range, 200)
    estimator = kde(sample)

    plot_true_distribution(1000)
    labels = ['True distribution']

    # KDE with different h
    for width in bandwidths:
        estimator.set_bandwidth(width)
        plt.plot(grid, estimator(grid))
        labels.append('h = {}'.format(width))

    plt.legend(labels)
    plt.title('Requirement 3')
    plt.show()

    # Comment out to get Cross-validation KDE and Variable KDE
    '''
def get_bandwidth_fxn(data,period=None,nneighb=None,epses=2.**np.arange(-40,41),beta='-1/d',d=None):
    r"""
    Constructs a bandwidth function for a given dataset.  Performs a kernel
    density estimate q_\epsilon, and sets the bandwidth to q_\epsilon^beta.

    Parameters
    ----------
    data : 2D array-like
        Two-dimensional dataset used to create the diffusion map.
    period : 1D array-like or float, optional
        Period of the coordinate, e.g. 360 for an angle in degrees. If None,
        all coordinates are taken to be aperiodic.  If scalar, assumed to be
        period of each coordinate. If 1D array-like with each value a scalar
        or None, each coordinate has periodicity of that size.
    nneighb : int or None, optional
        Number of neighbors to include in constructing the diffusion map.
        Default is None, which corresponds to using all neighbors.
    epses : 1D array-like, optional
        Forwarded unchanged to ``kde.kde``; presumably the candidate
        epsilon values it searches over (confirm in ``kde.kde``).  Default
        is the powers of two from 2**-40 to 2**40.
    beta : float or string, optional
        Exponent applied to the density estimate.  ``0`` or ``'0'`` selects
        a uniform bandwidth.  Any other value is passed through
        ``_eval_param`` together with ``d`` (strings are treated as
        expressions to evaluate).  Default is '-1/d'.
    d : int or None, optional
        Dimension of the system. If None, dimension is estimated using the
        kde.

    Returns
    -------
    rho : 1d array
        The estimated bandwidth function; all ones in the uniform case.
    d : int or None
        The dimension that was used: the supplied ``d``, the kde's estimate
        when ``d`` was None, or None in the uniform case (no kde is run).
    """
    N = len(data)
    if beta == 0 or beta == '0':
        return np.ones(N),None  # Handle uniform bandwidth case.

    # Use q^beta as bandwidth, where q is an estimate of the density.
    # The third value returned by kde.kde is not needed here.
    q,d_est,_ = kde.kde(data,epses=epses,period=period,nneighb=nneighb,d=d)
    if d is None:
        d = d_est

    # If beta parameter is an expression, evaluate it and convert to float
    beta = _eval_param(beta,d)
    return q**beta,d
Beispiel #6
0
def plot_kde_uncertainty(data,
                         n_resamples=1000,
                         x_resolution=1000,
                         significance=0.05,
                         palette=sns.color_palette()):
    """
    Bootstrap a confidence interval for the KDE of the provided dataset, and plot along with the KDE.

    Parameters
    ----------
    data : sequence of numbers
        Samples whose density is estimated; also defines the plotted range
        (``min(data)`` to ``max(data)``).
    n_resamples : int
        Number of bootstrap resamples; must be at least 100.
    x_resolution : int
        Number of evenly spaced grid points the KDEs are evaluated on.
    significance : float
        Significance level alpha; the dashed curves are the alpha/2 and
        1 - alpha/2 bootstrap percentiles taken separately at each grid
        point.
    palette : sequence of colours
        Only its first entry is used, for all three curves.  The default is
        evaluated once, when the function is defined.
    """

    assert n_resamples >= 100

    x_grid = np.linspace(min(data), max(data), x_resolution)
    orig_kde = kde(data, x_grid)

    # One row per bootstrap resample, one column per grid point.
    resampled_kdes = np.zeros((n_resamples, x_resolution))

    for i in range(n_resamples):
        resample = sample_with_replacement(data)
        resampled_kdes[i] = kde(resample, x_grid)

    # sort each column independently to get per-grid-point percentiles
    resampled_kdes.sort(axis=0)

    def percentile_index(percentile, N):
        """
        Find the index of the x'th percentile in a sorted collection of size N.
        """

        assert 0 <= percentile <= 1

        # percentile * N rounds to N itself for percentiles close to 1
        # (tiny significance levels), which is one past the last valid
        # index; clamp so the upper bound falls back to the largest row.
        return min(int(np.round(percentile * N)), N - 1)

    def ci_index(alpha, N):
        """
        Find the indices in a sorted collection of size N of the two bounds of a confidence interval of significance level alpha.
        """

        return percentile_index(alpha / 2., N), percentile_index(
            (1. - alpha / 2.), N)

    i_lower, i_higher = ci_index(significance, n_resamples)
    plt.plot(x_grid, resampled_kdes[i_lower], '--', color=palette[0])
    plt.plot(x_grid, resampled_kdes[i_higher], '--', color=palette[0])

    plt.plot(x_grid, orig_kde, '-', color=palette[0])
    sns.despine()
Beispiel #7
0
    def find_Lines(self):
        """Detect circles in every numbered coin image and join them into lines.

        The number of ``*.jpg`` files in ``self.myPath`` decides how many
        images named ``Coins_img (1).jpg``, ``Coins_img (2).jpg``, ... are
        read.  Each image is resized with ``self.resize_img``, smoothed, and
        searched for circles with a Hough transform.  The x coordinates of
        the detected centres are passed to ``kde`` and the result, together
        with the centres, to ``self.make_clustered_array``; for every group
        it returns, a line is drawn between the member with the smallest y
        and the member with the largest y.  The annotated image is shown in
        a window that blocks until a key is pressed.

        Returns
        -------
        None
        """
        import os

        # glob.glob1 is an undocumented helper that newer Python versions
        # deprecate; the documented glob.glob yields the same count.
        jpg_pattern = os.path.join(glob.escape(self.myPath), "*.jpg")
        No_of_images = len(glob.glob(jpg_pattern))

        for number in range(1, No_of_images + 1):
            # The backslash is escaped explicitly: a bare "\C" is an invalid
            # escape sequence.  The resulting path text is unchanged.
            oriimg = cv2.imread(self.myPath +
                                "\\Coins_img ({}).jpg".format(number))
            newimg = self.resize_img(oriimg)
            gray = cv2.cvtColor(newimg, cv2.COLOR_BGR2GRAY)
            gray = cv2.bilateralFilter(gray, 13, 60, 60)
            # Colour copy of the smoothed image to draw the overlay on.
            output = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

            Circles = cv2.HoughCircles(gray,
                                       cv2.HOUGH_GRADIENT,
                                       1,
                                       40,
                                       param1=90,
                                       param2=30,
                                       minRadius=0,
                                       maxRadius=0)

            if Circles is None:
                # Nothing detected: show the unannotated image.
                cv2.imshow("Vertical Lines", output)
                cv2.waitKey(0)
                cv2.destroyAllWindows()
            else:
                Detected = np.uint16(np.around(Circles))
                centers = []
                for x, y, r in Detected[0, :]:
                    cv2.circle(output, (x, y), r, (0, 255, 255), 3)  # outline
                    cv2.circle(output, (x, y), 2, (255, 0, 0), 2)  # centre dot
                    centers.append((x, y))

                X_axis_points = [center[0] for center in centers]
                # If the model isn't being accurate try tweaking the bdw
                clusters = kde(X_axis_points, bdw=3.5)

                line_clusters = self.make_clustered_array(centers, clusters)

                for cluster in line_clusters:
                    cluster = np.array(cluster)
                    # Column 1 holds y: join the lowest-y and highest-y point.
                    min_y_idx = np.argmin(cluster[:, 1])
                    max_y_idx = np.argmax(cluster[:, 1])
                    cv2.line(output, tuple(cluster[min_y_idx]),
                             tuple(cluster[max_y_idx]), (0, 0, 0), 3)

                cv2.imshow("Vertical Lines", output)
                plt.show()
                cv2.waitKey(0)
                cv2.destroyAllWindows()
        return None
Beispiel #8
0
    def test_kde(self):
        """Check ``kde.kde`` against ``my_kde`` and enforce the one-liner rules.

        The submitted function must agree with the reference on one random
        query to within 1e-6, occupy exactly one source line, and that line
        must not match any of the forbidden patterns below.
        """
        import kde

        source_lines = [raw.strip()
                        for raw in inspect.getsourcelines(kde.kde)[0]]

        # Random sample, bandwidth and query point for the numeric check.
        data = np.random.normal(size=100)
        bw = np.random.normal(1) + 10.
        query = np.random.normal(1)
        expected = my_kde(x=query, bw=bw, data=data)
        actual = kde.kde(x=query, bw=bw, data=data)
        self.assertTrue(np.abs(actual - expected) < 1e-6,
                        'kde function does not return correct value')

        self.assertTrue(len(source_lines) == 1,
                        "please write your kde on one line, yes a def():" +
                        "counts so you cannot do that")

        # (regex, failure message) pairs, checked in this order.
        forbidden = (
            (".* +for +.*", "please do not use the keyword for"),
            (".* +if +.*", "please do not use the keyword if"),
            (".* +in +.*", "please do not use the keyword in"),
            (".*vectorize.*", "please do not use vectorize"),
            (".*map.*", "please do not use map"),
        )
        for pat, message in forbidden:
            self.assertFalse(self.search_lines(pat, source_lines), message)
def main(Nsamp=None, Nmesh=None):
    """
    Generates plots for the 16 test cases for both the analytical pdf and the
    kernel density estimate.

    Every class found in ``dgp`` (except ``dgp.dgp`` and ``dgp.LogNormal``)
    is instantiated, sampled, and plotted; each figure is saved as
    ``<class name>.pdf`` in the current directory.

    Parameters
    ----------
    Nsamp: int
           Number of samples used for the kde (10000 when None)
    Nmesh: int
           Number of points used for the mesh; passed to ``kde.kde`` as ``N``

    Returns
    -------
    None
    """
    if Nsamp is None:
        Nsamp = 10000
    classes = [(cls, name) for name, cls in inspect.getmembers(dgp)
               if inspect.isclass(cls)
               and cls not in (dgp.dgp, dgp.LogNormal)]
    for cls, name in classes:
        # Single pre-formatted argument: prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print('Generating graph for %s' % name)
        model = cls()
        x = model.sample(size=Nsamp)
        # First returned value is not used here.
        t, mesh, kdense = kde.kde(x, N=Nmesh)
        f = model.pdf(mesh)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_title(name, size=36)

        ax.plot(mesh, kdense)
        ax.plot(mesh, f)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(24)

        fig.set_figheight(10)
        fig.set_figwidth(12)
        fig.savefig(name+'.pdf')
        # Close this figure explicitly so figures do not pile up.
        plt.close(fig)

    return None
Beispiel #10
0
def main(Nsamp=None, Nmesh=None):
    """
    Generates plots for the 16 test cases for both the analytical pdf and the
    kernel density estimate.

    Each class in ``dgp`` other than ``dgp.dgp`` and ``dgp.LogNormal`` is
    instantiated and sampled; its kernel density estimate and analytical
    pdf are drawn on one figure, saved as ``<class name>.pdf``.

    Parameters
    ----------
    Nsamp: int
           Number of samples used for the kde (10000 when None)
    Nmesh: int
           Number of points used for the mesh; passed to ``kde.kde`` as ``N``

    Returns
    -------
    None
    """
    if Nsamp is None:
        Nsamp = 10000
    excluded = (dgp.dgp, dgp.LogNormal)
    classes = [(cls, name) for name, cls in inspect.getmembers(dgp)
               if inspect.isclass(cls) and cls not in excluded]
    for cls, name in classes:
        # One formatted string keeps the output identical under both the
        # Python 2 print statement and the Python 3 print function.
        print('Generating graph for %s' % name)
        model = cls()
        x = model.sample(size=Nsamp)
        # First returned value is not used here.
        t, mesh, kdense = kde.kde(x, N=Nmesh)
        f = model.pdf(mesh)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_title(name, size=36)

        ax.plot(mesh, kdense)
        ax.plot(mesh, f)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(24)

        fig.set_figheight(10)
        fig.set_figwidth(12)
        fig.savefig(name + '.pdf')
        # Close the figure just saved rather than "whichever is current".
        plt.close(fig)

    return None
Beispiel #11
0
from kde import kde
from gauss1D import gauss1D
import matplotlib.pyplot as plt

# parameters() supplies two values; only h is used below (as the second
# argument to kde). k is unpacked but not referenced in this part.
h, k = parameters()

print('Question: Kernel/K-Nearest Neighborhood Density Estimators')

# Draw 100 samples from a normal distribution with mean 0 and std 1.
samples = np.random.normal(0, 1, 100)

# Reference curve from gauss1D -- presumably the true N(0, 1) density on a
# grid; the meaning of its arguments should be confirmed in gauss1D.
realDensity = gauss1D(0, 1, 100, 5)

# Density estimated from the samples with the KDE.
estDensity = kde(samples, h)

# Both results are indexed as two-column arrays: column 0 on the x axis,
# column 1 on the y axis. Estimated curve first, reference curve second.
plt.subplot(2, 1, 1)
for curve, colour, caption in (
        (estDensity, 'r', 'KDE Estimated Distribution'),
        (realDensity, 'b', 'Real Distribution')):
    plt.plot(curve[:, 0], curve[:, 1], colour, linewidth=1.5, label=caption)
plt.legend()
plt.show()
    m1 = -1  # mean value
    s1 = 0.1 # % variance 

    N2 = 500 # number of data in data set 2
    m2 = 2   # mean value
    s2 = 0.5 # variance 
    
    h = 0.1       # bandwidth
    npoints = 100 # number of abscis points in kde

    x1 = math.sqrt(s1)*np.random.randn(N1,1) + m1
    x2 = math.sqrt(s2)*np.random.randn(N2,1) + m2
    x = np.concatenate((x1,x2),axis=0)
    
    # Kernel Density Estimate
    (xx,kde_estimate) = kde.kde(x,'Gaussian',h, npoints)
    plt.plot(xx,kde_estimate, 'r', label='Kernel Density Estimate')
    
    # the histogram of the data
    n, bins, patches = plt.hist(x, 50, normed=1, facecolor='green', alpha=0.75, label='Histogram')
    
    # empirical CDF
    (xx,pp) = ecdf(x, npoints)
    plt.plot(xx,pp, 'k', label='Empirical CDF')
    
    # Smooth Empirical CDF (KDE Integral)
    kde_integral = kde_integral(kde_estimate)
    plt.plot(xx,kde_integral, 'm', label='Smooth Empirical CDF')
    plt.legend(loc='upper left')
    plt.show()
    
Beispiel #13
0
    m1 = -1  # mean value
    s1 = 0.1  # % variance

    N2 = 500  # number of data in data set 2
    m2 = 2  # mean value
    s2 = 0.5  # variance

    h = 0.1  # bandwidth
    npoints = 100  # number of abscis points in kde

    x1 = math.sqrt(s1) * np.random.randn(N1, 1) + m1
    x2 = math.sqrt(s2) * np.random.randn(N2, 1) + m2
    x = np.concatenate((x1, x2), axis=0)

    # Kernel Density Estimate
    (xx, kde_estimate) = kde.kde(x, 'Gaussian', h, npoints)
    plt.plot(xx, kde_estimate, 'r', label='Kernel Density Estimate')

    # the histogram of the data
    n, bins, patches = plt.hist(x,
                                50,
                                normed=1,
                                facecolor='green',
                                alpha=0.75,
                                label='Histogram')

    # empirical CDF
    (xx, pp) = ecdf(x, npoints)
    plt.plot(xx, pp, 'k', label='Empirical CDF')

    # Smooth Empirical CDF (KDE Integral)