Example #1
    def fit(self, X, y):
        sorted_idx = X.argsort(axis=0).flatten()
        kde_values = X.copy()[sorted_idx]
        kde_labels = y.copy()[sorted_idx]

        bin_counts = np.bincount(y).astype(float)
        mixture = 0.5
        old_ratios = np.zeros(kde_labels.shape)
        iter_count = 0
        if (self.bandwidth is None):
            self.bandwidth = hscott(X)
        for i in range(self.n_iters):
            controls_kde = neighbors.KernelDensity(kernel=self.kernel,
                                                   bandwidth=self.bandwidth)
            patholog_kde = neighbors.KernelDensity(kernel=self.kernel,
                                                   bandwidth=self.bandwidth)
            controls_kde.fit(kde_values[kde_labels == 0])
            patholog_kde.fit(kde_values[kde_labels == 1])

            controls_score = controls_kde.score_samples(kde_values)
            controls_score = np.exp(controls_score) * mixture

            patholog_score = patholog_kde.score_samples(kde_values)
            patholog_score = np.exp(patholog_score) * (1 - mixture)

            ratio = controls_score / (controls_score + patholog_score)
            if (np.all(ratio == old_ratios)):
                break
            iter_count += 1
            old_ratios = ratio
            kde_labels = ratio < 0.5

            # Guard against one class being split into two runs along the
            # sorted axis (an A-B-A label pattern): locate the label boundaries
            diff_y = np.hstack(([0], np.diff(kde_labels)))
            if (np.sum(diff_y != 0) == 2
                    and np.unique(kde_labels).shape[0] == 2):
                # split_y is the label of the class that was split in two
                split_y = int(np.all(np.diff(np.where(kde_labels == 0)) == 1))
                sizes = [
                    x.shape[0] for x in np.split(diff_y,
                                                 np.where(diff_y != 0)[0])
                ]
                split_prior_smaller = (np.mean(
                    kde_values[kde_labels == split_y]) < np.mean(
                        kde_values[kde_labels == (split_y + 1) % 2]))
                if split_prior_smaller:
                    replace_idxs = np.arange(kde_values.shape[0])[-sizes[2]:]
                else:
                    replace_idxs = np.arange(kde_values.shape[0])[:sizes[0]]

                kde_labels[replace_idxs] = (split_y + 1) % 2

            bin_counts = np.bincount(kde_labels).astype(float)
            mixture = bin_counts[0] / bin_counts.sum()
            # stop if either component collapses below 10% of the data
            if mixture < 0.10 or mixture > 0.90:
                break
        self.controls_kde = controls_kde
        self.patholog_kde = patholog_kde
        self.mixture = mixture
        self.iter_ = iter_count
        return self
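For context, here is a minimal self-contained sketch of the same idea on synthetic 1-D data: alternate between fitting one KDE per label and reassigning labels by the posterior ratio, as in the fit() above. It is simplified (no boundary repair or mixture floor) and assumes a fixed bandwidth rather than the project's hscott helper.

import numpy as np
from sklearn import neighbors

rng = np.random.default_rng(0)
X = np.sort(np.r_[rng.normal(0, 1, 200), rng.normal(4, 1, 200)])[:, None]
labels = (X.ravel() > 2).astype(int)  # crude initial split
mixture = 0.5
for _ in range(50):
    # one KDE per current label set
    kde0 = neighbors.KernelDensity(bandwidth=0.5).fit(X[labels == 0])
    kde1 = neighbors.KernelDensity(bandwidth=0.5).fit(X[labels == 1])
    # mixture-weighted densities and posterior reassignment
    p0 = np.exp(kde0.score_samples(X)) * mixture
    p1 = np.exp(kde1.score_samples(X)) * (1 - mixture)
    new_labels = (p0 / (p0 + p1) < 0.5).astype(int)
    if np.array_equal(new_labels, labels):
        break
    labels = new_labels
    mixture = np.mean(labels == 0)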
Example #2
def GridSearchKDE(data):
    params = {'bandwidth': np.logspace(-3, 3, 50)}
    grid = GridSearchCV(neighbors.KernelDensity(), params)
    grid.fit(data)

    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

    # refine the search on a linear grid around the coarse optimum
    params = {'bandwidth': np.linspace(0.5, 1.5, 50) * grid.best_estimator_.bandwidth}
    grid = GridSearchCV(neighbors.KernelDensity(), params)
    grid.fit(data)

    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    return grid.best_estimator_.bandwidth
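A hypothetical usage sketch for GridSearchKDE, assuming the imports the snippet relies on (numpy as np, sklearn.neighbors as neighbors, and GridSearchCV from sklearn.model_selection):

import numpy as np
rng = np.random.default_rng(0)
data = rng.normal(size=(500, 2))
bw = GridSearchKDE(data)  # coarse log-spaced pass, then a fine linear pass around it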
Example #3
 def fit(self, data, indices, bandwidth=None):
     fitData = data[:, indices]
     fitData, self.scaler = trans(fitData, scale=True)
     if bandwidth is None:
         bandwidth = GridSearchKDE(fitData)
     self.kde = neighbors.KernelDensity(bandwidth=bandwidth)
     self.kde.fit(fitData)
Example #4
def KernelThresh(image,
                 intens=[0, 40000],
                 num=4000,
                 bandwidth=2000,
                 kernel='gaussian'):
    """Determine threshold using Gaussian kernel density estimation
    
    This is good for bimodal distribution. Using Gaussian kernel density 
    estimation (KDE) to find the two mode of distribution. The threshold is 
    choosen as the middle of the two modes.
    """
    _max_count, _ax, _fig = PixDistribution(image)
    kde = skneighbor.KernelDensity(kernel=kernel, bandwidth=bandwidth)
    if len(image.shape) > 1:
        kde.fit(image.flatten()[:, np.newaxis])
    else:
        kde.fit(image[:, np.newaxis])
    x_pos = np.linspace(intens[0], intens[1], num=num)[:, np.newaxis]
    log_dens = kde.score_samples(x_pos)
    dens = np.exp(log_dens)
    maxima = LocalMaxima(dens, width=100, highPeak=False)
    if len(maxima) != 2:
        print('Non-bimodal detected')
        return None
    else:
        m1, m2 = maxima
        thres = 0.5 * (x_pos[m1, 0] + x_pos[m2, 0])
        _ax.plot([thres, thres], [0, _max_count], label='Otsu')
        plt.show()
        return thres
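A self-contained sketch of the same mode-midpoint thresholding, substituting scipy's find_peaks for the project's PixDistribution/LocalMaxima plotting helpers; kernel_threshold and its defaults are illustrative, not from the source.

import numpy as np
from scipy.signal import find_peaks
from sklearn import neighbors

def kernel_threshold(values, bandwidth=2000.0, num=4000, intens=(0, 40000)):
    kde = neighbors.KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(np.asarray(values).reshape(-1, 1))
    x = np.linspace(intens[0], intens[1], num)[:, None]
    dens = np.exp(kde.score_samples(x))
    peaks, _ = find_peaks(dens)
    if len(peaks) != 2:
        return None  # not bimodal at this bandwidth
    return 0.5 * (x[peaks[0], 0] + x[peaks[1], 0])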
Example #5
	def getDistanceDensity(self, data_set):
		self.distance = []
		kde = neighbors.KernelDensity(kernel='linear', bandwidth=0.75).fit(data_set)

		for i in range(len(data_set)):
			# score_samples expects a 2-D array, so wrap each sample
			density = kde.score_samples(np.asarray(data_set[i]).reshape(1, -1))
			self.distance.append(density)
Example #6
 def GetOptimalBandwidth(self, datalabel, bandlims, numbands):
     '''Optimize the bandwidth using leave-one-out cross-validation.
     Example follows that at jakevdp.github.io/PythonDataScienceHandbook.
     Args
         datalabel: string
             string describing which datalabel in the dataframe to find
             the bandwidth for
         bandlims: array (length 2)
             limits to search for the optimal bandwidth in
         numbands: int
             number of bandwidths to scan between the limits
     '''
     if bandlims[0] <= 0 or bandlims[1] <= 0:
         print("Bandwidth must be greater than zero")
         return
     bandwidths = np.linspace(bandlims[0], bandlims[1], numbands)
     data = self.df[datalabel]
     if isinstance(self.df[datalabel][0], np.ndarray):
         print("WERE IN HERE")
         data_arr = []
         for i in range(len(self.df[datalabel])):
             data_arr = data_arr + list(self.df[datalabel][i])
         data = np.array(data_arr)
     if len(data) > 500:
         print("This may take some time depending on your data length.")
         print("numbands > 10 with len(data)>500 starts to take a bit")
     grid = sgs.GridSearchCV(skn.KernelDensity(kernel='gaussian'),
                             {'bandwidth': bandwidths},
                             cv=cv.LeaveOneOut(len(data)))
     grid.fit(data[:, None])
     thebandwidth = grid.best_params_['bandwidth']
     return thebandwidth
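The snippet above appears to target the pre-0.18 sklearn grid_search/cross_validation modules. A sketch of the same leave-one-out search with the modern API:

import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity

def loo_bandwidth(data, bandlims, numbands):
    bandwidths = np.linspace(bandlims[0], bandlims[1], numbands)
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths}, cv=LeaveOneOut())
    grid.fit(np.asarray(data)[:, None])
    return grid.best_params_['bandwidth']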
Example #7
    def __init__(self, masks):
        n_class = 10
        self.maps_with_class = [[], [], [], [], [], [], [], [], [], []]
        self.kde_samplers = []
        self.class_probs = np.ones(n_class) / n_class
        #        self.class_probs = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5])
        self.mask_size = None
        ts = time.time()
        for mask_i, mask in enumerate(masks):
            assert mask.shape[2] == n_class
            if not self.mask_size:
                self.mask_size = mask.shape[1]
            samplers = []
            for class_i in range(n_class):
                X = np.nonzero(mask[:, :, class_i])
                X = np.stack(X, axis=1)

                #                np.random.shuffle(X)
                #                X = X[:50000]

                if not X.size:
                    samplers.append(None)
                else:
                    self.maps_with_class[class_i].append(mask_i)
                    sampler = neighbors.KernelDensity(self.mask_size *
                                                      0.02).fit(X)
                    samplers.append(sampler)

            assert len(samplers) == n_class
            self.kde_samplers.append(samplers)
        print('sampler init time: {}'.format(time.time() - ts))
Example #8
def KDEFit(data, bandwidth='thumb'):
    if bandwidth == 'thumb':
        #bandwidth = np.power( (len(data)*(data.shape[-1]+2.0)/4.0), -1.0/(data.shape[-1]+4.0) )
        bandwidth = np.power((len(data) * (data.shape[-1] + 2.0) / 4.0), -1.0 /
                             (data.shape[-1] + 4.0)) * 0.5
    kde = neighbors.KernelDensity(bandwidth=bandwidth).fit(data)
    return kde
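The 'thumb' branch is the Silverman-style multivariate factor (n*(d+2)/4)**(-1/(d+4)), halved. A usage sketch, assuming numpy as np and sklearn.neighbors as neighbors are imported as the snippet expects; data must be 2-D (n_samples, n_features):

import numpy as np
rng = np.random.default_rng(0)
kde = KDEFit(rng.normal(size=(1000, 2)))
print(kde.score_samples(np.zeros((1, 2))))  # log-density at the origin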
Example #9
def plot_and_save(scores_and_labels,
                  xlabel,
                  output,
                  xmin=0.0,
                  xmax=0.6,
                  smoothness=20.,
                  font_size=12,
                  line_width=2):

    # create and save plot
    plt.figure()

    # create kernel density estimator
    kde = neighbors.KernelDensity(kernel='gaussian',
                                  bandwidth=xmax / smoothness)
    # need to add another dimension as required by sklearn
    # arrays passed to kde must be 2-dimensional
    X_plot = np.reshape(np.linspace(xmin, xmax, 500), (-1, 1))
    styles = ['-', '--', '-.', ':']
    for i, (xs, label) in enumerate(scores_and_labels):
        scores = np.ravel(xs) if len(xs) < 1e5 else np.random.choice(
            np.ravel(xs), int(1e5))
        kde.fit(np.reshape(scores, (-1, 1)))
        densities = kde.score_samples(X_plot)
        plt.plot(X_plot[:, 0],
                 np.exp(densities),
                 lw=line_width,
                 label=label,
                 ls=styles[i % len(styles)])
    plt.ylabel('Density', size=font_size)
    plt.xlabel(xlabel, size=font_size)
    plt.legend(loc='best', fontsize=font_size)
    plt.tight_layout()
    plt.savefig(output)
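Hypothetical usage of plot_and_save, assuming matplotlib.pyplot as plt, numpy as np, and sklearn.neighbors as neighbors are imported as the snippet expects; the labels and output filename are illustrative.

import numpy as np
scores_and_labels = [(np.random.beta(2, 8, 5000), 'model A'),
                     (np.random.beta(3, 6, 5000), 'model B')]
plot_and_save(scores_and_labels, xlabel='score', output='densities.png')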
Example #10
def fit_best_kde(data, steps=25, rtol=0.1, cv=3, fit_sample_size=None):
    '''
    This function determines a best fitting kernel density estimate
    using scikit-learn's sklearn.neighbors.KernelDensity method along 
    scikit-learn's sklearn.model_selection.GridSearchCV method. In 
    particular, the GridSearchCV method is used to try all valid kernel
    types with `steps` evenly spaced bandwidths between the minimum
    and maximum differences between values in the provided data.
    
    Arguments:
    data: a 1-dimensional list or Numpy array that includes the data
    
    steps: the number of evenly spaced bandwidths to try.
    
    rtol: the relative tolerance passed to the sklearn.neighbors.KernelDensity 
    method. Higher values offer faster computational times at the cost of
    accuracy.
    
    cv: the number of cross-validation splits the sklearn.model_selection.GridSearchCV 
    method uses to identify the best kde.
    
    fit_sample_size: a value that, if specified, denotes that a random sample
    of size sample_size should be used to fit the kernel density estimate. This
    functionality is added to reduce the high computational times that may
    occur when the provided data is large.
    
    Returns:
    a dictionary specifying the best bandwidth and kernel.
    '''
    import sklearn.neighbors as skneighbor
    from sklearn.model_selection import GridSearchCV
    import warnings
    import numpy as np

    data = np.array(data)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        if fit_sample_size is not None:
            data = np.random.choice(data.ravel(),
                                    size=fit_sample_size,
                                    replace=False)

        min_val, max_val = find_min_max_diff(data)

        params = {
            'bandwidth': np.linspace(min_val, max_val, steps),
            'kernel': skneighbor.kde.VALID_KERNELS
        }
        grid = GridSearchCV(skneighbor.KernelDensity(rtol=rtol), params, cv=cv)
        grid.fit(data.reshape(-1, 1))

        return grid.best_params_
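Hypothetical usage; find_min_max_diff is not defined in the snippet, so a plausible stand-in (smallest and largest gaps between sorted unique values) is shown. Note that skneighbor.kde.VALID_KERNELS assumes the legacy sklearn module layout the snippet was written against.

import numpy as np

def find_min_max_diff(data):  # stand-in for the project's helper
    diffs = np.diff(np.sort(np.unique(data)))
    return diffs.min(), diffs.max()

best = fit_best_kde(np.random.normal(size=2000), steps=10, fit_sample_size=500)
print(best)  # e.g. {'bandwidth': ..., 'kernel': ...}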
Example #11
 def fit(self, X, y, weights=[1, 1]):
     self.classes_ = np.sort(np.unique(y))
     training_sets = [X[y == yi] for yi in self.classes_]
     self.models_ = [
         neighbors.KernelDensity(bandwidth=self.bandwidth,
                                 kernel=self.kernel).fit(Xi)
         for Xi in training_sets
     ]
     weights = np.array(weights)
     self.logpriors_ = [
         np.log(Xi.shape[0] / X.shape[0]) for Xi in training_sets
     ] + np.log(weights)
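For completeness, a sketch of the matching prediction step for this generative KDE classifier (per-class log-density plus log-prior, normalized), written as methods for the same class; it assumes only the attributes set in fit() above.

 def predict_proba(self, X):
     # per-class log-likelihoods, shape (n_samples, n_classes)
     logprobs = np.array([m.score_samples(X) for m in self.models_]).T
     result = np.exp(logprobs + self.logpriors_)
     return result / result.sum(axis=1, keepdims=True)

 def predict(self, X):
     return self.classes_[np.argmax(self.predict_proba(X), axis=1)]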
Example #12
    def fit(self, X, y):
        self.ages_ = y
        degree = 4
        ages_t = y.reshape(-1, 1)
        wm_model = sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.PolynomialFeatures(degree),
            sklearn.linear_model.LinearRegression())
        gm_model = sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.PolynomialFeatures(1),
            sklearn.linear_model.LinearRegression())
        self.wm_model_ = wm_model.fit(ages_t, X[:, 0])
        self.gm_model_ = gm_model.fit(ages_t, X[:, 1])

        self.ages_ = y
        self.ages_grid_ = np.arange(15, 100).reshape(-1, 1)
        ages_kde = skn.KernelDensity(kernel="gaussian", bandwidth=3)
        ages_kde.fit(ages_t)
        prior = np.exp(
            ages_kde.score_samples(self.ages_grid_))
        self.prior_ = prior / np.sum(prior)

        wm_residuals = np.abs(wm_model.predict(ages_t) - X[:, 0])
        gm_kernel = (
            44.7 ** 2
            * skg.kernels.RBF(length_scale=30, length_scale_bounds=(10, 60))
            + skg.kernels.WhiteKernel(noise_level=1e4,
                                      noise_level_bounds=(1e3, 1e5))
        )
        wm_kernel = (
            44.7 ** 2
            * skg.kernels.RBF(length_scale=30, length_scale_bounds=(10, 60))
            + skg.kernels.WhiteKernel(noise_level=1e4,
                                      noise_level_bounds=(1e3, 1e5))
        )
        self.wm_gp_ = skg.GaussianProcessRegressor(
            kernel=wm_kernel,
            n_restarts_optimizer=0)
        self.wm_gp_.fit(ages_t, wm_residuals)
        gm_residuals = np.abs(gm_model.predict(ages_t) - X[:, 1])
        self.gm_gp_ = skg.GaussianProcessRegressor(
            kernel=gm_kernel,
            n_restarts_optimizer=0)
        self.gm_gp_.fit(ages_t, gm_residuals)

        # plt.figure()
        # plt.scatter(y, X[:, 0])
        # plt.plot(self.ages_grid_, wm_model.predict(self.ages_grid_))
        # plt.figure()
        # plt.scatter(y, gm_residuals)
        # plt.plot(self.ages_grid_, self.gm_gp_.predict(self.ages_grid_))
        # plt.show()
        # input()
        return self
Example #13
def pdf_from_kde(data,
                 min_val=None,
                 max_val=None,
                 bandwidth=1.0,
                 kernel='gaussian'):
    '''
    This function generates a probability density function (PDF) that is 
    based on a kernel density estimate that is fit using scikit-learn's
    sklearn.neighbors.KernelDensity method. Specifically, it returns two
    objects, pdfx and pdfy, that contain the support and probability values
    that define the PDF, respectively. 
    
    Arguments:
    data: a 1-dimensional list or Numpy array that includes the data
    
    min_val: the minimum value to include in the PDF support (default
    is the data minimum - 0.10*[range between max and min values])
    
    max_val: the maximum value to include in the PDF support (default
    is the data maximum + 0.10*[range between max and min values])
    
    bandwidth: the bandwidth for the kernel density estimate.
    
    kernel: the kernel type, which is passed directly to scikit-learn's
    sklearn.neighbors.KernelDensity method
    
    Returns:
    a dictionary with two keys, x and y. The values are NumPy arrays for the
    support (x) and probability values (y) that define the PDF.
    '''

    import sklearn.neighbors as skneighbor
    import numpy as np

    data = np.array(data)

    if min_val is None:
        min_val = data.min() - 0.10 * (data.max() - data.min())

    if max_val is None:
        max_val = data.max() + 0.10 * (data.max() - data.min())

    pdfx = np.linspace(min_val, max_val, 1000)
    pdfy = np.exp(
        skneighbor.KernelDensity(bandwidth=bandwidth, kernel=kernel,
                                 rtol=0.1).fit(data.reshape(
                                     -1, 1)).score_samples(pdfx.reshape(-1,
                                                                        1)))
    pdfy = pdfy / pdfy.sum()
    return {'x': pdfx, 'y': pdfy}
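A brief usage sketch for pdf_from_kde (the function imports its own dependencies):

import numpy as np
pdf = pdf_from_kde(np.random.normal(size=1000), bandwidth=0.3)
# pdf['y'] sums to 1 across the grid, so it acts as a discrete PMF on pdf['x']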
Example #14
    def fit(self,
            x_target=np.random.random((100, 3)),
            y_target=np.random.binomial(1, 0.5, (100, ))):
        '''
        fit: combine the stored source data with the given target data.
        param
        x_target: numpy array (n,d) of features in target distribution
        y_target: numpy array (n,) of labels in target distribution

        Stores the class variables
        m: # source data points
        n: # total number of data points (source and target)
        d: # feature dimension
        '''

        x_target = np.array(x_target)
        y_target = np.array(y_target)

        # Checking shape consistency
        if len(x_target.shape) != 2:
            raise TypeError('x_target is not an array of shape (n,d)')
        if len(y_target.shape) != 1:
            raise TypeError('y_target is not an array of shape (n,)')

        # Checking dimension consistency
        if self.x_source.shape[1] != x_target.shape[1]:
            raise TypeError(
                "Dimensions don't match for source and target features")

        m, self.d = self.x_source.shape
        self.n, _ = x_target.shape
        self.n += m
        x, y = np.concatenate((self.x_source, x_target)), np.concatenate(
            (self.y_source, y_target))
        self.prop_target = np.mean(y_target)
        weights = np.array([1 - self.prop_target, self.prop_target])
        self.logpriors_ = np.log(weights)
        self.classes = np.array([0, 1])
        training_sets = [x[y == i] for i in [0, 1]]
        self.models_ = [
            neighbors.KernelDensity(bandwidth=self.bandwidth,
                                    kernel=self.kernel).fit(xi)
            for xi in training_sets
        ]
Example #15
    def KDEEstimate2D(self,bandwidth,datalabelx,datalabely,xbins=100j,ybins=100j,
            x_range=[0,1], y_range=[0,1],kern='gaussian'):
        '''
        Performs a 2D Kernel density estimation using data from the two variables
        specified in datalabelx and datalabely.  x and y-ranges assume data has
        been normalized and have the full range from [0,1].
        '''
        datax = None
        datay = None
        try:
            datax = self.df[datalabelx]
            datay = self.df[datalabely]
        except KeyError:
            print("No data found for one of these datalabels.")
            return
        range_x_ind = np.where((datax>x_range[0]) & (datax<x_range[1]))[0]
        range_y_ind = np.where((datay>y_range[0]) & (datay<y_range[1]))[0]
        print("RANGE_X: " + str(range_x_ind))
        print("RANGE_Y_IND: " + str(range_y_ind))
        range_indices = np.array(list(set(np.concatenate((range_x_ind,range_y_ind)))))
        print("RANGE_INDICES: " + str(range_indices))
        datax = datax[range_indices]
        datay = datay[range_indices]
        if isinstance(self.df[datalabelx][0],np.ndarray):
            print("WERE IN HERE")
            datax_arr = []
            for i in range(len(self.df[datalabelx])):
                datax_arr = datax_arr + list(self.df[datalabelx][i])
            datax=np.array(datax_arr)
        if isinstance(self.df[datalabely][0],np.ndarray):
            print("WERE IN HERE")
            datay_arr = []
            for i in range(len(self.df[datalabely])):
                datay_arr = datay_arr + list(self.df[datalabely][i])
            datay=np.array(datay_arr)
        xx, yy = np.mgrid[x_range[0]:x_range[1]:xbins,
                y_range[0]:y_range[1]:ybins]

        xy_grid = np.vstack([yy.ravel(),xx.ravel()]).T
        xy_dataset = np.vstack([datay,datax]).T
        TwoDKDE = skn.KernelDensity(bandwidth=bandwidth,kernel=kern)
        TwoDKDE.fit(xy_dataset)

        z = np.exp(TwoDKDE.score_samples(xy_grid))

        return xx,yy,np.reshape(z,xx.shape)
Example #16
def kde_pdf(spike_times, bandwidth=50.0, xgrid=None, kernel='gaussian'):
    """Compute the probability density function using KernelDensity
    estimation with specified bandwidth and evaluated at positions in
    xgrid.

    Return (pdf, xgrid)

    """
    if len(spike_times) == 0:
        warnings.warn('No spikes in spike trains')
        return (None, None)
    kde = skn.KernelDensity(kernel=kernel, bandwidth=bandwidth)
    kde.fit(spike_times[:, np.newaxis])
    if xgrid is None:
        xgrid = np.arange(min(spike_times), max(spike_times), bandwidth/2.0)
    log_pdf = kde.score_samples(xgrid[:, np.newaxis])    
    pdf = np.exp(log_pdf)
    return pdf, xgrid
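Usage sketch for kde_pdf, assuming numpy as np, warnings, and sklearn.neighbors as skn are imported as the snippet expects; the spike times here are synthetic and in the same units as the bandwidth.

import numpy as np
spikes = np.sort(np.random.uniform(0, 1000, 200))
pdf, xgrid = kde_pdf(spikes, bandwidth=50.0)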
Example #17
def selectWithKernalDensity(dist_top):
    """
    Model selection routine that returns a list of models based on the output
    of kernel density estimation.
    
    :param dist_top: list of sorted distances
    """

    dist_top_reshape = dist_top.reshape((len(dist_top), 1))

    kde = neighbors.KernelDensity(kernel='tophat',
                                  bandwidth=0.005).fit(dist_top_reshape)

    log_dens = kde.score_samples(dist_top_reshape)

    minInd = signal.argrelextrema(log_dens, np.less)

    return minInd, log_dens
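Usage sketch: candidate split points in a sorted distance list are the local minima of the tophat-KDE log-density. Assumes numpy as np, scipy.signal as signal, and sklearn.neighbors as neighbors are imported as the snippet expects; the bimodal distances are synthetic.

import numpy as np
dist_top = np.sort(np.concatenate([np.random.normal(0.10, 0.01, 100),
                                   np.random.normal(0.30, 0.01, 100)]))
min_ind, log_dens = selectWithKernalDensity(dist_top)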
Example #18
def _kernel_density_joint(estimations, ranges, bandwidth=1 / 25):

    ndims = len(ranges)

    scaler = _min_max_scaler(ranges, feature_range=(0, 100))

    bandwidth = bandwidth * 100
    # step = 1.0

    kd = neighbors.KernelDensity(bandwidth=bandwidth).fit(
        scaler.transform(estimations))
    locations1d = np.arange(0, 100, 1)
    locations = np.reshape(np.meshgrid(*[locations1d] * ndims), (ndims, -1)).T
    kd_probs = np.exp(kd.score_samples(locations))

    shape = (ndims, ) + (len(locations1d), ) * ndims
    locations = scaler.inverse_transform(locations)
    locations = np.reshape(locations.T, shape)
    kd_probs = np.reshape(kd_probs, shape[1:])
    return locations, kd_probs, kd
Example #19
def _kernel_density_joint(samples, weights, ranges, bandwidth=1 / 25):

    ndims = len(ranges)

    scaler = _min_max_scaler(ranges, feature_range=(0, 100))

    bandwidth = bandwidth * 100
    # step = 1.0

    kd = neighbors.KernelDensity(bandwidth=bandwidth)
    kd.fit(scaler.transform(samples), sample_weight=weights)

    grid_shape = [100] * ndims
    grid = np.indices(grid_shape)
    locations = np.reshape(grid, (ndims, -1)).T
    kd_probs = np.exp(kd.score_samples(locations))

    shape = (ndims, *grid_shape)
    locations = scaler.inverse_transform(locations)
    locations = np.reshape(locations.T, shape)
    kd_probs = np.reshape(kd_probs, grid_shape)
    return locations, kd_probs, kd
Example #20
    def KDEEstimate2D(self,
                      bandwidth,
                      datalabelx,
                      datalabely,
                      xbins=100j,
                      ybins=100j,
                      kern='gaussian'):
        datax = None
        datay = None
        try:
            datax = self.df[datalabelx]
            datay = self.df[datalabely]
        except KeyError:
            print("No data found for one of these datalabels.")
            return
        if isinstance(self.df[datalabelx][0], np.ndarray):
            print("WERE IN HERE")
            datax_arr = []
            for i in range(len(self.df[datalabelx])):
                datax_arr = datax_arr + list(self.df[datalabelx][i])
            datax = np.array(datax_arr)
        if isinstance(self.df[datalabely][0], np.ndarray):
            print("WERE IN HERE")
            datay_arr = []
            for i in range(len(self.df[datalabely])):
                datay_arr = datay_arr + list(self.df[datalabely][i])
            datay = np.array(datay_arr)
        xx, yy = np.mgrid[datax.min():datax.max():xbins,
                          datay.min():datay.max():ybins]

        xy_grid = np.vstack([yy.ravel(), xx.ravel()]).T
        xy_dataset = np.vstack([datay, datax]).T
        TwoDKDE = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
        TwoDKDE.fit(xy_dataset)

        z = np.exp(TwoDKDE.score_samples(xy_grid))

        return xx, yy, np.reshape(z, xx.shape)
Example #21
 def KDEEstimate1D(self,bandwidth,datalabel,x_range=[0,1],bins=100,kern='gaussian'):
     '''
     Performs a 1D Kernel density estimation using data from the variable
     specified in datalabel.  The x-range assumes data has been
     normalized to the full range [0,1].
     '''
     data = None
     try:
         data = self.df[datalabel]
     except KeyError:
         print("No data found for this datalabel.")
         return
     if isinstance(self.df[datalabel][0],np.ndarray):
         data = np.concatenate(self.df[datalabel])
         #data_arr = []
         #for i in range(len(self.df[datalabel])):
         #    data_arr = data_arr + list(self.df[datalabel][0])
         #data=np.array(data_arr)
     linspace = np.linspace(x_range[0], x_range[1],
                            int((x_range[1] - x_range[0]) * bins))
     kde = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
     kde.fit(np.asarray(data)[:, None])  # fit the processed data, not the raw column
     logp = kde.score_samples(linspace[:, None])
     return linspace, np.exp(logp)
Example #22
 def KDEEstimate1D(self, datalabel, xlims=None, kern='gaussian'):
     bandwidth = None
     data = None
     try:
         bandwidth = self.bandwidths[datalabel]
         data = self.df[datalabel]
     except KeyError:
         print("No bandwidth or data found for this datalabel.")
         return
     print(data)
     if isinstance(self.df[datalabel][0], np.ndarray):
         print("WERE IN HERE")
         data_arr = []
         for i in range(len(self.df[datalabel])):
             data_arr = data_arr + list(self.df[datalabel][i])
         data = np.array(data_arr)
     if xlims is None:
         xlims = [np.min(data), np.max(data)]
     linspace = np.linspace(xlims[0], xlims[1],
                            int((xlims[1] - xlims[0]) * 100))
     kde = skn.KernelDensity(bandwidth=bandwidth, kernel=kern)
     kde.fit(np.asarray(data)[:, None])  # fit the processed data, not the raw column
     logp = kde.score_samples(linspace[:, None])
     return linspace, np.exp(logp)
Example #23
"""
Make a 'true' integration over 2 dims of the KDE to obtain a marginalized
1D distribution (here in logE).

Use multiprocessing, because it takes time on a single processor.
"""

import numpy as np
import scipy.integrate as scint
import sklearn.neighbors as skn
from multiprocessing import Pool

exp = np.load("./data/IC86_I_data.npy")

kde = skn.KernelDensity(bandwidth=0.1, kernel="gaussian", rtol=1e-8)

# KDE sample must be cut in sigma before fitting, similar to range in hist
_exp = exp[exp["sigma"] <= np.deg2rad(5)]

fac_logE = 1.5
fac_dec = 2.5
fac_sigma = 2.

_logE = fac_logE * _exp["logE"]
_sigma = fac_sigma * np.rad2deg(_exp["sigma"])
_dec = fac_dec * _exp["dec"]

kde_sample = np.vstack((_logE, _dec, _sigma)).T

# Fit KDE best model to sample
kde.fit(kde_sample)
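A sketch of the marginalization step the docstring describes: for each logE grid point, numerically integrate the density over the other two (scaled) dimensions. The integration ranges and logE grid below are illustrative assumptions, not from the source, and each call is slow, which is why the source uses a multiprocessing Pool.

def marginal_logE(le):
    # integrand over (dec, sigma) at fixed logE, in the scaled coordinates
    f = lambda dec, sigma: np.exp(
        kde.score_samples(np.array([[le, dec, sigma]])))[0]
    val, _ = scint.dblquad(f,
                           0., fac_sigma * 5.,               # sigma range (deg, scaled)
                           lambda s: -fac_dec * np.pi / 2.,  # dec lower (scaled)
                           lambda s: fac_dec * np.pi / 2.)   # dec upper (scaled)
    return val

# if __name__ == "__main__":
#     with Pool() as pool:
#         dens = pool.map(marginal_logE, fac_logE * np.linspace(2., 9., 50))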
Example #24
	def findClusters(self, data_set,  data_set_training=None):
		self.clusters = []
		self.clusters_training = []
		self.indices = []
		self.distance = []
		self.labels = []

		kde = neighbors.KernelDensity(bandwidth=0.75).fit(data_set)
		
		print(data_set)
		if data_set_training is None:
			data_set_training = data_set
		else:
			mergedlist = []
			mergedlist.extend(data_set)
			mergedlist.extend(data_set_training)
			data_set_training = mergedlist
		
		#print("data_set_trainging:")
		#print(data_set_training)
		#print(len(data_set_training))
		#print(len(data_set_training[0]))


		kmeans = MiniBatchKMeans(n_clusters=self.n_clusters, random_state=0).fit(data_set_training)
		#predicao = kmeans.predict(data_set_training)

		#print("Centros de cluster")
		#print(kmeans.cluster_centers_)
		#exit()

		distance_clusters = kmeans.fit_transform(data_set_training)
		#print(distance_clusters)
		#print(len(distance_clusters))
		#print(len(distance_clusters[0]))
		#print(len(data_set))
		#distance_clusters_data_set = distance_clusters[0:len(data_set)]
		
		#print(distance_clusters)
		#exit()

		#print("Preparando clusters...")
		for i in range(0, len(kmeans.cluster_centers_)):
			self.clusters.append([])
			self.clusters_training.append([])
			self.indices.append([])

		for i in range(0, len(data_set_training)):
			self.clusters_training[kmeans.labels_[i]].append(data_set_training[i]) 

		#print(len(data_set))
		#print(len(distance_clusters))
		#print(len(kmeans.labels_))
		predicao = kmeans.predict(data_set_training)

		for i in range(0,len(data_set)):
			dist = 0
			print("exemplo: ", i+1)
			for j in distance_clusters[i]:
				#print("Distancia para cluster: ", j )
				dist+= j


				#print(j)
			#self.distance.append(dist)
			self.distance.append([dist])
			
			'''print("dATASET: ", data_set[i])
			print("data set training: " ,data_set_training[i])
			print(kmeans.labels_)
			print(predicao[i])
			print(dist)
			print("------------")'''
			self.indices[kmeans.labels_[i]].append(i)
			self.clusters[kmeans.labels_[i]].append(data_set[i]) 

			#print(self.clusters)
			# score_samples expects a 2-D array, so wrap each sample
			density = kde.score_samples(np.asarray(data_set[i]).reshape(1, -1))
			self.distance[i].append(density[0])

		#print(self.distance)
		#exit()
		return distance_clusters
Example #25
def plotattractors(report,
                   reduction,
                   figsize=None,
                   labelsize=None,
                   connect_psets=False,
                   contour=False,
                   downsample=None,
                   density_downsample=None,
                   focus=None,
                   focus_osc=False,
                   hide_defocused=False,
                   color_code=False,
                   square=False):
    """
    Set up a hexbin or scatter-line plot in the current pyplot.

    Arguments:
    - report: full parameter sampling report
    - reduction: how to map concentration values to 2D space: an instance of e.g. PCA2D or AverageLog
    - figsize: figure size as a tuple of inches (width by height)
    - labelsize: font size for axis labels
    - connect_psets: whether to make a scatter-line plot instead of a hexbin plot
    - contour: proportion of density outside the lowest contour level, or False to not add contour lines
    - downsample: ruleset to downsample systems for display
    - density_downsample: ruleset to downsample systems for contour/density estimation
    - focus: Boolean-valued ruleset to focus systems (scatter-line only, default all focused)
    - focus_osc: whether to focus systems containing oscillators (scatter-line only, will defocus all others if focus not set)
    - hide_defocused: whether to hide all non-focused systems (scatter-line only)
    - color_code: whether to color lines by system type (scatter-line only)
    - square: whether to force a square plot
    """
    reduction.prepare(report)
    random.seed(1)
    summary_occurrences = categorizeattractors(report)
    filtered_psets = applydownsample(summary_occurrences, downsample)
    points = reduction.reduce(psets_matrix(filtered_psets))
    xlabel, ylabel = reduction.labels()
    fig, ax_main = plt.subplots(figsize=figsize)
    if connect_psets:
        distinct_summaries = list(categorizeattractors(filtered_psets).keys())
        default_cycle = cycler.cycler(color=[
            'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
            'tab:brown', 'tab:gray', 'tab:olive', 'tab:cyan'
        ])
        default_cycler = default_cycle()
        defocus_default_cycler = plt.rcParams['axes.prop_cycle']()
        for i, pset in enumerate(filtered_psets):
            pset_matrix = np.array(caricatureattractors(pset['attractors']))
            pset_xy = reduction.reduce(pset_matrix)
            sorted_attractors = pset_xy[pset_matrix[:, 0].argsort(), :]
            point_mask = [not isoscillator(a) for a in pset['attractors']]
            has_oscillator = not all(point_mask)
            z = i
            linewidth = None
            oscwidth = 1.6
            dotsize = 36.0
            defocused = False
            summary = summarizeattractors(pset)
            if focus or focus_osc:
                if (focus_osc
                        and has_oscillator) or (focus and specificrulevalue(
                            focus, summary, default=False)):
                    z += len(filtered_psets) + 1
                elif hide_defocused:
                    continue
                else:
                    linewidth = 0.8
                    oscwidth = 1.1
                    dotsize = 10.0
                    defocused = True
            if color_code:
                hue, sat, lum, hue_vary_width = summaryhsl(
                    distinct_summaries, summary)
                hue += random.uniform(0, hue_vary_width)
                if not defocused:
                    lum *= random.uniform(0.85, 1.1)
                    sat *= random.uniform(0.8, 1.0)
            elif defocused:
                next_prop = next(defocus_default_cycler)
                color_spec = next_prop['color']
                r, g, b = mplcolors.to_rgb(color_spec)
                hue, sat, lum = colorsys.rgb_to_hls(r, g, b)
            if defocused:
                lum = min(1 - (1 - lum) * random.uniform(0.3, 0.5), 0.9)
                sat *= random.uniform(0.35, 0.45)
            if color_code or defocused:
                pset_color = colorsys.hls_to_rgb(hue, lum, sat)
            else:
                pset_color = next(default_cycler)['color']
            ax_main.plot(sorted_attractors[:, 0],
                         sorted_attractors[:, 1],
                         lw=linewidth,
                         color=pset_color,
                         zorder=z)
            pointprops = {
                's': dotsize
            } if defocused or not contour else {
                'linewidths': 1.0,
                'edgecolors': 'white',
                's': dotsize * 1.3
            }
            ax_main.scatter(pset_xy[point_mask, 0],
                            pset_xy[point_mask, 1],
                            color=pset_color,
                            zorder=z,
                            **pointprops)
            for osc in (a for a in pset['attractors'] if isoscillator(a)):
                vertices = np.array(osc['orbit'])
                projected_vertices = reduction.reduce(vertices)
                if projected_vertices.shape[0] >= 3:
                    projected_vertices = np.vstack(
                        (projected_vertices, projected_vertices[0, :]))
                polygon = mplpatch.Polygon(projected_vertices,
                                           color=pset_color,
                                           linewidth=oscwidth,
                                           linestyle='--',
                                           fill=False,
                                           zorder=z)
                ax_main.add_patch(polygon)
    else:
        cmap = copy.copy(plt.get_cmap('viridis'))
        cmap.set_under('white', 1.0)
        hex_args = {
            'linewidths': 0.2,
            'norm': mplcolors.LogNorm(vmin=2),
            'cmap': cmap,
            'gridsize': 40
        }
        bin_results = ax_main.hexbin(points[:, 0], points[:, 1], **hex_args)
        fig.colorbar(bin_results, ax=ax_main, label='Attractors')
    if contour:
        random.seed(1)
        density_filtered_psets = applydownsample(summary_occurrences,
                                                 density_downsample)
        density_points = reduction.reduce(psets_matrix(density_filtered_psets))
        kde = neighbors.KernelDensity(kernel='gaussian',
                                      bandwidth=0.1).fit(density_points)
        bin_x, bin_y = np.mgrid[(density_points[:, 0].min() -
                                 0.15):(density_points[:, 0].max() + 0.15):80j,
                                (density_points[:, 1].min() -
                                 0.15):(density_points[:, 1].max() + 0.15):80j]
        density = np.exp(
            kde.score_samples(np.vstack((bin_x.flatten(), bin_y.flatten())).T))
        sorted_densities = np.sort(density.flatten())
        total_density = np.sum(sorted_densities)
        cdf = np.cumsum(sorted_densities) / total_density
        if connect_psets:
            cutoff_indices = [
                np.where(cdf > percentile)[0][0]
                for percentile in np.linspace(contour, 1, 5)[:-1]
            ]
            levels = [sorted_densities[c]
                      for c in cutoff_indices] + [total_density]
            colors = ['#c65ff560', '#af36e388', '#b300ff90', '#8500e2a0']
            ax_main.contourf(bin_x,
                             bin_y,
                             density.reshape(bin_x.shape),
                             levels,
                             colors=colors,
                             zorder=len(filtered_psets))
        else:
            cutoff_indices = [
                np.where(cdf > percentile)[0][0]
                for percentile in np.linspace(contour, 0.9, 6)
            ]
            levels = [sorted_densities[c] for c in cutoff_indices]
            widths = np.linspace(0.5, 1.4, 6)
            ax_main.contour(bin_x,
                            bin_y,
                            density.reshape(bin_x.shape),
                            levels,
                            linewidths=widths,
                            colors='black',
                            zorder=(len(filtered_psets) * 3),
                            alpha=0.6)
    if square:
        ax_main.axis('square')
    elif reduction.equalscale():
        ax_main.axis('equal')
    if reduction.zerobased('x'):
        ax_main.set_xlim(left=0)
    if reduction.zerobased('y'):
        ax_main.set_ylim(bottom=0)
    locator_base = reduction.locatorbase()
    if locator_base is not None:
        ax_main.xaxis.set_major_locator(
            mpltick.MultipleLocator(base=locator_base))
        ax_main.yaxis.set_major_locator(
            mpltick.MultipleLocator(base=locator_base))
    x_text = ax_main.set_xlabel(xlabel)
    if labelsize is not None:
        x_text.set_fontsize(labelsize)
    y_text = ax_main.set_ylabel(ylabel)
    if labelsize is not None:
        y_text.set_fontsize(labelsize)
Example #26
        print(f'collect trajs {time.time() - start:.0f}s', flush=True)

        if v['obj'] in ["emd"]:
            if v['critic']["reinitialize"] or itr == 0:
                critic = Critic(len(state_indices),
                                **v['critic'],
                                device=device)
            start = time.time()
            critic_loss = critic.learn(expert_samples.copy(),
                                       agent_emp_states,
                                       iter=v['critic']['iter'])
            print(f'train critic {time.time() - start:.0f}s', flush=True)
        # Initialize a density model using KDE
        elif v['density']['model'] == 'kde':
            agent_density = neighbors.KernelDensity(
                bandwidth=v['density']['kde']['bandwidth'],
                kernel=v['density']['kde']['kernel'])
            agent_density.fit(agent_emp_states)
        elif v['density']['model'] == "disc":
            start = time.time()
            # learn log density ratio
            disc = Disc(len(state_indices),
                        **v['density']['disc'],
                        device=device)
            disc_loss = disc.learn(expert_samples.copy(),
                                   agent_emp_states,
                                   iter=v['density']['disc']['iter'])
            print(f'train disc {time.time() - start:.0f}s', flush=True)

        old_reward = copy.deepcopy(reward_func)
Example #27
    def draw_fit_with_peaks(self, num_walkers, num_steps_to_include):

        num_dim = len(self.a_free_par_guesses)

        sPathToFile = self.s_directory_save_name + self.dict_filename

        if os.path.exists(sPathToFile):
            dSampler = pickle.load(open(sPathToFile, 'r'))
            l_chains = []
            for sampler in dSampler[num_walkers]:
                l_chains.append(sampler['_chain'])

            a_sampler = np.concatenate(l_chains, axis=1)

            print 'Successfully loaded sampler!'
        else:
            print sPathToFile
            print 'Could not find file!'
            sys.exit()
        """
        print 'Test: hard coding steps and walkers'
        num_steps_to_include = 1
        num_walkers = 1
        
        a_sampler = a_sampler[:num_walkers, -num_steps_to_include:, :].reshape((-1, num_dim))
        #a_sampler = a_sampler[:, -num_steps_to_include:, :].reshape((-1, num_dim))
        a_medians = np.median(a_sampler, axis=0)


        l_num_pe = [0, 1, 2, 3, 4, 5, 6]
        l_colors = ['r', 'b', 'g', 'c', 'y', 'm', 'brown']
        prob_hit_first, mean_e_from_dynode, probability_electron_ionized, bkg_mean, bkg_std, mean_num_pe, scale_par = a_medians
        """
        """
        a_sampler = a_sampler[:, -num_steps_to_include:, :].reshape((-1, num_dim))
        dd_hist, l_bins = np.histogramdd(a_sampler, bins=5)
        
        
        l_max_bins = np.unravel_index(dd_hist.argmax(), dd_hist.shape)
        
        l_bin_centers = [0 for i in xrange(len(l_max_bins))]
        # find bin centers from max
        for i in xrange(len(l_max_bins)):
            l_bin_centers[i] = (l_bins[i][l_max_bins[i]+1] + l_bins[i][l_max_bins[i]]) / 2.
        
        l_num_pe = [0, 1, 2, 3, 4, 5, 6]
        l_colors = ['r', 'b', 'g', 'c', 'y', 'm', 'brown']
        prob_hit_first, mean_e_from_dynode, probability_electron_ionized, bkg_mean, bkg_std, mean_num_pe, scale_par = l_bin_centers
        """

        max_num_events_for_kde = 5e4
        assert num_steps_to_include * num_walkers < max_num_events_for_kde, 'Using KDE to estimate maximum in full space so must use less than %d events for time constraints.\n' % (
            int(max_num_events_for_kde))
        a_sampler = a_sampler[:, -num_steps_to_include:, :].reshape(
            (-1, num_dim))

        scaler = preprocessing.StandardScaler()
        scaler.fit(a_sampler)
        a_scaled_samples = scaler.transform(a_sampler)

        #print a_sampler[:,1:3]
        #print a_scaled_samples

        # find the best fit bandwith since this allows us
        # to play with bias vs variance
        grid = grid_search.GridSearchCV(
            neighbors.KernelDensity(),
            {'bandwidth': np.linspace(0.01, 2., 20)},
            cv=4,
            verbose=1,
            n_jobs=4)
        print '\nDetermining best bandwidth...\n'
        grid.fit(a_scaled_samples)
        #print grid.best_estimator_

        kde = neighbors.KernelDensity(**grid.best_params_)
        kde.fit(a_scaled_samples)

        def func_for_minimizing_for_plot(a_parameters):
            a_scaled_parameters = scaler.transform(a_parameters)
            return -kde.score(a_scaled_parameters)

        #a_bounds = [(0.75, 1), (1, 25), (0, 1.0), (1e3, 1e5), (5e4, 8e5), (0.6, 3.), (0.2, 2)]
        a_bounds = [
            np.percentile(a_sampler[:, i], [2, 98]) for i in xrange(num_dim)
        ]
        result = op.differential_evolution(func_for_minimizing_for_plot,
                                           a_bounds,
                                           disp=True,
                                           maxiter=100,
                                           tol=0.01,
                                           popsize=20,
                                           polish=True)

        print result.x

        l_num_pe = [0, 1, 2, 3, 4, 5, 6]
        l_colors = ['r', 'b', 'g', 'c', 'y', 'm', 'brown']
        prob_hit_first, mean_e_from_dynode, width_e_from_dynode, probability_electron_ionized, bkg_mean, bkg_std, mean_num_pe, scale_par = result.x

        l_hists = [
            np.zeros(len(self.d_fit_files['bin_centers_plots']),
                     dtype=np.float32) for i in xrange(len(l_num_pe))
        ]
        sum_hist = np.zeros(len(self.d_fit_files['bin_centers_plots']),
                            dtype=np.float32)

        mean_num_pe = np.asarray(mean_num_pe, dtype=np.float32)

        num_trials = np.asarray(self.num_mc_events, dtype=np.int32)

        prob_hit_first = np.asarray(prob_hit_first, dtype=np.float32)
        mean_e_from_dynode = np.asarray(mean_e_from_dynode, dtype=np.float32)
        width_e_from_dynode = np.asarray(width_e_from_dynode, dtype=np.float32)
        probability_electron_ionized = np.asarray(probability_electron_ionized,
                                                  dtype=np.float32)
        bkg_mean = np.asarray(bkg_mean, dtype=np.float32)
        bkg_std = np.asarray(bkg_std, dtype=np.float32)

        bin_edges = np.asarray(self.d_fit_files['bin_edges_plots'],
                               dtype=np.float32)
        num_bins = np.asarray(len(bin_edges) - 1, dtype=np.int32)

        sum_of_hists = 0

        for i, num_pe in enumerate(l_num_pe):
            current_hist = l_hists[i]
            num_trials = np.asarray(
                int(self.num_mc_events *
                    scipy.stats.poisson.pmf(num_pe, mean_num_pe)),
                dtype=np.int32)
            num_pe = np.asarray(num_pe, dtype=np.int32)

            l_args_gpu = [
                self.rng_states,
                drv.In(num_trials),
                drv.In(self.num_loops),
                drv.InOut(current_hist),
                drv.In(num_pe),
                drv.In(prob_hit_first),
                drv.In(mean_e_from_dynode),
                drv.In(width_e_from_dynode),
                drv.In(probability_electron_ionized),
                drv.In(bkg_mean),
                drv.In(bkg_std),
                drv.In(num_bins),
                drv.In(bin_edges)
            ]

            gpu_fixed_pe_cascade_spectrum(*l_args_gpu, **self.d_gpu_scale)
            sum_of_hists += np.sum(current_hist)

            l_hists[i] = current_hist

        for i, num_pe in enumerate(l_num_pe):
            current_hist = l_hists[i]
            current_hist = np.asarray(current_hist, dtype=np.float32) * np.sum(
                self.d_fit_files['hist']
            ) / sum_of_hists * self.d_fit_files[
                'bin_width'] / self.d_fit_files['bin_width_plots'] * scale_par
            sum_hist += current_hist
            l_hists[i] = current_hist

        f1, (ax1) = plt.subplots(1)
        ax1.set_yscale('log', nonposy='clip')

        a_x_values, a_y_values, a_x_err_low, a_x_err_high, a_y_err_low, a_y_err_high = neriX_analysis.prepare_hist_arrays_for_plotting(
            self.d_fit_files['hist'], self.d_fit_files['bin_edges'])
        ax1.errorbar(a_x_values,
                     a_y_values,
                     xerr=[a_x_err_low, a_x_err_high],
                     yerr=[a_y_err_low, a_y_err_high],
                     color='k',
                     fmt='.')
        for i in xrange(len(l_num_pe)):
            ax1.plot(self.d_fit_files['bin_centers_plots'],
                     l_hists[i],
                     color=l_colors[i])
        ax1.plot(self.d_fit_files['bin_centers_plots'],
                 sum_hist,
                 color='darkorange',
                 linestyle='-')

        ax1.set_title('Integrated Charge Spectrum - %s' %
                      (self.file_identifier))
        ax1.set_xlabel(r'Integrated Charge [$e^{-}$]')
        ax1.set_ylabel('Counts')

        # test
        """
        num_bins_plots = len(self.d_fit_files['bin_centers_plots'])

        a_hist = np.zeros(num_bins_plots, dtype=np.float32)
            
        a_hist_pure = np.zeros(num_bins_plots, dtype=np.float32)
        
        l_args_gpu = [self.rng_states, drv.In(num_trials), drv.In(self.num_loops), drv.InOut(a_hist), drv.In(mean_num_pe), drv.In(prob_hit_first), drv.In(mean_e_from_dynode), drv.In(probability_electron_ionized), drv.In(bkg_mean), drv.In(bkg_std), drv.In(num_bins), drv.In(bin_edges)]
    
    
        #start_time_mpe1 = time.time()
        gpu_cascade_model(*l_args_gpu, **self.d_gpu_scale)
        #print 'Time for MPE1 call: %f s' % (time.time() - start_time_spe)
        a_model = np.asarray(a_hist, dtype=np.float32)*np.sum(self.d_fit_files['hist'])/np.sum(a_hist)*self.d_fit_files['bin_width']/self.d_fit_files['bin_width_plots']*scale_par

        ax1.plot(self.d_fit_files['bin_centers_plots'], a_model, color='pink', linestyle='--')
        """

        f1.savefig('%s%s_pe_specs_%s.png' %
                   (self.s_directory_save_plots_name, self.s_base_save_name,
                    self.file_identifier))
Example #28
fac_dec = 2.5
fac_sigma = 2.
# ###########################################################################

logE = fac_logE * exp["logE"]
sigma = fac_sigma * np.rad2deg(exp["sigma"])
dec = fac_dec * exp["dec"]

sample = np.vstack((
    fac_logE * exp["logE"],
    fac_dec * exp["dec"],  # Normal space to have no hard cuts at the edges
    fac_sigma * np.rad2deg(exp["sigma"])  # In deg to match scale
)).T

# Optimize bandwidth in a cross validation.
kde_estimator = skn.KernelDensity(kernel="gaussian", rtol=1e-6)

# Scan grid. See comment on top on parameter ranges
SCAN = "followup_2nd_pass"
start = 0.1
step = 0.001
stop = 0.12 + step

bandwidths = np.arange(start, stop, step)
ncv = 20
param_grid = {"bandwidth": bandwidths}

model_selector = skms.GridSearchCV(
    estimator=kde_estimator,
    cv=ncv,
    param_grid=param_grid,
)
Example #29
n_event = len(fnames)
job_list = allocate_jobs(n_event, n_procs, rank)
for i in job_list:

    # read samples
    samples = np.genfromtxt(os.path.join(data_dir, \
          fnames[i]))
    d_l_samples.append(samples)
    pkl_fname = os.path.join(data_dir, \
           fnames[i].replace('.txt', '.pkl'))

    if recompute:

        # fit KDE to samples and pickle for later
        print 'fitting ' + fnames[i]
        gs = skms.GridSearchCV(skn.KernelDensity(), \
                {'bandwidth': bw_grid})
        gs.fit(samples[:, None])
        kde = gs.best_estimator_
        pickle.dump(kde, open(pkl_fname, 'wb'))
        print 'optimal bw: {:9.3e}'.format(kde.bandwidth)

        # plot fits
        hist, bin_edges = np.histogram(samples, bins=50, \
                  density=True)
        bin_centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0
        pdf = np.exp(kde.score_samples(d_l_grid[:, None]))
        if rank == 0:
            axes[0].plot(bin_centres, hist)
            axes[1].plot(d_l_grid, pdf)
Example #30
    def hkcmfind(self,
                 magname='Kmag',
                 dmagname='eKmag',
                 niter=1000,
                 kernel='epanechnikov'):
        # Set the data to find the TRGB
        mk = self.data.hmag.dropna() - self.data.kmag.dropna()
        dmk = np.sqrt(self.data.kerr.dropna()**2 + self.data.herr.dropna()**2)

        mk = -mk

        #Initialise stuff
        rtol = 1e-5  # Relative tolerance of the result

        mx = np.linspace(max(mk) * 1.2, min(mk) * 0.8, 1000)
        trgbloc = np.zeros(niter)

        #----------------------------------------
        #Generate NITER realisations of the Kernel Density Estimation
        for i in range(niter):
            msamp = np.random.normal(
                mk, dmk
            )  # Add Noise to data -> diff. each loop -> more reliable TRGB

            # Find an ideal binwidth for the luminosity function
            # PS: Monte Carlo already smooths the distribution, so reduce the ideal binwidth a bit.

            bandwidth_factor = 0.25
            bandwidth = bandwidth_factor * (np.std(msamp) *
                                            (len(msamp)**(-0.2)))

            #----------------------------------------
            # Implement the Kernel density estimation using a KD Tree for efficient queries

            kde = neighbors.KernelDensity(bandwidth=bandwidth,
                                          rtol=rtol,
                                          kernel=kernel)  # Inialise
            kde.fit(
                msamp[:,
                      np.newaxis])  # Fit the Kernel Density model on the data.
            #kde.score_samples #returns ln(pdf)   # Evaluate the density model on data - probablility density function
            pdf = np.exp(
                kde.score_samples(mx[:, np.newaxis])
            )  #MX is x-axis range which the PDF is computed/plotted.

            plt.plot(mx, pdf)

            #----------------------------------------
            # Set the Edge Detection part using a savgol_filter

            smooth_window = 31
            poly_degree = 3
            dpdf = savgol_filter(pdf, smooth_window, poly_degree, deriv=1)
            trgbloc[i] = mx[np.argmin(
                dpdf
            )]  # Most negative value corresponds to highest rate of decrease

        trgbloc_mean = np.mean(trgbloc)  # Find the TRGB
        trgbloc_sd = np.std(trgbloc)  # Find the Error in the TRGB estimate

        return [trgbloc_mean, trgbloc_sd]