Example #1
    def fit(self, X):
        if self._scaling:
            self._scaler = StandardScaler()
            X = self._scaler.fit_transform(X)

        X = X[:512]  # use at most the first 512 samples

        self._kde = KernelDensity().fit(X)

        return self
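The snippet above is a method fragment. A minimal self-contained sketch of the same scale-then-fit pattern (the wrapper class DensityModel and its scaling flag are hypothetical, not from the original source):

import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import StandardScaler

class DensityModel:
    def __init__(self, scaling=True):
        self._scaling = scaling

    def fit(self, X):
        if self._scaling:
            self._scaler = StandardScaler()
            X = self._scaler.fit_transform(X)
        X = X[:512]  # use at most the first 512 samples
        self._kde = KernelDensity().fit(X)
        return self

model = DensityModel().fit(np.random.randn(1000, 2))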
Example #2
def evaluate_vec(real_points, fake_points, validation_fake_points=None):
    """Compute the average log-likelihood and the Coverage metric.
        Coverage metric is defined in arXiv paper. It counts a mass of true
        data covered by the 95% quantile of the model density.
        """

    # Median-heuristic bandwidth: median distance between consecutive fake points
    dist = fake_points[:-1] - fake_points[1:]
    dist = dist * dist
    dist = np.sqrt(np.sum(dist, axis=1))
    bandwidth = np.median(dist)
    num_real = len(real_points)
    num_fake = len(fake_points)
    if validation_fake_points is not None:
        max_score = -1000000.
        num_val = len(validation_fake_points)
        b_grid = bandwidth * (2.**(np.arange(14) - 7.))
        for _bandwidth in b_grid:
            kde = KernelDensity(kernel='gaussian', bandwidth=_bandwidth)
            kde.fit(np.reshape(fake_points, [num_fake, -1]))
            score = np.mean(
                kde.score_samples(
                    np.reshape(validation_fake_points, [num_val, -1])))
            if score > max_score:
                # logging.debug("Updating bandwidth to %.4f"
                #               " with likelihood %.2f" % (_bandwidth, score))
                bandwidth = _bandwidth
                max_score = score
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(np.reshape(fake_points, [num_fake, -1]))

    # Computing Coverage, refer to Section 4.3 of arxiv paper
    model_log_density = kde.score_samples(
        np.reshape(fake_points, [num_fake, -1]))
    # np.percentile(a, 5) returns t such that np.mean(a <= t) = 0.05
    threshold = np.percentile(model_log_density, 5)
    real_points_log_density = kde.score_samples(
        np.reshape(real_points, [num_real, -1]))
    ratio_not_covered = np.mean(real_points_log_density <= threshold)

    C = 1. - ratio_not_covered

    return C
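A quick smoke test for evaluate_vec (assuming numpy and KernelDensity are imported as elsewhere in these examples): when real and fake points come from the same distribution, the coverage should land near 0.95.

rng = np.random.RandomState(0)
real = rng.randn(500, 2)
fake = rng.randn(500, 2)
val = rng.randn(200, 2)
C = evaluate_vec(real, fake, validation_fake_points=val)
print("coverage:", C)  # close to 0.95 when real and fake match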
Example #3
def est_KL(N=5000):
    tar_samps = banana(N)
    param_init_S1_quad = np.array([1.69, -0.827, -0.37, -0.2])
    res_S1_quad = minimize(objective_SAA_quad, param_init_S1_quad, args=(tar_samps[:,0], 1)) # Only take the first column since this is for S1
    param_final_S1_quad = res_S1_quad.x
    print('First component of inverse computed')
    param_init_S2_quad = np.array([0.25, 1.5, 1.55, 0.15, -0.7, -0.1, -2.5])
    res_S2_quad = minimize(objective_SAA_quad, param_init_S2_quad, args=(tar_samps, 2))
    param_final_S2_quad = res_S2_quad.x
    print('Second component of inverse computed')
    
    new_ref_samps = st.norm.rvs(size=(N,2))
    T1_approx = T1_quad(new_ref_samps[:,0], param_final_S1_quad)
    print('First component of forward computed')
    # Feed T1_approx in where z1 appears in S2
    T2_approx = T2_quad(new_ref_samps[:,1], T1_approx, param_final_S2_quad)
    print('Second component of forward computed')
    # Now calculate the KL between [T1_approx, T2_approx] and new_tar_samps
    T12_kde_approx = np.column_stack((T1_approx,T2_approx))
    kde_approx = KernelDensity(kernel='gaussian').fit(T12_kde_approx) # Fit the approximate forward map
    log_dens_approx = kde_approx.score_samples(T12_kde_approx)

    # generate some new target_samps
    new_tar_samps = banana(N)
    T12_kde_true = new_tar_samps
    kde_true = KernelDensity(kernel='gaussian').fit(T12_kde_true)
    log_dens_true = kde_true.score_samples(T12_kde_true)


    PI_true = np.exp(log_dens_true)
    PI_approx = np.exp(log_dens_approx)
    
    # KL divergence between the approximate and the true KDE densities
    S = entropy(PI_approx, PI_true)
    return S
Example #4
    def fit(self, X, y):
        """
        Train the model.

        :param X: A Nx3 array, where the features are
                  distance(Angstrom)/10, angle1(rad), angle2(rad)
                  The **distance** is the closest distance between the two line
                  segments (i.e. coarse-grained elements).
                  **angle1** is the angle between the line along the stem vector
                  and the line along the shortest connection between the two
                  elements. As an angle between two straight lines, it is
                  defined between 0 and 90 degrees.
                  **angle2** is the angle between the connecting vector
                  (pointing from the stem to the loop), projected onto the
                  plane normal to the stem direction and the twist vector
                  (location of minor groove) at the point closest to the
                  interaction. As an angle between two vectors, it is
                  defined between 0 and 180 degrees.
        :param y: An array of length N. 0 means no interaction,
                  1 means interaction.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        log.info("Trainings-data has shape %s", X.shape)
        log.info("We have %s known interactions ", sum(y))
        if X.shape[1] != 3:
            raise TypeError(
                "Expect exactly 3 features, found {}".format(X.shape[1]))
        if not all(yi in [0, 1] for yi in y):
            raise ValueError("y should only contain the values 1 and 0")
        ame = X[np.where(y)]
        non_ame = X[np.where(y == 0)]
        if self.symmetric:
            ame = self._make_symmetric(ame)
            non_ame = self._make_symmetric(non_ame)
        log.info("Fitting. First positive sample: %s", X[np.where(y)][0])
        self.ame_kde_ = KernelDensity(kernel=self.kernel,
                                      bandwidth=self.bandwidth).fit(ame).score_samples
        self.non_ame_kde_ = KernelDensity(kernel=self.kernel,
                                          bandwidth=self.bandwidth).fit(non_ame).score_samples
        self.X_ = X
        self.y_ = y
Example #5
    def computePdfKdeSklearn(self, dataset):
        '''
        Compute the pdf and its values for elements in dataset.
        '''
        bwSklearn = estimate_bandwidth(dataset)
        print("bwSklearn is " + str(bwSklearn))
        kde = KernelDensity(kernel='gaussian',
                            bandwidth=bwSklearn).fit(dataset)
        logPdf = kde.score_samples(dataset)
        pdf = np.exp(logPdf)
        return pdf
Example #6
def get_kde_pdf(X, bandwidth=2, step=.1, num_samples=200, optimize=False):
    """
    return kde and pdf from a data sample
    """
    if len(X) == 0:
        return [], np.array([]), []
    if optimize:
        bandwidths = 10**np.linspace(-1, 1, 10)
        # LeaveOneOut() takes no arguments in sklearn.model_selection
        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            {'bandwidth': bandwidths},
                            cv=LeaveOneOut())
        grid.fit(X[:, None])
        kde = KernelDensity(kernel='gaussian',
                            bandwidth=grid.best_params_['bandwidth']).fit(
                                X[:, None])
    else:
        # use the bandwidth argument rather than a hard-coded value
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X[:, None])
    pdf = np.exp(kde.score_samples(np.arange(0, 100, step)[:, None]))
    samples = kde.sample(num_samples)
    return kde, np.array(pdf), samples
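Note that get_kde_pdf evaluates the density on the fixed grid np.arange(0, 100, step), so it implicitly assumes the data lives in [0, 100]. A quick sanity check with synthetic data:

X = np.random.RandomState(1).uniform(20, 80, size=300)
kde, pdf, samples = get_kde_pdf(X, bandwidth=2)
print(pdf.sum() * 0.1)  # Riemann sum over the default grid, close to 1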
Example #7
    def train(self):
        [parttraindata,
         validationdata] = datapreparation.splitTraindata(self.trainData)
        # individual modeling
        kdeModel = KernelDensity(kernel=self.kernel,
                                 bandwidth=self.bandwidth).fit(parttraindata)
        # modeling based on others' data
        # for each dim, train a model
        otherKdeModel = []
        for i in range(0, len(self.trainDataOfNeighborsDim)):
            aModel = KernelDensity(kernel=self.kernel,
                                   bandwidth=self.bandwidthNeighbor[i]).fit(
                                       self.trainDataOfNeighborsDim[i])
            otherKdeModel.append(aModel)

        # mixture modeling
        self.trainedModel = _mixturemodels.FixBwMixtureModels(
            parameters=None, models=[kdeModel] + otherKdeModel)
        em.runEM(validationdata, mixmodels=self.trainedModel)
        print(self.trainedModel.params)
Example #8
def calculate_kde(points,
                  df_osm_built,
                  df_osm_pois=None,
                  bandwidth=400,
                  X_weights=None,
                  pois_weight=9,
                  log_weight=True):
    """
	Evaluate the probability density function using Kernel Density Estimation of input geo-localized data
	KDE's bandwidth related to walkable-distances

	Parameters
	----------
	df : pandas.DataFrame
		input data with column [geometry] containing shapely geometries
	XX_YY : pandas.Panel
		meshgrid to evaluate the probability density function
	bandwidth:

	Returns
	----------
	pandas.Series
		
	"""
    # X_b : Buildings array
    X_b = [[p.x, p.y] for p in df_osm_built.geometry.centroid.values]

    # X_p : POIs array
    if df_osm_pois is None:
        X_p = []
    else:
        X_p = [[p.x, p.y] for p in df_osm_pois.geometry.centroid.values]

    # X : Full array
    X = np.array(X_b + X_p)

    # Points where the probability density function will be evaluated
    Y = np.array([[p.x, p.y] for p in points.values])

    if X_weights is not None:  # Weighted Kernel Density Estimation
        # Buildings' weights + POIs' weights
        X_W = np.concatenate(
            [X_weights.values,
             np.repeat([pois_weight], len(X_p))])

        if log_weight:  # Apply logarithm
            X_W = np.log(X_W)

        PDF = WeightedKernelDensityEstimation(X, X_W, bandwidth, Y)
        return pd.Series(PDF / PDF.max())
    else:  # Kernel Density Estimation
        # Sklearn
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X)
        # Sklearn returns the results in the form log(density)
        PDF = np.exp(kde.score_samples(Y))
        return pd.Series(PDF / PDF.max())
Example #9
def kernel_density_estimation(predicted_positions):
    l = predicted_positions.shape[0] // 2
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=0.2).fit(predicted_positions.T)
    sc = kde.score_samples(predicted_positions.T)

    # argmax of the log-density; exponentiating doesn't change the argmax,
    # and naming the result `max` would shadow the builtin
    best = np.argmax(sc)
    landmarks = predicted_positions[:, best]
    landmarks = np.reshape(landmarks, (l, 2))

    return landmarks
Example #10
    def __init__(self, space_dim, done_fktn, predict_change=False, sample_rejection=False):
        self.input_dim = space_dim + 1
        self.output_dim = self.input_dim - 1
        self.X = None
        self.Y = None
        self.done = done_fktn
        self.type = 'GP'
        self.predict_change = predict_change
        self.sample_rejection = sample_rejection
        self.nb_samples = 500
        self.kde = KernelDensity(bandwidth=10 / (space_dim * np.power(1000, 1 / space_dim)))
Example #11
def marcenko_pastur_loss(sigma,
                         n_features,
                         n_obs,
                         e_val,
                         bwidth,
                         kernel='gaussian',
                         n_pts=1000):
    """
    Return the loss (sum of squared errors) from the Marcenko-Pastur distribution.
    
    Arguments
    ---------
    sigma : float
      Standard deviation of observations.
    n_features : int
      Number of features.
    n_obs : int
      Number of observations (in time).
    e_val : np.ndarray
      Array of eigenvalues.
    bwidth : float
      Bandwidth value.
    kernel : str
      Name of the kernel used to fit observations.
    n_pts : int
      Number of points to sample the PDF.
    
    Notes
    -----
      Function adapted from "Machine Learning for Asset Managers",
      Marcos López de Prado (2020).
    """

    # Compute Theoretical PDF
    pdf0 = marcenko_pastur_pdf(n_features, n_obs, sigma, n_pts)

    # Compute Empirical PDF
    # Fit kernel to a series of observations
    if len(e_val.shape) == 1:
        e_val = e_val.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bwidth).fit(e_val)
    # Create index
    x = pdf0.index.values
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    # Derive the probability of observations
    log_density = kde.score_samples(x)
    pdf1 = pd.Series(np.exp(log_density), index=x.flatten())

    # Return loss
    loss = np.sum((pdf1 - pdf0)**2)

    return loss
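In the book this loss is minimized over sigma to fit the Marcenko-Pastur distribution to the observed eigenvalues. A sketch of that outer loop (the function name fit_marcenko_pastur is hypothetical; marcenko_pastur_loss and marcenko_pastur_pdf are assumed to be in scope):

from scipy.optimize import minimize

def fit_marcenko_pastur(e_val, n_features, n_obs, bwidth=0.25):
    # minimize the squared-error loss over sigma, bounded away from 0
    out = minimize(lambda s: marcenko_pastur_loss(s[0], n_features, n_obs,
                                                  e_val, bwidth),
                   x0=[0.5], bounds=[(1e-5, 1.0)])
    return out.x[0] if out.success else 1.0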
Example #12
def contour_plot(points, x_label, y_label):
    kde = KernelDensity(kernel='epanechnikov', bandwidth=0.01).fit(points)
    x_limits = np.min(points[:, 0]), np.max(points[:, 0])
    y_limits = np.min(points[:, 1]), np.max(points[:, 1])
    x, y = np.meshgrid(np.linspace(*x_limits, 300), np.linspace(*y_limits, 300))
    xy = np.stack([x.ravel(), y.ravel()]).T
    z = kde.score_samples(xy).reshape(x.shape)
    levels = np.linspace(z.max() - 10., z.max(), 100)
    plt.contourf(x, y, np.exp(z), levels=np.exp(levels), cmap=plt.cm.gist_rainbow)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
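The Epanechnikov bandwidth of 0.01 suits points on a roughly unit scale; a minimal call with synthetic data (numpy and matplotlib imports assumed as in the snippet):

pts = np.random.RandomState(0).rand(500, 2)
contour_plot(pts, 'x', 'y')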
Example #13
def kde_fit_cv(x, bs=None, cv=10):
    """
    x is [n x p]
    bs is [k x 1] list of bandwidth values to compare
    cv is int, the number of cross-validation folds
    """
    if bs is None:
        bs = np.linspace(0.1, 1.0, 30)
    grid = GridSearchCV(KernelDensity(), {'bandwidth': bs}, cv=cv)
    grid.fit(x)
    # print(grid.best_params_)
    return grid.best_estimator_, grid.best_params_['bandwidth']
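Usage is straightforward; GridSearchCV refits the best estimator by default, so the returned model is ready to score samples:

x = np.random.RandomState(0).randn(200, 1)
best_kde, best_bw = kde_fit_cv(x, cv=5)
print("selected bandwidth:", best_bw)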
Example #14
def approximateLogLiklihood(x_generated, x_test, searchSpace = np.logspace(-4, 0, 5)):
    x_generated = np.array(x_generated).reshape((len(x_generated),-1))
    x_test = np.array(x_test).reshape((len(x_test),-1))
    # use grid search cross-validation to optimize the bandwidth
    print "new"
    params = {'bandwidth': searchSpace}
    grid = GridSearchCV(KernelDensity(), params, n_jobs=4)
    grid.fit(x_generated)
    print grid.best_params_
    kde = grid.best_estimator_
    scores = kde.score_samples(x_test)
    return np.sum(scores)/len(scores)
Example #15
    def __init__(self, data, mirror=False, **kwds):
        self.mirror = mirror
        if not kwds:  # **kwds collects an empty dict, never None
            if self.mirror:
                self.kde_object = KernelDensity(kernel='gaussian').fit(
                    np.vstack([-data, data]).reshape(-1, 1))
            else:
                self.kde_object = KernelDensity(kernel='gaussian').fit(data)
        else:
            if self.mirror:
                self.kde_object = KernelDensity(**kwds).fit(
                    np.vstack([-data.reshape(-1, 1),
                               data.reshape(-1, 1)]))
            else:
                self.kde_object = KernelDensity(**kwds).fit(data.reshape(-1, 1))
        try:
            self.d = data.shape[1]
        except IndexError:
            self.d = 1
        self.n = data.shape[0]
Example #16
    def crossValidation(self, data):
        '''
        Compute the bandwidth using cross-validation.
        Input:
        data -- a numpy array
        Output: [float]
        '''
        grid = GridSearchCV(KernelDensity(),
                            {'bandwidth': np.linspace(.1, 1.0, 30)},
                            cv=20)
        grid.fit(data)
        return grid.best_params_['bandwidth']
Example #17
def plot_scatter(X, scale, out_prefix, title, kde=True):
    """Draws a 2D scatter plot (png) of the core and accessory distances

    Also draws contours of the kernel density estimate

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
        scale (numpy.array)
            Scaling factor from :class:`~PopPUNK.models.BGMMFit`
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours

            (default = True)
    """
    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)

        # KDE estimate
        kde = KernelDensity(bandwidth=0.03,
                            metric='euclidean',
                            kernel='epanechnikov',
                            algorithm='ball_tree')
        kde.fit(X)
        z = np.exp(kde.score_samples(xy))
        z = z.reshape(xx.shape).T

        levels = np.linspace(z.min(), z.max(), 10)
        plt.contour(xx * scale[0],
                    yy * scale[1],
                    z,
                    levels=levels[1:],
                    cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1

    plt.scatter(X[:, 0] * scale[0].flat,
                X[:, 1] * scale[1].flat,
                s=1,
                alpha=scatter_alpha)

    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
Example #18
def check_if_events_in_cluster(points, events, event_time,
                               n_selection=n_selection_po, multiprocess=True,
                               event_type='po', ):
    #pylint: disable=redefined-outer-name
    '''check if a list of events are in the 4D cluster.'''
    output = {'event_number': [], 'run_number': [], 'in_veto_volume': [], }
    data_arr_nowall = remove_wall_points_np(data_arr_from_points(points))
    #print(data_arr_nowall.shape)
    if not data_arr_nowall.shape[0]:
        warn.warn('No points left in cluster after removing wall points',
                  RuntimeWarning)
        for row in events.iterrows():
            output['event_number'].append(row[1].event_number)
            output['run_number'].append(row[1].run_number)
            output['in_veto_volume'].append(False)
        return output
    if events.empty:
        return output
    data_arr_scores = kde_likelihood(data_arr_nowall,
                                     multiprocess=multiprocess,
                                     event_type=event_type)
    data_arr_selected = data_arr_scores[-len(data_arr_scores)//n_selection:]
    db = DBSCAN(eps=DBSCAN_radius,
                min_samples=DBSCAN_samples)\
                .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double),
                                       ('y', np.double),
                                       ('z', np.double),
                                       ('t', np.double),
                                       ('score', np.double),
                                       ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time)/(2*timestep)
        score = selected_fit.score([[row[1].x_3d_nn,
                                     row[1].y_3d_nn,
                                     row[1].z_3d_nn,
                                     t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        # score() is -inf when the point lies outside the tophat kernel support
        output['in_veto_volume'].append(score != -np.inf)
    return output
Example #19
def get_minima(a, bandwidth, isPlot):

    a_low = a.min() - 2
    a_hi = a.max() + 2

    a = a.reshape(-1, 1)

    num_x = int(300 + a_hi - a_low)  # np.linspace needs an integer sample count

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(a)

    s = np.linspace(a_low, a_hi, num=num_x)

    s = s.reshape(-1, 1)

    e = kde.score_samples(s)

    e = np.exp(e)

    # Fill in low values for -inf entries to keep the function differentiable
    #
    # e_min = np.min(e[np.isfinite(e)])
    # e [np.isneginf(e) ] = (e_min - 1)

    if isPlot:
        plt.plot(s, e)
        plt.show()

    mi = argrelextrema(e, np.less)[0]

    s = s.squeeze()

    minima = s[mi]

    nMinima = len(minima)

    l = []

    if nMinima == 0:
        # print("No minima found, no split")
        l.append(a)

    else:
        l.append(a[a < minima[0]])

        for i in range(0, nMinima - 1):
            l.append(a[(a > minima[i]) & (a < minima[i + 1])])
        """c"""

        l.append(a[a > minima[nMinima - 1]])

    return l
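A usage sketch for get_minima (numpy and scipy.signal.argrelextrema imports assumed, as in the snippet): clearly bimodal data should be split into two groups at the density minimum.

rng = np.random.RandomState(0)
a = np.concatenate([rng.normal(0, 1, 200), rng.normal(8, 1, 200)])
groups = get_minima(a, bandwidth=1.0, isPlot=False)
print([len(g) for g in groups])  # two groups of roughly 200 each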
Example #20
    def set_args_params(self, args):
        self.args = args
        self.use_index = self.args.use_index
        self.reward_type = self.args.reward_type
        self.ep_length = self.args.ep_length
        self.always_render = self.args.render
        self.use_global_density = self.args.use_global_density
        self.use_extrinsic_reward = self.args.use_extrinsic_reward

        self.kde_goal = KernelDensity(kernel='gaussian',
                                      bandwidth=self.args.goal_bandwidth)
        self.kde_tra = KernelDensity(kernel='gaussian',
                                     bandwidth=self.args.trajectory_bandwidth)

        self.set_observation_space()
        print('use index', self.use_index)
        print('reward_type', self.reward_type)
        print('ep_length', self.ep_length)

        if self.always_render:
            self.viewer = self._get_viewer('human')
            self.viewer._run_speed = 100
Example #21
def train_KDE_model(train_df, bandwidth=KDE_BANDWITH):
    """
    Train a KDE model on the coordinates of incidents.
    """

    kde = KernelDensity(bandwidth=bandwidth,
                        metric='haversine',
                        kernel='gaussian',
                        algorithm='ball_tree')

    # the haversine metric expects [latitude, longitude] in radians
    kde.fit(train_df[['latitude', 'longitude']] * np.pi / 180)

    return kde
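Query points must be converted to radians the same way as the training coordinates; a hedged usage sketch with hypothetical incident data (numpy and pandas imports assumed):

incidents = pd.DataFrame({'latitude': np.random.uniform(52.3, 52.4, 100),
                          'longitude': np.random.uniform(4.8, 4.9, 100)})
kde = train_KDE_model(incidents, bandwidth=0.01)
query = np.radians([[52.35, 4.85]])
print(kde.score_samples(query))  # log-density at the query point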
Example #22
def weighted_density(coords, gridcoords, weights, bandwidth=0.004, atol=0.01):
    """
    Compute a weighted density estimate
    :param coords: NP-array (N X 3) of coordinates
    :param gridcoords: 3D matrix of coordinates on which the kde will be evaluated
    :param weights: NP-array (N x 1) of values for each point indicated by the coordinates
    :param bandwidth: bandwidth of the Gaussian kernel to be used
    :param atol: absolute tolerance passed to KernelDensity to speed up scoring
    :return: log-density estimate at gridcoords
    """
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        atol=atol).fit(coords, sample_weight=weights)
    density = kde.score_samples(gridcoords)  # return log scores
    return density
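sample_weight support in KernelDensity.fit requires scikit-learn >= 0.20. A minimal call on synthetic data:

rng = np.random.RandomState(0)
coords = rng.randn(200, 3) * 0.05
weights = rng.rand(200)
grid = np.mgrid[-0.1:0.1:10j, -0.1:0.1:10j, -0.1:0.1:10j].reshape(3, -1).T
log_density = weighted_density(coords, grid, weights)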
Example #23
def get_numerical_signature(values, S):
    '''
    Learns a distribution of the values,
    then generates a sample of size S.
    '''
    # Transform data to numpy array
    Xnumpy = np.asarray(values)
    X = Xnumpy.reshape(-1, 1)
    # Learn kernel
    kde = KernelDensity(kernel=C.kd["kernel"],
                        bandwidth=C.kd["bandwidth"]).fit(X)
    # draw all S samples in one call instead of S single-sample calls
    sig_v = list(kde.sample(S)[:, 0])
    return sig_v
Example #24
def cross_valid_bw(X):

    std_data = np.linalg.norm(np.std(X, axis=0))
    print('bw_CV')
    grid = GridSearchCV(
        KernelDensity(kernel='tophat'),
        {'bandwidth': np.linspace(0.2 * std_data, 1.5 * std_data, 15)},
        cv=20)  # 20-fold cross-validation
    grid.fit(X)
    h_cv = grid.best_params_['bandwidth']

    print('done', std_data, h_cv)
    return h_cv
Example #25
def calc_kdes(X_train, Y_train, X_valid, bw):

    prob_matrix = np.zeros((2, X_valid.shape[0]))
    
    for i in range(0, 2):
        X_train_class_i = X_train[Y_train == i, :]
        for j in range(0, FEATS):  # FEATS: number of features, defined elsewhere
            kde = KernelDensity(kernel='gaussian', bandwidth=bw)
            kde.fit(X_train_class_i[:, [j]])
            log_prob = kde.score_samples(X_valid[:, [j]])
            prob_matrix[i] = np.add(prob_matrix[i], log_prob)

    return prob_matrix
Example #26
    def fit(self, X, y):
        unique_vals = np.unique(y)
        unique_vals = np.sort(unique_vals)
        if len(unique_vals) == 1:
            kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
            kde.fit(X[y == unique_vals[0]])
            if unique_vals[0] == 0:
                self.kernels.append(kde)
                self.kernels.append(None)
            else:
                self.kernels.append(None)
                self.kernels.append(kde)

        else:
            assert (len(unique_vals) == 2)
            kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
            kde.fit(X[y == 0])
            self.kernels.append(kde)

            kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
            kde.fit(X[y == 1])
            self.kernels.append(kde)
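The two class-conditional densities naturally support a Bayes-style classifier. A hypothetical companion method, not from the original source, assuming equal class priors and that both entries of self.kernels are fitted (not None):

    def predict(self, X):
        # pick the class whose KDE assigns the higher log-density
        log_p0 = self.kernels[0].score_samples(X)
        log_p1 = self.kernels[1].score_samples(X)
        return (log_p1 > log_p0).astype(int)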
Example #27
def density_est_kde(ds1, ds2, metric='euclidean'):

    if metric == 'cosine_similarity':
        sep_intra = cosine_similarity(ds1, ds1)
        sep_inter = cosine_similarity(ds1, ds2)
    else:
        sep_intra = pairwise_distances(ds1, ds1, metric=metric)
        sep_inter = pairwise_distances(ds1, ds2, metric=metric)

    sep_intra = sep_intra.flatten()
    sep_inter = sep_inter.flatten()

    # --- intra

    xfit = np.linspace(0, 2, len(sep_intra))
    X = sep_intra[:, np.newaxis]
    Xfit = xfit[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.05)
    kde.fit(X)
    log_dens = kde.score_samples(Xfit)
    density = np.exp(log_dens)
    # density *= 1/np.sum(density)
    plt.plot(xfit, density, '#069af3', lw=2)

    print "--- fitted intra cluster separation"

    # --- inter

    xfit = np.linspace(0, 2, len(sep_inter))
    X = sep_inter[:, np.newaxis]
    Xfit = xfit[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.05)
    kde.fit(X)
    log_dens = kde.score_samples(Xfit)
    density = np.exp(log_dens)
    # density *= 1/np.sum(density)
    plt.plot(xfit, density, '#00B050', lw=2)

    plt.show()
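Both separations are evaluated on a fixed [0, 2] grid, which matches cosine-based separations; a hedged usage sketch with synthetic clusters:

ds1 = np.random.RandomState(0).randn(50, 8)
ds2 = np.random.RandomState(1).randn(50, 8) + 0.5
density_est_kde(ds1, ds2, metric='cosine_similarity')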
Example #28
def fitKDE(obs, bWidth=.25, kernel='gaussian', x=None):
    # Fit kernel to a series of obs, and derive the prob of obs
    # x is the array of values on which the fit KDE will be evaluated
    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)
    if x is None:
        x = np.unique(obs).reshape(-1, 1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())
    return pdf
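A usage sketch for fitKDE (numpy and pandas imports assumed), evaluating the empirical density of synthetic eigenvalues on an explicit grid:

e_vals = np.random.RandomState(0).chisquare(3, 500)
pdf = fitKDE(e_vals, bWidth=0.25, x=np.linspace(0.01, 12, 1000))
print(pdf.idxmax())  # location of the empirical density peak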
Example #29
def build_kde_model(prices):
    min_dcgr = -0.3
    max_dcgr = 0.3
    num_bins = 10000  # np.linspace requires an integer sample count
    bin_width = (max_dcgr - min_dcgr) / num_bins
    possible_cgr = np.linspace(min_dcgr, max_dcgr, num_bins)

    dcgr = contGrowthRate(prices)
    silverman_bw = 1.06 * np.std(dcgr[:, 0]) * len(dcgr[:, 0])**(-1 / 5)
    kde = KernelDensity(kernel='gaussian', bandwidth=silverman_bw).fit(dcgr)
    kernel_estimate = np.exp(kde.score_samples(possible_cgr[:, np.newaxis]))

    return kernel_estimate
Example #30
def parzen(dataset: int, feature: str) -> None:
    b = 0.1
    X = load_false(dataset)
    scaler = StandardScaler(copy=False)
    X_transformed = scaler.fit_transform(X)  # copy=False: X itself is scaled in place
    kdex = KernelDensity(kernel='gaussian', bandwidth=b)
    fx = X[feature].values.reshape((-1, 1))
    kdex.fit(fx)
    x_d = np.linspace(min(fx.flatten()) - .5, max(fx.flatten()) + .5, 1000)
    logprob = kdex.score_samples(x_d[:, None])

    Y = load_true(dataset)
    Y_transformed = scaler.transform(Y)  # likewise scaled in place (copy=False)
    kdey = KernelDensity(kernel='gaussian', bandwidth=b)
    fy = Y[feature].values.reshape((-1, 1))
    kdey.fit(fy)
    y_d = np.linspace(min(fy.flatten()) - .5, max(fy.flatten()) + .5, 1000)
    logproby = kdey.score_samples(y_d[:, None])

    plt.clf()
    plt.fill_between(x_d, np.exp(logprob), alpha=0.5)
    plt.plot(fx,
             np.full_like(fx, -0.01),
             '|k',
             label='Inliers',
             markeredgewidth=1)
    plt.fill_between(y_d, np.exp(logproby), alpha=0.5)
    plt.plot(fy,
             np.full_like(fy, -0.01),
             '.k',
             label='Outliers',
             markeredgewidth=1)

    d = DATASETS_FALSE[dataset].split("/")[2]
    #plt.title(feature)
    plt.legend(loc='best')
    plt.ylim(-0.02, 1.1)
    #plt.show()
    plt.savefig("results/plots_features/" + d + "/" + feature + '.svg')