Beispiel #1
0
def test_1d_data_inputs(bw, kernel):
    """
    Check that 1D data passed as a NumPy array, a list, a tuple or an array
    of shape (obs, dims) all give the same density estimate.
    """
    values = [1, 2, 5, 10]
    estimator = NaiveKDE(kernel=kernel, bw=bw)

    # The same observations, wrapped in every container type under test
    containers = (
        np.array(values),  # Arrays
        list(values),  # Lists
        tuple(values),  # Tuples
        np.array(values).reshape(-1, 1),  # Arrays of shape (obs, dims)
    )

    # Refit the same estimator on each container and keep the densities
    densities = []
    for container in containers:
        estimator.fit(container)
        _, density = estimator.evaluate()
        densities.append(density)

    # Neighbouring results must agree, which chains all four together
    for left, right in zip(densities, densities[1:]):
        assert np.allclose(left, right)
Beispiel #2
0
def test_grid_must_have_length():
    """
    Test that an error is raised when the grid has no length.
    """

    input_data = np.array([3, 4])
    k = NaiveKDE(kernel='gaussian', bw=1)

    # Fit outside the `raises` block: only the evaluation on an empty grid is
    # expected to fail. If fitting were inside the block, a ValueError raised
    # by `fit` would make the test pass without checking the grid at all.
    k.fit(input_data)

    with pytest.raises(ValueError):
        k.evaluate(np.array([]))
Beispiel #3
0
def test_constant_values_silverman():
    """
    Silverman's rule must not fail on a data set made of constant values.
    "Almost" constant data should also get a bw assigned automatically,
    although silverman's rule technically does not do this.

    https://github.com/tommyod/KDEpy/issues/9
    """

    # Case 1: every observation is identical. Evaluating warns and the
    # bandwidth ends up close to 1.0.
    constant = np.ones(100, dtype=float)
    estimator = NaiveKDE(bw="silverman").fit(constant)
    with pytest.warns(UserWarning):
        estimator.evaluate()
    assert np.isclose(estimator.bw, 1.0)

    # Case 2: constant apart from the first and the last observation.
    # Evaluating must still warn.
    almost_constant = np.ones(1000, dtype=float)
    almost_constant[0] = 0.0
    almost_constant[-1] = 2.0
    estimator = NaiveKDE(bw="silverman").fit(almost_constant)
    with pytest.warns(UserWarning):
        estimator.evaluate()
Beispiel #4
0
def test_common_API_patterns():
    """
    Exercise the common ways of calling the estimator: automatic grid,
    user-supplied grid with method chaining, and the same without chaining.
    """
    samples = [1, 2, 5, 10]

    # Automatic grid: evaluate() without arguments returns a (grid, density)
    # pair, which is unpacked here to check that shape of result
    auto_grid, auto_density = NaiveKDE().fit(samples).evaluate()

    # User-supplied grid, fit and evaluate chained in one expression
    grid = np.linspace(-10, 50)
    chained = NaiveKDE().fit(samples).evaluate(grid)

    # Same grid, one call per statement
    estimator = NaiveKDE()
    estimator.fit(samples)
    unchained = estimator.evaluate(grid)

    assert np.allclose(chained, unchained)
def _rule_of_thumb_bandwidth(x):
    """
    Return the rule-of-thumb kernel bandwidth ``0.9 * m / n**(1/5)``.

    ``m`` is the smaller of the standard deviation of ``x`` and its
    interquartile range divided by 1.349; ``n`` is ``len(x)``. The original
    code labelled this as the bandwidth suggested by Stata.
    """
    q75, q25 = np.percentile(x, [75, 25])
    # The IQR is normalised by 1.349 exactly once (it used to be divided
    # twice, which shrank the IQR term by an extra factor of 1.349).
    spread = min(np.var(x) ** 0.5, (q75 - q25) / 1.349)
    # Float exponents so the result does not depend on integer division.
    return 0.9 * spread / len(x) ** 0.2


def price_clustering(dataframe,
                     min_items=2,
                     min_var_coef=0.3,
                     column='product_id',
                     fluctuation=0.20,
                     column_name='price_min'):
    """
    Split every category of ``dataframe`` into price segments.

    For each distinct value of ``column`` the prices in ``column_name`` are
    sorted and, when there are enough of them and they vary enough, cut at
    the local minima of a kernel density estimate. Each row is labelled
    ``"<int(category)>_<segment number>"`` in a new column named
    ``column + '_by_price'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is copied, never modified in place.
    min_items : int
        Currently unused; kept for backward compatibility.
    min_var_coef : float
        Threshold on ``var_coef(prices)`` above which a category is split.
    column : str
        Name of the category column. Its values must be convertible with
        ``int()``.
    fluctuation : float
        Fraction of a segment's median price used as the per-observation
        bandwidth of the second density estimate.
    column_name : str
        Name of the price column.

    Returns
    -------
    list
        ``[labelled_copy, segmentation_policy]`` where
        ``segmentation_policy`` maps ``str(category)`` to the list of price
        arrays of its segments (only for categories that were split by KDE).
    """
    data = dataframe.copy()
    target = column + '_by_price'
    s = time.time()
    new_cat_counter = 0
    segmentation_policy = {}

    for i in set(data[column]):
        label = str(int(i))
        # Prices of this category, still carrying the row index of `data`
        prices = data.loc[data[column] == i, column_name]
        x = np.sort(prices.values)
        var_coef_x = var_coef(x)
        h = _rule_of_thumb_bandwidth(x)

        if h > 0 and len(x) > 2:
            if var_coef_x >= min_var_coef:
                # One grid point per observation, spanning the price range.
                # NOTE(review): the minima found on this grid are reused
                # below as split positions in the sorted prices; confirm
                # that grid indices are meant to stand in for sample indices.
                x_d = np.linspace(x.min(), x.max(), len(x))

                # First pass: fixed-bandwidth KDE to get rough segments
                kde = KernelDensity(bandwidth=h, kernel='gaussian')
                kde.fit(x[:, None])
                e = np.exp(kde.score_samples(x_d.reshape(-1, 1)))
                rough_split = np.split(x, argrelextrema(e, np.less)[0])

                # Second pass: each observation gets a bandwidth equal to
                # `fluctuation` times the median price of its rough segment
                x_band = np.concatenate([
                    np.full(len(part), np.median(part) * fluctuation)
                    for part in rough_split
                ])
                estimator = NaiveKDE(kernel='gaussian', bw=x_band).fit(x)
                y = estimator.evaluate(x_d)
                x_split = np.split(x, argrelextrema(y, np.less)[0])
                segmentation_policy[str(i)] = x_split

                for j, segment in enumerate(x_split):
                    indexes = prices.index[prices.isin(segment).values]
                    data.loc[indexes, target] = label + '_' + str(j)
                    new_cat_counter += 1
            else:
                # Too little relative variation: keep a single segment
                data.loc[prices.index, target] = label + '_0'
                new_cat_counter += 1
        elif var_coef_x <= min_var_coef:
            # Too few prices (or zero bandwidth) and little variation
            data.loc[prices.index, target] = label + '_0'
            new_cat_counter += 1
        else:
            # Too few prices for a KDE but they differ: the cheapest rows
            # form segment 0 and the most expensive rows segment 1.
            # NOTE(review): rows strictly between min and max stay
            # unlabelled and the counter is not incremented here, exactly
            # as in the original code; confirm this is intended.
            index_min = prices.index[(prices == prices.min()).values]
            index_max = prices.index[(prices == prices.max()).values]
            data.loc[index_min, target] = label + '_0'
            data.loc[index_max, target] = label + '_1'

    print('ran in ' + str(time.time() - s) + 's')
    print(str(new_cat_counter) + ' new categories found')
    return [data, segmentation_policy]
Beispiel #6
0
    # Kernel name, bandwidths and weights shared by both subplots.
    # NOTE(review): `data` is defined outside this snippet; `bw` is zipped
    # with it below, so presumably it holds five observations - confirm.
    kernel = 'triweight'
    bw = [3, 0.3, 1, 0.3, 2]
    weights = [1, 1, 1, 1, 1]

    plt.figure(figsize=(10, 4))
    plt.title('Basic example of the naive KDE')

    # Left panel: NaiveKDE fitted on the weighted data
    plt.subplot(1, 2, 1)
    kde = NaiveKDE(kernel=kernel, bw=bw)
    kde.fit(data, weights)
    x = np.linspace(0, 10, num=1024)
    # Dashed curves: a single-point KDE for each (observation, bandwidth)
    # pair, scaled by 1 / len(data)
    for d, b in zip(data, bw):
        k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
        plt.plot(x, k, color='k', ls='--')

    # Solid curve: the full estimate, with the observations marked at y = 0
    y = kde.evaluate(x)
    plt.plot(x, y)
    plt.scatter(data, np.zeros_like(data))

    # Right panel: same plot, but the full estimate comes from TreeKDE
    plt.subplot(1, 2, 2)
    kde = TreeKDE(kernel=kernel, bw=bw)
    kde.fit(data, weights)
    x = np.linspace(0, 10, num=1024)
    # The dashed per-observation curves are still computed with NaiveKDE
    for d, b in zip(data, bw):
        k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
        plt.plot(x, k, color='k', ls='--')

    y = kde.evaluate(x)
    plt.plot(x, y)
    plt.scatter(data, np.zeros_like(data))
    plt.show()