Example 1
def test_additivity_with_weights(data, split_index):
    """
    Test the additive property of the KDE, with weights.
    """

    x = np.linspace(-10, 15)
    weights = np.arange(len(data)) + 1
    weights = weights / np.sum(weights)

    # Fit to all of the data
    y = NaiveKDE().fit(data, weights).evaluate(x)

    # Split up the data and the weights
    data = list(data)
    weights = list(weights)
    data_first_split = data[:split_index]
    data_second_split = data[split_index:]
    weights_first_split = weights[:split_index]
    weights_second_split = weights[split_index:]

    # Fit to splits, and compensate for smaller data using weights
    y_1 = NaiveKDE().fit(data_first_split, weights_first_split).evaluate(x) * sum(weights_first_split)

    y_2 = NaiveKDE().fit(data_second_split, weights_second_split).evaluate(x) * sum(weights_second_split)

    # Additive property of the functions
    assert np.allclose(y, y_1 + y_2)
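
# The check above can also be run as a stand-alone snippet. A minimal
# sketch, assuming KDEpy is installed; the data and weights below are
# made up for illustration. Since the KDE is a weight-scaled sum of
# kernels, splitting the weight mass splits the estimate.
import numpy as np
from KDEpy import NaiveKDE

data = np.array([1.0, 2.0, 5.0, 10.0])
weights = np.array([0.1, 0.2, 0.3, 0.4])  # already normalized to sum to 1
x = np.linspace(-10, 15)

y = NaiveKDE().fit(data, weights).evaluate(x)
y_1 = NaiveKDE().fit(data[:2], weights[:2]).evaluate(x) * weights[:2].sum()
y_2 = NaiveKDE().fit(data[2:], weights[2:]).evaluate(x) * weights[2:].sum()
assert np.allclose(y, y_1 + y_2)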
Example 2
def test_api_2D_data_which_is_1D(estimator):
    """
    Test that 2D data along a line is the same as 1D data.
    """

    np.random.seed(123)
    random_data = np.random.randn(50).reshape(-1, 1)
    zeros = np.zeros_like(random_data)
    data_2D = np.concatenate((random_data, zeros), axis=1)

    x2, y2 = estimator().fit(data_2D).evaluate((1024, 3))
    y2 = y2.reshape((1024, 3))
    x, y = estimator().fit(random_data).evaluate(1024)

    # Proportions
    prop = y2[:, 3 // 2].ravel() / y

    # At zero, epsilon is added and eps / eps = 1, remove these values
    prop = prop[~np.isclose(prop, 1)]

    # Every other value should be equal - i.e. they should be proportional.
    # To see why, consider the points (0, 0), (1, 0) and (2, 0). Depending
    # on the norm, the normalization will make the height smaller.
    assert np.all(np.isclose(prop, prop[0]))

    # Again the other way around too
    data_2D = np.concatenate((zeros, random_data), axis=1)
    x2, y2 = estimator().fit(data_2D).evaluate((3, 1024))
    y2 = y2.reshape((3, 1024))
    x, y = estimator().fit(random_data).evaluate(1024)
    prop = y2[3 // 2, :].ravel() / y
    prop = prop[~np.isclose(prop, 1)]
    assert np.all(np.isclose(prop, prop[0]))
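
# For reference, a minimal sketch of the grid-tuple evaluation used above,
# assuming the KDEpy convention that evaluate((nx, ny)) auto-generates an
# equidistant grid of nx * ny points and returns (grid, densities):
import numpy as np
from KDEpy import NaiveKDE

data_2D = np.random.randn(50, 2)
grid, density = NaiveKDE().fit(data_2D).evaluate((64, 32))
print(grid.shape)     # (2048, 2) - one row per grid point
print(density.shape)  # (2048,)   - one density value per grid point
density = density.reshape(64, 32)  # back to the (nx, ny) grid layout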
Example 3
def test_data_must_have_length():
    """
    Test that an error is raised when the data has no length.
    """

    input_data = np.array([])
    k = NaiveKDE(kernel='gaussian', bw=1)

    with pytest.raises(ValueError):
        k.fit(input_data)
Example 4
def test_weights():
    """
    Test that the default weights are set to uniform.
    """
    data = [1, 2, 5, 10]
    x1, y1 = NaiveKDE().fit(data).evaluate()

    weights = np.ones_like(data) / len(data)

    x2, y2 = NaiveKDE().fit(data, weights=weights).evaluate()

    assert np.allclose(y1, y2)
Example 5
def test_additivity(data, split_index):
    """
    Test the additive property of the KDE.
    """
    x = np.linspace(-10, 10)

    # Fit to all of the data
    y = NaiveKDE().fit(data).evaluate(x)

    # Fit to splits, and compensate for smaller data using weights
    weight_1 = split_index / len(data)
    y_1 = NaiveKDE().fit(data[:split_index]).evaluate(x) * weight_1

    weight_2 = (len(data) - split_index) / len(data)
    y_2 = NaiveKDE().fit(data[split_index:]).evaluate(x) * weight_2

    # Additive property of the functions
    assert np.allclose(y, y_1 + y_2)
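
# The identity checked above follows from the definition of the KDE as an
# average of kernels: with n = n_1 + n_2 observations,
#
#   f(x) = (1/n) * sum_{i=1}^{n} K_h(x - x_i)
#        = (n_1/n) * f_1(x) + (n_2/n) * f_2(x),
#
# where f_1 and f_2 are the KDEs fit to the first n_1 and last n_2 points.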
Example 6
def test_against_naive_KDE(data, bw):
    """
    Test the FFTKDE against a naive KDE without weights.
    """

    # Higher accuracy when num gets larger
    x = np.linspace(min(data) - bw, max(data) + bw, num=2**10)

    y1 = NaiveKDE("epa", bw=bw).fit(data, weights=None).evaluate(x)
    y2 = FFTKDE("epa", bw=bw).fit(data, weights=None).evaluate(x)

    assert np.allclose(y1, y2, atol=10e-5)
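
# A rough sketch of the accuracy remark above: the FFTKDE deviation from
# the exact (naive) estimate shrinks as the evaluation grid gets finer.
# Assumes KDEpy is installed; the data and bandwidth are made up.
import numpy as np
from KDEpy import FFTKDE, NaiveKDE

np.random.seed(42)
data = np.random.randn(100)
for num in (2**6, 2**8, 2**10):
    x = np.linspace(data.min() - 1, data.max() + 1, num=num)
    y1 = NaiveKDE("epa", bw=1).fit(data).evaluate(x)
    y2 = FFTKDE("epa", bw=1).fit(data).evaluate(x)
    print(num, np.max(np.abs(y1 - y2)))  # max deviation decreases with num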
Example 7
def test_against_R_density(kernel, bw, n, expected_result):
    """
    Test against the following function call in R:
        
        d <- density(c(0, 0.1, 1), kernel="{kernel}", bw={bw}, 
        n={n}, from=-1, to=1);
        d$y
    """
    data = np.array([0, 0.1, 1])
    x = np.linspace(-1, 1, num=n)
    y = NaiveKDE(kernel, bw=bw).fit(data).evaluate(x)
    assert np.allclose(y, expected_result, atol=10**(-2.7))
Example 8
def test_against_naive_KDE_w_weights(data, bw):
    """
    Test the FFTKDE against a naive KDE with weights.
    """

    # Higher accuracy when num gets larger
    x = np.linspace(min(data) - bw, max(data) + bw, num=2**10)
    weights = np.arange(len(data)) + 1

    y1 = NaiveKDE('epa', bw=bw).fit(data, weights=weights).evaluate(x)
    y2 = FFTKDE('epa', bw=bw).fit(data, weights=weights).evaluate(x)

    assert np.allclose(y1, y2, atol=10e-4)
Example 9
def test_grid_must_have_length():
    """
    Test that an error is raised when the grid has no length.
    """

    input_data = np.array([3, 4])
    k = NaiveKDE(kernel="gaussian", bw=1)
    k.fit(input_data)

    with pytest.raises(ValueError):
        k.evaluate(np.array([]))
Example 10
def test_against_scipy_density(bw, n, expected_result):
    """
    Test against the following function call in SciPy:

        data = np.array([0, 0.1, 1])
        x = np.linspace(-1, 1, {n})
        bw = {bw}/np.asarray(data).std(ddof=1)
        density_estimate = gaussian_kde(dataset = data, bw_method = bw)
        y = density_estimate.evaluate(x)

    Note that scipy weights its bandwidth by the covariance of the input
    data. To make the results comparable to the other methods, we divide
    the bandwidth by the sample standard deviation here.
    """
    data = np.array([0, 0.1, 1])
    x = np.linspace(-1, 1, num=n)
    y = NaiveKDE(kernel="gaussian", bw=bw).fit(data).evaluate(x)
    assert np.allclose(y, expected_result)
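
# The docstring's scipy call, side by side with KDEpy, as a stand-alone
# sketch. scipy's gaussian_kde scales bw_method by the sample standard
# deviation (its covariance factor), so dividing by std(ddof=1) makes
# the two bandwidth conventions comparable. bw = 0.5 is made up here.
import numpy as np
from scipy.stats import gaussian_kde
from KDEpy import NaiveKDE

data = np.array([0, 0.1, 1])
x = np.linspace(-1, 1, num=16)
bw = 0.5

y_scipy = gaussian_kde(data, bw_method=bw / data.std(ddof=1)).evaluate(x)
y_kdepy = NaiveKDE(kernel="gaussian", bw=bw).fit(data).evaluate(x)
assert np.allclose(y_scipy, y_kdepy)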
Example 11
def test_constant_values_silverman():
    """
    Test that a data set with constant values does not fail when using Silverman's rule.
    Data sets with "almost" constant values should also get a bandwidth assigned
    automatically, although Silverman's rule technically does not handle this case.

    https://github.com/tommyod/KDEpy/issues/9
    """

    data = np.ones(100, dtype=float)
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
    assert np.isclose(kde.bw, 1.0)

    data = np.ones(1000, dtype=float)
    data[0] = 0.0
    data[999] = 2.0
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
Example 12
def test_common_API_patterns():
    """
    Test common API patterns.
    """
    # Simplest way, with auto grid
    data = [1, 2, 5, 10]
    x, y = NaiveKDE().fit(data).evaluate()

    # Using a pre-defined grid
    x = np.linspace(-10, 50)
    y1 = NaiveKDE().fit(data).evaluate(x)

    # No chaining
    k = NaiveKDE()
    k.fit(data)
    y2 = k.evaluate(x)

    assert np.allclose(y1, y2)
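
# A small extension of the patterns above: the kernel and bandwidth can
# be chosen at construction time. Both options below appear elsewhere in
# this suite ('epa' kernel, 'silverman' bandwidth rule).
import numpy as np
from KDEpy import NaiveKDE

data = [1, 2, 5, 10]
x, y = NaiveKDE(kernel='epa', bw='silverman').fit(data).evaluate()
assert np.all(y >= 0)
assert np.isclose(np.sum(y) * (x[1] - x[0]), 1, atol=1e-2)  # integrates to ~1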
Example 13
def test_1d_data_inputs(bw, kernel):
    """
    Test that data can be passed as lists, tuples or NumPy arrays.
    """
    input_data = [1, 2, 5, 10]

    k = NaiveKDE(kernel=kernel, bw=bw)
    # Arrays
    k.fit(np.array(input_data))
    x_1, y_1 = k.evaluate()

    # Lists
    k.fit(list(input_data))
    x_2, y_2 = k.evaluate()

    # Tuples
    k.fit(tuple(input_data))
    x_3, y_3 = k.evaluate()

    # Arrays of shape (obs, dims)
    k.fit(np.array(input_data).reshape(-1, 1))
    x_4, y_4 = k.evaluate()

    assert np.allclose(y_1, y_2)
    assert np.allclose(y_2, y_3)
    assert np.allclose(y_3, y_4)
Example 14
# Imports needed by this function; var_coef (the coefficient of
# variation) is assumed to be a helper defined elsewhere in the module.
import time

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema
from sklearn.neighbors import KernelDensity

from KDEpy import NaiveKDE


def price_clustering(dataframe,
                     min_items=2,
                     min_var_coef=0.3,
                     column='product_id',
                     fluctuation=0.20,
                     column_name='price_min'):
    data = dataframe.copy()
    s = time.time()
    new_cat_counter = 0
    unique_categories = list(set(data[column]))
    segmentation_policy = {}
    for i in unique_categories:
        x = np.sort(data[data[column] == i][column_name].values)
        var_coef_x = var_coef(x)
        iqr = float(pd.Series(x).quantile(0.75) - pd.Series(x).quantile(0.25))
        # Silverman's rule-of-thumb bandwidth (the rule suggested by Stata):
        # h = 0.9 * min(std, IQR / 1.349) * n^(-1/5)
        m = min(np.var(x) ** (1 / 2), iqr / 1.349)
        h = 0.9 * m / (len(x) ** (1 / 5))
        if h > 0 and len(x) > 2:
            if var_coef_x >= min_var_coef:
                # First pass: fixed-bandwidth KDE to find candidate splits
                x_d = np.linspace(min(x), max(x), len(x))
                kde = KernelDensity(bandwidth=h, kernel='gaussian')
                kde.fit(x[:, None])
                e = np.exp(kde.score_samples(x_d.reshape(-1, 1)))
                mi = argrelextrema(e, np.less)[0]
                x_split = np.split(np.sort(x), mi)
                # Second pass: per-point bandwidths proportional to the
                # median of each candidate cluster
                x_band = []
                for split in x_split:
                    x_band.extend(np.repeat(np.median(split) * fluctuation, len(split)))
                x_band = np.array(x_band)
                estimator = NaiveKDE(kernel='gaussian', bw=x_band).fit(np.array(x))
                y = estimator.evaluate(x_d)
                mi = argrelextrema(y, np.less)[0]
                x_split = np.split(np.sort(x), mi)
                segmentation_policy[str(i)] = x_split
                mask = data[column] == i
                unchanged_cat_str = str(int(i))
                for j, split in enumerate(x_split):
                    indexes = data[mask].index[data[mask][column_name].isin(split)]
                    data.loc[indexes, column + '_by_price'] = unchanged_cat_str + '_' + str(j)
                    new_cat_counter += 1
            else:
                unchanged_cat_str = str(int(i))
                indexes = data[data[column] == i].index
                data.loc[indexes, column + '_by_price'] = unchanged_cat_str + '_0'
                new_cat_counter += 1

        else:
            unchanged_cat_str = str(int(i))
            subset = data[data[column] == i]
            if var_coef_x <= min_var_coef:
                data.loc[subset.index, column + '_by_price'] = unchanged_cat_str + '_0'
                new_cat_counter += 1
            else:
                # Too few points for a KDE: label only the extreme prices
                index_min = subset.index[subset[column_name] == subset[column_name].min()]
                index_max = subset.index[subset[column_name] == subset[column_name].max()]
                data.loc[index_min, column + '_by_price'] = unchanged_cat_str + '_0'
                data.loc[index_max, column + '_by_price'] = unchanged_cat_str + '_1'

    print('ran in ' + str(time.time() - s) + 's')
    print(str(new_cat_counter) + ' new categories found')
    return [data, segmentation_policy]
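
# A minimal, self-contained sketch of the splitting idea used above:
# estimate a 1D density of the prices, then cut the sorted values at the
# local minima of the density curve. The data and the fixed bandwidth are
# illustrative only; price_clustering derives its bandwidth from
# Silverman's rule, h = 0.9 * min(std, IQR / 1.349) * n^(-1/5).
import numpy as np
from scipy.signal import argrelextrema
from sklearn.neighbors import KernelDensity

prices = np.sort(np.array([1.0, 1.1, 1.2, 5.0, 5.1, 9.8, 10.0]))
grid = np.linspace(prices.min(), prices.max(), num=200)

kde = KernelDensity(bandwidth=0.5, kernel='gaussian').fit(prices[:, None])
density = np.exp(kde.score_samples(grid[:, None]))

# Local minima of the density mark the boundaries between clusters
cut_points = grid[argrelextrema(density, np.less)[0]]
clusters = np.split(prices, np.searchsorted(prices, cut_points))
print(clusters)  # three groups: around 1.1, around 5.05, around 9.9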
Example 15
    import matplotlib.pyplot as plt
    import numpy as np
    from KDEpy.NaiveKDE import NaiveKDE
    from KDEpy.TreeKDE import TreeKDE

    # Comparing tree and naive
    # -----------------------------------------
    data = [3, 3.5, 4, 6, 8]
    kernel = 'triweight'
    bw = [3, 0.3, 1, 0.3, 2]
    weights = [1, 1, 1, 1, 1]

    plt.figure(figsize=(10, 4))
    plt.suptitle('Basic example of the naive KDE')

    plt.subplot(1, 2, 1)
    kde = NaiveKDE(kernel=kernel, bw=bw)
    kde.fit(data, weights)
    x = np.linspace(0, 10, num=1024)
    for d, b in zip(data, bw):
        k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
        plt.plot(x, k, color='k', ls='--')

    y = kde.evaluate(x)
    plt.plot(x, y)
    plt.scatter(data, np.zeros_like(data))

    plt.subplot(1, 2, 2)
    kde = TreeKDE(kernel=kernel, bw=bw)
    kde.fit(data, weights)
    x = np.linspace(0, 10, num=1024)
    for d, b in zip(data, bw):