def test_additivity_with_weights(data, split_index):
    """
    Test the additive property of the KDE, with weights.
    """
    x = np.linspace(-10, 15)
    weights = np.arange(len(data)) + 1
    weights = weights / np.sum(weights)

    # Fit to all data
    y = NaiveKDE().fit(data, weights).evaluate(x)

    # Split up the data and the weights
    data = list(data)
    weights = list(weights)
    data_first_split = data[:split_index]
    data_second_split = data[split_index:]
    weights_first_split = weights[:split_index]
    weights_second_split = weights[split_index:]

    # Fit to splits, and compensate for smaller data using weights
    y_1 = (NaiveKDE().fit(data_first_split, weights_first_split).evaluate(x)
           * sum(weights_first_split))
    y_2 = (NaiveKDE().fit(data_second_split, weights_second_split).evaluate(x)
           * sum(weights_second_split))

    # Additive property of the functions
    assert np.allclose(y, y_1 + y_2)
def test_api_2D_data_which_is_1D(estimator):
    """
    Test that 2D data lying along a line is proportional to the
    corresponding 1D data.
    """
    np.random.seed(123)
    random_data = np.random.randn(50).reshape(-1, 1)
    zeros = np.zeros_like(random_data)
    data_2D = np.concatenate((random_data, zeros), axis=1)
    x2, y2 = NaiveKDE().fit(data_2D).evaluate((1024, 3))
    y2 = y2.reshape((1024, 3))
    x, y = NaiveKDE().fit(random_data).evaluate(1024)

    # Proportions
    prop = y2[:, 3 // 2].ravel() / y

    # At zero, epsilon is added and eps / eps = 1, so remove these values
    prop = prop[~np.isclose(prop, 1)]

    # Every value of the ratio should be equal, i.e. the densities should
    # be proportional. To see why they are only proportional (not equal),
    # consider the points (0, 0), (1, 0) and (2, 0): depending on the norm,
    # the 2D normalization will make the height smaller.
    assert np.all(np.isclose(prop, prop[0]))

    # Again, the other way around too
    data_2D = np.concatenate((zeros, random_data), axis=1)
    x2, y2 = NaiveKDE().fit(data_2D).evaluate((3, 1024))
    y2 = y2.reshape((3, 1024))
    x, y = NaiveKDE().fit(random_data).evaluate(1024)
    prop = y2[3 // 2, :].ravel() / y
    prop = prop[~np.isclose(prop, 1)]
    assert np.all(np.isclose(prop, prop[0]))
def test_data_must_have_length():
    """
    Test that an error is raised when the data has no length.
    """
    input_data = np.array([])
    k = NaiveKDE(kernel='gaussian', bw=1)
    with pytest.raises(ValueError):
        k.fit(input_data)
def test_weights():
    """
    Test that the default weights are uniform.
    """
    data = [1, 2, 5, 10]
    x1, y1 = NaiveKDE().fit(data).evaluate()
    weights = np.ones_like(data) / len(data)
    x2, y2 = NaiveKDE().fit(data, weights=weights).evaluate()
    assert np.allclose(y1, y2)
def test_additivity(data, split_index):
    """
    Test the additive property of the KDE.
    """
    x = np.linspace(-10, 10)

    # Fit to all data
    y = NaiveKDE().fit(data).evaluate(x)

    # Fit to splits, and compensate for smaller data using weights
    weight_1 = split_index / len(data)
    y_1 = NaiveKDE().fit(data[:split_index]).evaluate(x) * weight_1
    weight_2 = (len(data) - split_index) / len(data)
    y_2 = NaiveKDE().fit(data[split_index:]).evaluate(x) * weight_2

    # Additive property of the functions
    assert np.allclose(y, y_1 + y_2)
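# The additive (mixture) property verified by the two tests above, in a
# standalone sketch: with a fixed bandwidth, a KDE over all the data equals
# the sum of KDEs over disjoint splits, each scaled by its share of the
# observations. The data, grid and bw=1 below are illustrative choices of
# ours; the identity requires the same bandwidth in every fit.
import numpy as np
from KDEpy import NaiveKDE

data = np.array([1.0, 2.0, 4.0, 7.0])
x = np.linspace(-5, 15, num=64)
full = NaiveKDE(bw=1).fit(data).evaluate(x)
left = NaiveKDE(bw=1).fit(data[:2]).evaluate(x) * (2 / len(data))
right = NaiveKDE(bw=1).fit(data[2:]).evaluate(x) * (2 / len(data))
assert np.allclose(full, left + right)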
def test_against_naive_KDE(data, bw):
    """
    Test the FFTKDE against the naive KDE, without weights.
    """
    # Higher accuracy when num gets larger
    x = np.linspace(min(data) - bw, max(data) + bw, num=2**10)

    y1 = NaiveKDE("epa", bw=bw).fit(data, weights=None).evaluate(x)
    y2 = FFTKDE("epa", bw=bw).fit(data, weights=None).evaluate(x)
    assert np.allclose(y1, y2, atol=1e-4)
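# Why the grid above is padded by one bandwidth on each side: FFTKDE bins
# the data onto an equidistant grid before convolving, and (unlike
# NaiveKDE) it requires the evaluation grid to contain every data point.
# A minimal sketch of the pattern, with illustrative values of ours:
import numpy as np
from KDEpy import FFTKDE

data = [1.0, 2.0, 5.0]
grid = np.linspace(min(data) - 1, max(data) + 1, num=2**6)  # equidistant
density = FFTKDE(kernel="epa", bw=1).fit(data).evaluate(grid)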
def test_against_R_density(kernel, bw, n, expected_result):
    """
    Test against the following function call in R:

        d <- density(c(0, 0.1, 1), kernel="{kernel}", bw={bw},
                     n={n}, from=-1, to=1)
        d$y
    """
    data = np.array([0, 0.1, 1])
    x = np.linspace(-1, 1, num=n)
    y = NaiveKDE(kernel, bw=bw).fit(data).evaluate(x)
    assert np.allclose(y, expected_result, atol=10**(-2.7))
def test_against_naive_KDE_w_weights(data, bw):
    """
    Test the FFTKDE against the naive KDE, with weights.
    """
    # Higher accuracy when num gets larger
    x = np.linspace(min(data) - bw, max(data) + bw, num=2**10)

    weights = np.arange(len(data)) + 1
    y1 = NaiveKDE('epa', bw=bw).fit(data, weights=weights).evaluate(x)
    y2 = FFTKDE('epa', bw=bw).fit(data, weights=weights).evaluate(x)
    assert np.allclose(y1, y2, atol=1e-3)
def test_grid_must_have_length():
    """
    Test that an error is raised when the grid has no length.
    """
    input_data = np.array([3, 4])
    k = NaiveKDE(kernel="gaussian", bw=1)
    k.fit(input_data)
    with pytest.raises(ValueError):
        k.evaluate(np.array([]))
def test_against_scipy_density(bw, n, expected_result):
    """
    Test against the following function call in SciPy:

        data = np.array([0, 0.1, 1])
        x = np.linspace(-1, 1, {n})
        bw = {bw} / np.asarray(data).std(ddof=1)
        density_estimate = gaussian_kde(dataset=data, bw_method=bw)
        y = density_estimate.evaluate(x)

    Note that SciPy scales its bandwidth by the covariance of the
    input data. To make the results comparable to the other methods,
    we divide the bandwidth by the sample standard deviation here.
    """
    data = np.array([0, 0.1, 1])
    x = np.linspace(-1, 1, num=n)
    y = NaiveKDE(kernel="gaussian", bw=bw).fit(data).evaluate(x)
    assert np.allclose(y, expected_result)
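# A standalone sketch of the SciPy equivalence described in the docstring
# above: gaussian_kde multiplies its bw_method factor by the sample standard
# deviation, so dividing our bandwidth by std(ddof=1) makes the two match.
# The data, grid and bw=1 below are illustrative choices of ours.
import numpy as np
from scipy.stats import gaussian_kde
from KDEpy import NaiveKDE

data = np.array([0, 0.1, 1])
x = np.linspace(-1, 1, num=16)
bw = 1.0
scipy_y = gaussian_kde(dataset=data, bw_method=bw / data.std(ddof=1)).evaluate(x)
kdepy_y = NaiveKDE(kernel="gaussian", bw=bw).fit(data).evaluate(x)
assert np.allclose(scipy_y, kdepy_y)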
def test_constant_values_silverman():
    """
    Test that a data set with constant values does not fail when using
    Silverman's rule. Data sets with "almost" constant values should also
    get a bandwidth assigned automatically, even though Silverman's rule
    technically does not handle this case.

    https://github.com/tommyod/KDEpy/issues/9
    """
    data = np.ones(100, dtype=float)
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
    assert np.isclose(kde.bw, 1.0)

    data = np.ones(1000, dtype=float)
    data[0] = 0.0
    data[999] = 2.0
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
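# For reference, the textbook rule of thumb behind the test above, as a
# sketch (this is the standard formula, not KDEpy's exact implementation):
#     h = 0.9 * min(sigma, IQR / 1.349) * n^(-1/5)
# On constant data both sigma and the IQR are zero, so the formula gives
# h = 0, which is why KDEpy warns and substitutes a usable bandwidth.
import numpy as np

def silverman_bandwidth(data):
    data = np.asarray(data, dtype=float)
    sigma = data.std(ddof=1)
    iqr = np.percentile(data, 75) - np.percentile(data, 25)
    return 0.9 * min(sigma, iqr / 1.349) * len(data) ** (-1 / 5)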
def test_common_API_patterns():
    """
    Test common API patterns.
    """
    # Simplest way, with an automatic grid
    data = [1, 2, 5, 10]
    x, y = NaiveKDE().fit(data).evaluate()

    # Using a pre-defined grid
    x = np.linspace(-10, 50)
    y1 = NaiveKDE().fit(data).evaluate(x)

    # No chaining
    k = NaiveKDE()
    k.fit(data)
    y2 = k.evaluate(x)

    assert np.allclose(y1, y2)
def test_1d_data_inputs(bw, kernel):
    """
    Test that passing data as lists, tuples and NumPy arrays all work.
    """
    input_data = [1, 2, 5, 10]
    k = NaiveKDE(kernel=kernel, bw=bw)

    # Arrays
    k.fit(np.array(input_data))
    x_1, y_1 = k.evaluate()

    # Lists
    k.fit(list(input_data))
    x_2, y_2 = k.evaluate()

    # Tuples
    k.fit(tuple(input_data))
    x_3, y_3 = k.evaluate()

    # Arrays of shape (obs, dims)
    k.fit(np.array(input_data).reshape(-1, 1))
    x_4, y_4 = k.evaluate()

    assert np.allclose(y_1, y_2)
    assert np.allclose(y_2, y_3)
    assert np.allclose(y_3, y_4)
import time
import numpy as np
from scipy.signal import argrelextrema
from sklearn.neighbors import KernelDensity
from KDEpy import NaiveKDE


def price_clustering(dataframe, min_items=2, min_var_coef=0.3,
                     column='product_id', fluctuation=0.20,
                     column_name='price_min'):
    """
    Split each category in `column` into price-based sub-categories by
    locating local minima of a KDE fitted to the prices in `column_name`.
    Returns the augmented dataframe and the per-category segmentation policy.
    """
    data = dataframe.copy()
    s = time.time()
    new_cat_counter = 0
    unique_categories = list(set(data[column]))
    segmentation_policy = {}

    for i in unique_categories:
        mask = data[column] == i
        x = np.sort(data[mask][column_name].values)
        var_coef_x = var_coef(x)  # coefficient of variation of the prices

        # Bandwidth rule of thumb suggested by Stata (Silverman-type rule):
        # h = 0.9 * min(std, IQR / 1.349) * n^(-1/5)
        iqr = np.percentile(x, 75) - np.percentile(x, 25)
        m = min(np.var(x) ** (1 / 2), iqr / 1.349)
        h = 0.9 * m / (len(x) ** (1 / 5))

        if h > 0 and len(x) > min_items:
            if var_coef_x >= min_var_coef:
                # First pass: fixed-bandwidth KDE; its local minima split
                # the prices into preliminary clusters
                x_d = np.linspace(min(x), max(x), len(x))
                kde = KernelDensity(bandwidth=h, kernel='gaussian')
                kde.fit(x[:, None])
                e = np.exp(kde.score_samples(x_d.reshape(-1, 1)))
                mi = argrelextrema(e, np.less)[0]
                x_split = np.split(np.sort(x), mi)

                # Second pass: variable-bandwidth KDE, where each point's
                # bandwidth is a fraction of its cluster's median price
                x_band = []
                for cluster in x_split:
                    x_band.extend(np.repeat(np.median(cluster) * fluctuation,
                                            len(cluster)))
                estimator = NaiveKDE(kernel='gaussian',
                                     bw=np.array(x_band)).fit(np.array(x))
                y = estimator.evaluate(x_d)
                mi = argrelextrema(y, np.less)[0]
                x_split = np.split(np.sort(x), mi)
                segmentation_policy[str(i)] = x_split

                # Assign each cluster its own sub-category label
                for j in range(len(x_split)):
                    indexes = data[mask].index[
                        data[mask][column_name].isin(x_split[j])]
                    new_category = str(int(i)) + '_' + str(j)
                    data.loc[indexes, column + '_by_price'] = new_category
                    new_cat_counter += 1
            else:
                # Low price variation: keep the category as a single cluster
                new_category = str(int(i)) + '_0'
                data.loc[data[mask].index, column + '_by_price'] = new_category
                new_cat_counter += 1
        else:
            if var_coef_x <= min_var_coef:
                new_category = str(int(i)) + '_0'
                data.loc[data[mask].index, column + '_by_price'] = new_category
                new_cat_counter += 1
            else:
                # Too few items to run a KDE, but high price variation:
                # split into a low-price and a high-price sub-category
                prices = data[mask][column_name]
                index_min = data[mask].index[prices == prices.min()]
                index_max = data[mask].index[prices == prices.max()]
                data.loc[index_min, column + '_by_price'] = str(int(i)) + '_0'
                data.loc[index_max, column + '_by_price'] = str(int(i)) + '_1'

    print('ran in ' + str(time.time() - s) + 's')
    print(str(new_cat_counter) + ' new categories found')
    return [data, segmentation_policy]
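# A hypothetical usage sketch for price_clustering. The toy dataframe and
# the var_coef helper below are our assumptions (var_coef is referenced but
# never defined above; we take it to be the coefficient of variation):
import numpy as np
import pandas as pd

def var_coef(x):
    # Coefficient of variation: std relative to the mean
    return np.std(x) / np.mean(x)

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'product_id': np.repeat([1, 2], 40),
    'price_min': np.concatenate([
        rng.normal(10, 1, 20), rng.normal(50, 2, 20),  # bimodal product 1
        rng.normal(30, 0.5, 40),                       # unimodal product 2
    ]),
})
clustered, policy = price_clustering(df)
print(clustered['product_id_by_price'].value_counts())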
import numpy as np
import matplotlib.pyplot as plt
from KDEpy.NaiveKDE import NaiveKDE
from KDEpy.TreeKDE import TreeKDE

# Comparing tree and naive
# -----------------------------------------
data = [3, 3.5, 4, 6, 8]
kernel = 'triweight'
bw = [3, 0.3, 1, 0.3, 2]
weights = [1, 1, 1, 1, 1]

plt.figure(figsize=(10, 4))
plt.title('Basic example of the naive KDE')

plt.subplot(1, 2, 1)
kde = NaiveKDE(kernel=kernel, bw=bw)
kde.fit(data, weights)

x = np.linspace(0, 10, num=1024)
# Plot each individual kernel, scaled down by the number of data points
for d, b in zip(data, bw):
    k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
    plt.plot(x, k, color='k', ls='--')

y = kde.evaluate(x)
plt.plot(x, y)
plt.scatter(data, np.zeros_like(data))

plt.subplot(1, 2, 2)
kde = TreeKDE(kernel=kernel, bw=bw)
kde.fit(data, weights)

x = np.linspace(0, 10, num=1024)
for d, b in zip(data, bw):
    k = TreeKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
    plt.plot(x, k, color='k', ls='--')