import numpy as np
import pytest
from KDEpy import NaiveKDE


def test_1d_data_inputs(bw, kernel):
    """
    Test that passing data as lists, tuples and NumPy arrays all give the
    same result.
    """
    input_data = [1, 2, 5, 10]
    k = NaiveKDE(kernel=kernel, bw=bw)

    # Arrays
    k.fit(np.array(input_data))
    x_1, y_1 = k.evaluate()

    # Lists
    k.fit(list(input_data))
    x_2, y_2 = k.evaluate()

    # Tuples
    k.fit(tuple(input_data))
    x_3, y_3 = k.evaluate()

    # Arrays of shape (obs, dims)
    k.fit(np.array(input_data).reshape(-1, 1))
    x_4, y_4 = k.evaluate()

    assert np.allclose(y_1, y_2)
    assert np.allclose(y_2, y_3)
    assert np.allclose(y_3, y_4)
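# The test above takes `bw` and `kernel` as arguments, so it is presumably
# driven by pytest parametrization. A hypothetical decorator stack (the
# specific kernel names and bandwidths below are assumptions, not from the
# source):
#
#     @pytest.mark.parametrize("bw", [0.1, 1, 10, "silverman"])
#     @pytest.mark.parametrize("kernel", ["gaussian", "box", "triweight"])
#     def test_1d_data_inputs(bw, kernel):
#         ...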
def test_grid_must_have_length():
    """
    Test that an error is raised when the evaluation grid has no length.
    """
    input_data = np.array([3, 4])
    k = NaiveKDE(kernel='gaussian', bw=1)
    # Fit outside the `raises` block so that only `evaluate` is expected
    # to raise.
    k.fit(input_data)
    with pytest.raises(ValueError):
        k.evaluate(np.array([]))
def test_constant_values_silverman():
    """
    Test that a data set with constant values does not fail when using
    Silverman's rule. Data with "almost" constant values should also get a
    bandwidth assigned automatically, even though Silverman's rule does not
    technically handle this case.

    https://github.com/tommyod/KDEpy/issues/9
    """
    data = np.ones(100, dtype=float)
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
    assert np.isclose(kde.bw, 1.0)

    data = np.ones(1000, dtype=float)
    data[0] = 0.0
    data[999] = 2.0
    kde = NaiveKDE(bw="silverman").fit(data)
    with pytest.warns(UserWarning):
        kde.evaluate()
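# For reference: Silverman's rule of thumb sets the bandwidth from the sample
# standard deviation and interquartile range. For constant data both are zero,
# so the rule gives h = 0 and an implementation must fall back to a positive
# bandwidth (hence the UserWarning above). This is the textbook formula, not
# necessarily KDEpy's exact implementation:
def silverman_bandwidth(x):
    """h = 0.9 * min(std, IQR / 1.349) * n**(-1/5)."""
    x = np.asarray(x, dtype=float)
    n = len(x)
    std = np.std(x, ddof=1)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    return 0.9 * min(std, iqr / 1.349) * n ** (-1 / 5)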
def test_common_API_patterns():
    """
    Test common API patterns.
    """
    # Simplest way, with an automatic grid
    data = [1, 2, 5, 10]
    x, y = NaiveKDE().fit(data).evaluate()

    # Using a pre-defined grid
    x = np.linspace(-10, 50)
    y1 = NaiveKDE().fit(data).evaluate(x)

    # No chaining
    k = NaiveKDE()
    k.fit(data)
    y2 = k.evaluate(x)

    assert np.allclose(y1, y2)
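# One more pattern worth noting: fit() also accepts per-observation weights,
# as the plotting example at the end of this section shows. A minimal sketch;
# the weight values here are made up for illustration.
def test_weighted_fit_pattern():
    data = [1, 2, 5, 10]
    weights = [1, 1, 2, 1]
    x, y = NaiveKDE().fit(data, weights).evaluate()
    assert np.all(y >= 0)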
import time

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema
from sklearn.neighbors import KernelDensity
from KDEpy import NaiveKDE


def var_coef(x):
    # Assumed helper (not shown in the original): coefficient of variation.
    return np.std(x) / np.mean(x)


def price_clustering(dataframe, min_items=2, min_var_coef=0.3,
                     column='product_id', fluctuation=0.20,
                     column_name='price_min'):
    # NOTE: min_items is currently unused.
    data = dataframe.copy()
    s = time.time()
    new_cat_counter = 0
    unique_categories = data[column].unique()
    segmentation_policy = {}

    for i in unique_categories:
        cat_mask = data[column] == i
        # x is sorted, so splits below operate on sorted values.
        x = np.sort(data.loc[cat_mask, column_name].values)
        var_coef_x = var_coef(x)

        # Rule-of-thumb bandwidth suggested by Stata:
        # h = 0.9 * min(std, IQR / 1.349) * n**(-1/5).
        # (The original divided the IQR by 1.349 twice; fixed here.)
        iqr = np.percentile(x, 75) - np.percentile(x, 25)
        m = min(np.std(x), iqr / 1.349)
        h = 0.9 * m / (len(x) ** (1 / 5))

        if h > 0 and len(x) > 2:
            if var_coef_x >= min_var_coef:
                # First pass: fixed-bandwidth KDE to locate density minima.
                x_d = np.linspace(x.min(), x.max(), len(x))
                kde = KernelDensity(bandwidth=h, kernel='gaussian')
                kde.fit(x[:, None])
                e = np.exp(kde.score_samples(x_d.reshape(-1, 1)))
                mi = argrelextrema(e, np.less)[0]
                x_split = np.split(x, mi)

                # Per-point bandwidths: each point gets a fraction
                # (`fluctuation`) of its segment's median price.
                x_band = np.concatenate(
                    [np.repeat(np.median(seg) * fluctuation, len(seg))
                     for seg in x_split])

                # Second pass: variable-bandwidth KDE, split at its minima.
                estimator = NaiveKDE(kernel='gaussian', bw=x_band).fit(x)
                y = estimator.evaluate(x_d)
                mi = argrelextrema(y, np.less)[0]
                x_split = np.split(x, mi)
                segmentation_policy[str(i)] = x_split

                base = str(int(i))
                for j in range(len(x_split)):
                    indexes = data.index[
                        cat_mask & data[column_name].isin(x_split[j])]
                    data.loc[indexes, column + '_by_price'] = base + '_' + str(j)
                    new_cat_counter += 1
            else:
                # Low price variation: keep the category as a single cluster.
                data.loc[cat_mask, column + '_by_price'] = str(int(i)) + '_0'
                new_cat_counter += 1
        else:
            if var_coef_x <= min_var_coef:
                data.loc[cat_mask, column + '_by_price'] = str(int(i)) + '_0'
                new_cat_counter += 1
            else:
                # Too few points to estimate a density: split into the
                # cheapest and most expensive observations.
                prices = data.loc[cat_mask, column_name]
                index_min = data.index[cat_mask & (data[column_name] == prices.min())]
                index_max = data.index[cat_mask & (data[column_name] == prices.max())]
                data.loc[index_min, column + '_by_price'] = str(int(i)) + '_0'
                data.loc[index_max, column + '_by_price'] = str(int(i)) + '_1'

    print('ran in ' + str(time.time() - s) + 's')
    print(str(new_cat_counter) + ' new categories found')
    return [data, segmentation_policy]
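# A hedged usage sketch for price_clustering(). The column names match the
# function defaults; the values below are made up, not from a real dataset.
df = pd.DataFrame({
    'product_id': [1, 1, 1, 1, 1, 1, 2, 2],
    'price_min': [10.0, 10.5, 11.0, 48.0, 50.0, 52.0, 5.0, 5.1],
})
clustered, policy = price_clustering(df)
# product_id 1 has widely spread prices, so it should be split into
# sub-categories ('1_0', '1_1', ...) in the new 'product_id_by_price' column,
# while product_id 2 (two nearly identical prices) stays a single cluster
# ('2_0'). `policy` maps each original category to its price segments.
print(clustered[['product_id', 'price_min', 'product_id_by_price']])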
import numpy as np
import matplotlib.pyplot as plt
from KDEpy import NaiveKDE, TreeKDE

# Example data: five points, matching the five bandwidths and weights below.
# (The values are assumed -- `data` was not defined in the original snippet.)
data = [2, 3, 4, 6, 8]
kernel = 'triweight'
bw = [3, 0.3, 1, 0.3, 2]
weights = [1, 1, 1, 1, 1]

plt.figure(figsize=(10, 4))
# Use suptitle so the title applies to the whole figure, not an axes that
# the first subplot call would replace.
plt.suptitle('Basic example of the naive KDE')

plt.subplot(1, 2, 1)
kde = NaiveKDE(kernel=kernel, bw=bw)
kde.fit(data, weights)
x = np.linspace(0, 10, num=1024)

# Plot each individual kernel, scaled down by the number of data points.
for d, b in zip(data, bw):
    k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
    plt.plot(x, k, color='k', ls='--')

y = kde.evaluate(x)
plt.plot(x, y)
plt.scatter(data, np.zeros_like(data))

plt.subplot(1, 2, 2)
kde = TreeKDE(kernel=kernel, bw=bw)
kde.fit(data, weights)

for d, b in zip(data, bw):
    k = NaiveKDE(kernel=kernel, bw=b).fit([d]).evaluate(x) / len(data)
    plt.plot(x, k, color='k', ls='--')

y = kde.evaluate(x)
plt.plot(x, y)
plt.scatter(data, np.zeros_like(data))

plt.show()