def test_scaling_inputs_gives_same_or_similar_results(self, cdnow_customers):
    """Check that multiplying ``recency`` and ``T`` by a constant barely moves P(alive).

    One fitter is trained on the fixture as-is; a second one is trained on
    the same frequencies with ``recency`` and ``T`` multiplied by ten (using
    ``iterative_fitting=2``).  The second fitter must report a ``_scale``
    below 1, and both fitters must agree on
    ``conditional_probability_alive`` to within 0.1 at two probe points,
    each expressed in that fitter's own time units.
    """
    factor = 10.
    frequency = cdnow_customers['frequency']
    recency = cdnow_customers['recency']
    age = cdnow_customers['T']

    baseline = estimation.ModifiedBetaGeoFitter()
    baseline.fit(frequency, recency, age)

    rescaled = estimation.ModifiedBetaGeoFitter()
    rescaled.fit(frequency, factor * recency, factor * age, iterative_fitting=2)

    assert rescaled._scale < 1.

    # (second, third) arguments of the unscaled probe; the rescaled fitter
    # receives the same pair multiplied by ``factor``.
    for t_x, T in ((1, 2), (2, 10)):
        from_rescaled = rescaled.conditional_probability_alive(1, factor * t_x, factor * T)
        from_baseline = baseline.conditional_probability_alive(1, t_x, T)
        assert abs(from_rescaled - from_baseline) < 10e-2
def test_fit_method_allows_for_better_accuracy_by_using_iterative_fitting(self, cdnow_customers):
    """Compare a default fit against a fit with ``iterative_fitting=5``.

    Both fits are preceded by ``np.random.seed(0)``.  The negative
    log-likelihood stored on the iterated fitter must be no larger than the
    one stored on the default fitter.
    """
    fit_args = (
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )
    default_fit = estimation.ModifiedBetaGeoFitter()
    iterated_fit = estimation.ModifiedBetaGeoFitter()

    np.random.seed(0)
    default_fit.fit(*fit_args)

    np.random.seed(0)
    iterated_fit.fit(*fit_args, iterative_fitting=5)

    assert iterated_fit._negative_log_likelihood_ <= default_fit._negative_log_likelihood_
def test_mgbf_does_not_hang_for_small_datasets_but_can_be_improved_with_iterative_fitting(self, cdnow_customers):
    """Fit on a tiny slice of the fixture, with and without extra iterations.

    The fixture is cut down to the rows whose index label is at most 2.
    Two fitters are then trained on that slice, each after
    ``np.random.seed(0)``: one with default settings and one with
    ``iterative_fitting=10``.  The negative log-likelihood of the iterated
    fit must not exceed that of the default fit.
    """
    # ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in 1.0.
    # ``.loc`` keeps the label-based, end-inclusive slicing that ``.ix``
    # applied to an integer index.
    # NOTE(review): assumes the fixture has an integer index -- confirm.
    reduced_dataset = cdnow_customers.loc[:2]

    frequency = reduced_dataset['frequency']
    recency = reduced_dataset['recency']
    age = reduced_dataset['T']

    mbfg1 = estimation.ModifiedBetaGeoFitter()
    mbfg2 = estimation.ModifiedBetaGeoFitter()

    np.random.seed(0)
    mbfg1.fit(frequency, recency, age)

    np.random.seed(0)
    mbfg2.fit(frequency, recency, age, iterative_fitting=10)

    assert mbfg1._negative_log_likelihood_ >= mbfg2._negative_log_likelihood_
def test_penalizer_term_will_shrink_coefs_to_0(self, cdnow_customers):
    """Check that a larger ``penalizer_coef`` yields a smaller parameter sum.

    Three fitters are trained on the same data: no penalizer (default fit),
    ``penalizer_coef=0.1`` (``iterative_fitting=3``) and ``penalizer_coef=1.``
    (``iterative_fitting=5``).  The sum of the fitted ``params_`` values must
    strictly decrease from each fitter to the next.
    """
    data = (
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )

    def param_total(fitter):
        # Sum of every fitted parameter value stored on the fitter.
        return np.array(list(fitter.params_.values())).sum()

    unpenalized = estimation.ModifiedBetaGeoFitter()
    unpenalized.fit(*data)
    total_unpenalized = param_total(unpenalized)

    lightly_penalized = estimation.ModifiedBetaGeoFitter(penalizer_coef=0.1)
    lightly_penalized.fit(*data, iterative_fitting=3)
    total_light = param_total(lightly_penalized)
    assert total_light < total_unpenalized

    heavily_penalized = estimation.ModifiedBetaGeoFitter(penalizer_coef=1.)
    heavily_penalized.fit(*data, iterative_fitting=5)
    total_heavy = param_total(heavily_penalized)
    assert total_heavy < total_light
def test_fit_with_index(self, cdnow_customers):
    """Check that the ``index`` argument of ``fit`` ends up on ``fitter.data``.

    With a descending ``range`` passed as ``index``, the index of
    ``mbgf.data`` must equal it element-wise.  With ``index=None`` the
    resulting index must not match that descending range everywhere.
    """
    mbgf = estimation.ModifiedBetaGeoFitter()
    # Descending labels n, n-1, ..., 1, so they differ from any ascending index.
    index = range(len(cdnow_customers), 0, -1)
    mbgf.fit(cdnow_customers['frequency'], cdnow_customers['recency'],
             cdnow_customers['T'], index=index)
    # Assert truthiness directly instead of comparing with ``== True``.
    assert (mbgf.data.index == index).all()

    mbgf = estimation.ModifiedBetaGeoFitter()
    mbgf.fit(cdnow_customers['frequency'], cdnow_customers['recency'],
             cdnow_customers['T'], index=None)
    assert not (mbgf.data.index == index).all()
def test_purchase_predictions_do_not_differ_much_if_looking_at_hourly_or_daily_frequencies(self):
    """Compare a 30-day purchase forecast built from daily vs. hourly summaries.

    The transaction log is summarised once with ``freq='D'`` and once with
    ``freq='h'``.  A single fitter is trained on each summary in turn (each
    time after ``np.random.seed(0)``) and asked for the expected number of
    purchases up to 30 days, expressed as ``30`` for the daily fit and
    ``30 * 24`` for the hourly fit.  The two forecasts must be almost equal.
    """
    days = 30
    hours_per_day = 24

    transactions = load_transaction_data(parse_dates=['date'])

    def summarise(freq):
        # Build the summary table at the requested time granularity.
        return utils.summary_data_from_transaction_data(
            transactions, 'id', 'date',
            observation_period_end=max(transactions.date),
            freq=freq)

    by_day = summarise('D')
    by_hour = summarise('h')

    fitter = estimation.ModifiedBetaGeoFitter()

    np.random.seed(0)
    fitter.fit(by_day['frequency'], by_day['recency'], by_day['T'])
    forecast_from_daily = fitter.expected_number_of_purchases_up_to_time(days)

    # The same fitter object is re-fitted on the hourly summary.
    np.random.seed(0)
    fitter.fit(by_hour['frequency'], by_hour['recency'], by_hour['T'])
    forecast_from_hourly = fitter.expected_number_of_purchases_up_to_time(
        days * hours_per_day)

    npt.assert_almost_equal(forecast_from_daily, forecast_from_hourly)
def test_probability_of_n_purchases_up_to_time_same_as_R_BTYD(self):
    """Compare ``probability_of_n_purchases_up_to_time`` with reference values.

    See https://cran.r-project.org/web/packages/BTYD/BTYD.pdf

    The fitter is not trained; its ``params_`` are set by hand and the
    method is then checked against one single value and one ten-element
    slice of the PMF.
    """
    from collections import OrderedDict

    mbgf = estimation.ModifiedBetaGeoFitter()
    # Built from a list of pairs so that key order is guaranteed on every
    # interpreter; a plain dict literal only preserves order on Python 3.7+.
    mbgf.params_ = OrderedDict([
        ('r', 0.243),
        ('alpha', 4.414),
        ('a', 0.793),
        ('b', 2.426),
    ])

    # probability that a customer will make 10 repeat transactions in the
    # time interval (0,2]
    expected = 1.07869e-07
    actual = mbgf.probability_of_n_purchases_up_to_time(2, 10)
    # NOTE(review): the absolute tolerance (10e-5 == 1e-4) is larger than the
    # expected value itself, so this check is loose; left unchanged.
    assert abs(expected - actual) < 10e-5

    # PMF for n = 11..20 with the first argument fixed at 30
    expected = np.array([
        0.0019995214, 0.0015170236, 0.0011633150, 0.0009003148, 0.0007023638,
        0.0005517902, 0.0004361913, 0.0003467171, 0.0002769613, 0.0002222260,
    ])
    actual = np.array([
        mbgf.probability_of_n_purchases_up_to_time(30, n)
        for n in range(11, 21)
    ])
    npt.assert_allclose(expected, actual, rtol=0.5)
def test_conditional_probability_alive_returns_lessthan_1_if_no_repeat_purchases(self, cdnow_customers):
    """After fitting, ``conditional_probability_alive(0, 1, 1)`` must be below 1."""
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )

    p_alive = fitter.conditional_probability_alive(0, 1, 1)
    assert p_alive < 1.0
def test_conditional_probability_alive_is_between_0_and_1(self, cdnow_customers):
    """Check that P(alive) stays within [0, 1] over a coarse grid of inputs.

    The first and second arguments each run over 0, 10, ..., 90; the third
    argument starts at the second one and also advances in steps of 10 up
    to 90.
    """
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )

    upper, step = 100, 10
    for first in range(0, upper, step):
        for second in range(0, upper, step):
            # The third argument never drops below the second one.
            for third in range(second, upper, step):
                p_alive = fitter.conditional_probability_alive(first, second, third)
                assert 0 <= p_alive <= 1.0
def test_expectation_returns_same_value_Hardie_excel_sheet(self, cdnow_customers):
    """Compare expected purchase counts at six time points with reference values.

    The fitter is trained with ``tol=1e-6`` and ``iterative_fitting=3``; its
    ``expected_number_of_purchases_up_to_time`` output must match the
    reference numbers (per the test name, from Hardie's Excel sheet) within
    a relative tolerance of 5%.
    """
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
        tol=1e-6,
        iterative_fitting=3,
    )

    # (time, reference expectation) pairs
    checkpoints = [
        (0.1429, 0.0078),
        (1.0, 0.0532),
        (3.00, 0.1506),
        (31.8571, 1.0405),
        (32.00, 1.0437),
        (78.00, 1.8576),
    ]
    horizons = np.array([when for when, _ in checkpoints])
    reference = np.array([value for _, value in checkpoints])

    predicted = fitter.expected_number_of_purchases_up_to_time(horizons)
    npt.assert_allclose(predicted, reference, rtol=0.05)
def test_conditional_probability_alive_matrix(self, cdnow_customers):
    """Check each matrix cell against a direct ``conditional_probability_alive`` call.

    Cell ``[t_x][x]`` of ``conditional_probability_alive_matrix()`` must
    equal ``conditional_probability_alive(x, t_x, max_T)`` exactly, where
    ``max_T`` is the largest ``T`` in the fitted data truncated to an int.
    """
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )

    alive_matrix = fitter.conditional_probability_alive_matrix()
    horizon = int(fitter.data['T'].max())

    n_rows = alive_matrix.shape[0]
    n_cols = alive_matrix.shape[1]
    for t_x in range(n_rows):
        for x in range(n_cols):
            direct = fitter.conditional_probability_alive(x, t_x, horizon)
            assert alive_matrix[t_x][x] == direct
def test_conditional_expectation_returns_same_value_as_Hardie_excel_sheet(self, cdnow_customers):
    """Compare one conditional purchase expectation with a reference value.

    For ``t=39``, ``x=2``, ``t_x=30.43`` and ``T=38.86`` the fitted model's
    ``conditional_expected_number_of_purchases_up_to_time`` must lie within
    0.05 of 1.226 (per the test name, the value from Hardie's Excel sheet).
    """
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
    )

    horizon, purchases, last_purchase_at, age = 39, 2, 30.43, 38.86
    reference = 1.226

    predicted = fitter.conditional_expected_number_of_purchases_up_to_time(
        horizon, purchases, last_purchase_at, age)
    assert abs(reference - predicted) < 0.05
def test_params_out_is_close_to_BTYDplus(self, cdnow_customers):
    """Compare fitted ``r``, ``alpha``, ``a``, ``b`` with reference values.

    See https://github.com/mplatzer/BTYDplus

    The fit uses ``iterative_fitting=3`` and the parameters must agree with
    the reference numbers to three decimal places.
    """
    fitter = estimation.ModifiedBetaGeoFitter()
    fitter.fit(
        cdnow_customers['frequency'],
        cdnow_customers['recency'],
        cdnow_customers['T'],
        iterative_fitting=3,
    )

    reference = np.array([0.525, 6.183, 0.891, 1.614])
    fitted = np.array(fitter._unload_params('r', 'alpha', 'a', 'b'))
    npt.assert_array_almost_equal(reference, fitted, decimal=3)