def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self, transaction_data, bgf):
    """Plot holdout purchases against time since last purchase.

    Builds a calibration/holdout summary from ``transaction_data`` (split at
    '2014-09-01', observed until '2014-12-31'), fits ``bgf`` on the
    calibration columns, draws the plot on a fresh figure and returns that
    figure (presumably for an image-comparison harness -- confirm).
    """
    cal_holdout = utils.calibration_and_holdout_data(
        transaction_data,
        'id',
        'date',
        '2014-09-01',
        '2014-12-31',
    )

    frequency = cal_holdout['frequency_cal']
    recency = cal_holdout['recency_cal']
    age = cal_holdout['T_cal']
    bgf.fit(frequency, recency, age)

    # Open a new figure first so the plot does not land on a leftover one.
    plt.figure()
    plotting.plot_calibration_purchases_vs_holdout_purchases(
        bgf,
        cal_holdout,
        kind='time_since_last_purchase',
    )
    return plt.gcf()
def test_plot_calibration_purchases_vs_holdout_purchases(self):
    """Plot holdout purchases against calibration-period purchases.

    Loads the transaction data, builds the calibration/holdout summary
    (split at '2014-09-01', observed until '2014-12-31'), refits the
    module-level ``bgf`` on the calibration columns and returns the figure
    the plot was drawn on.
    """
    transactions = load_transaction_data()
    cal_holdout = utils.calibration_and_holdout_data(
        transactions, 'id', 'date', '2014-09-01', '2014-12-31'
    )

    frequency, recency, age = (
        cal_holdout[column]
        for column in ('frequency_cal', 'recency_cal', 'T_cal')
    )
    bgf.fit(frequency, recency, age)

    # Fresh figure so plt.gcf() below refers to this plot only.
    plt.figure()
    plotting.plot_calibration_purchases_vs_holdout_purchases(bgf, cal_holdout)
    return plt.gcf()
def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self):
    """Plot holdout purchases against time since last purchase.

    Uses the data returned by ``load_transaction_data()`` and the
    module-level ``bgf``; returns the current matplotlib figure after
    plotting with ``kind='time_since_last_purchase'``.
    """
    split_date, end_date = '2014-09-01', '2014-12-31'
    raw_transactions = load_transaction_data()
    holdout_summary = utils.calibration_and_holdout_data(
        raw_transactions, 'id', 'date', split_date, end_date
    )

    bgf.fit(
        holdout_summary['frequency_cal'],
        holdout_summary['recency_cal'],
        holdout_summary['T_cal'],
    )

    plt.figure()
    plotting.plot_calibration_purchases_vs_holdout_purchases(
        bgf, holdout_summary, kind='time_since_last_purchase'
    )
    figure = plt.gcf()
    return figure
def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self):
    """Plot holdout purchases grouped by time since last purchase.

    Steps: load transactions, build the calibration/holdout summary,
    refit the module-level ``bgf`` on the calibration columns, plot on a
    new figure, and hand that figure back to the caller.
    """
    summary_args = ("id", "date", "2014-09-01", "2014-12-31")
    transactions = load_transaction_data()
    cal_vs_holdout = utils.calibration_and_holdout_data(transactions, *summary_args)

    frequency_cal = cal_vs_holdout["frequency_cal"]
    recency_cal = cal_vs_holdout["recency_cal"]
    t_cal = cal_vs_holdout["T_cal"]
    bgf.fit(frequency_cal, recency_cal, t_cal)

    plt.figure()
    plotting.plot_calibration_purchases_vs_holdout_purchases(
        bgf,
        cal_vs_holdout,
        kind="time_since_last_purchase",
    )
    return plt.gcf()
def test_plot_calibration_purchases_vs_holdout_purchases(self):
    """Show the calibration-vs-holdout purchases plot interactively.

    Imports pyplot locally, builds the calibration/holdout summary from
    ``load_transaction_data()``, refits the module-level ``bgf`` and calls
    ``plt.show()``. Nothing is returned and nothing is asserted.
    """
    from matplotlib import pyplot as plt

    transactions = load_transaction_data()
    cal_holdout = utils.calibration_and_holdout_data(
        transactions,
        'id',
        'date',
        '2014-09-01',
        '2014-12-31',
    )

    frequency = cal_holdout['frequency_cal']
    recency = cal_holdout['recency_cal']
    age = cal_holdout['T_cal']
    bgf.fit(frequency, recency, age)

    plt.figure()
    plotting.plot_calibration_purchases_vs_holdout_purchases(bgf, cal_holdout)
    plt.show()
def evaluation_plots(plot_type):
    """
    Draw one of the model evaluation plots.

    Evaluation Plots:
    - Tracking Cumulative Transactions
    - Tracking Daily Transactions
    - Frequency of Repeated Transactions
    - Calibration vs Holdout.

    Parameters
    ----------
    plot_type: str.
        "tracking" - Tracking Cumulative and Tracking Daily Transactions.
        "repeated" - Frequency of Repeated Transactions.
        "calibration_holdout" - Calibration vs Holdout Purchases.

    Returns
    -------
    None
        The plot is drawn with matplotlib as a side effect.

    Raises
    ------
    ValueError
        If ``plot_type`` is not one of the three supported values. The
        check runs before any file is read, so a typo fails fast instead
        of silently drawing nothing.
    """
    valid_plot_types = ("tracking", "repeated", "calibration_holdout")
    if plot_type not in valid_plot_types:
        raise ValueError(
            "Unknown plot_type {!r}; expected one of {}".format(
                plot_type, ", ".join(valid_plot_types)
            )
        )

    # Loading Calibration Model.
    # NOTE(review): the model is deserialized from a .pkl file -- only load
    # files from a trusted location.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    if plot_type == "tracking":
        # Transactions are only needed by the two tracking plots, so the
        # CSV is read here rather than on every call.
        transactions = pd.read_csv("datasets/transactions.csv")
        tracking_kwargs = dict(
            model=cal_bg_nbd,
            transactions=transactions,
            datetime_col="order_purchase_timestamp",
            customer_id_col="customer_unique_id",
            t=604,
            t_cal=512,
            freq="D",
        )

        # Side-by-side axes: cumulative on the left, incremental on the right.
        fig = plt.figure(figsize=(20, 4))
        plot_cumulative_transactions(ax=fig.add_subplot(121), **tracking_kwargs)
        plot_incremental_transactions(ax=fig.add_subplot(122), **tracking_kwargs)
    elif plot_type == "repeated":
        plot_period_transactions(model=cal_bg_nbd)
    else:  # "calibration_holdout"
        # The calibration/holdout summary is only needed by this plot.
        summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
        plot_calibration_purchases_vs_holdout_purchases(
            model=cal_bg_nbd,
            calibration_holdout_matrix=summary_cal_holdout,
        )

    return
def calibrate_bgf(self, calib_end_date, period_end_date, viz=False):
    '''
    Visualize the goodness of fit of BGF model

    Splits ``self.transaction_data`` into calibration and holdout periods,
    refits ``self.bgf`` on the calibration columns and writes the
    calibration-vs-holdout plot to
    'calibration_purchases_vs_holdout_purchases.png' in the working
    directory.

    Parameters
    ----------
    calib_end_date :
        Passed as ``calibration_period_end``.
    period_end_date :
        Passed as ``observation_period_end``.
    viz : bool
        When truthy, print the head of the calibration/holdout summary.
        The plot file is written regardless of this flag.

    Note: ``self.bgf`` is refit in place on calibration data only.
    '''
    summary_cal_holdout = calibration_and_holdout_data(
        self.transaction_data,
        'CustomerNo',
        'OrderDate',
        # Caller-chosen split; the original note suggests ~75% of the data
        # is meant for training.
        calibration_period_end=calib_end_date,
        observation_period_end=period_end_date,
    )

    if viz:
        print(summary_cal_holdout.head())

    self.bgf.fit(
        summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'],
        summary_cal_holdout['T_cal'],
    )

    ax = plot_calibration_purchases_vs_holdout_purchases(
        self.bgf,
        summary_cal_holdout,
        colormap='coolwarm',
        alpha=0.75,
    )
    # Save and close the figure the plot was drawn on, not whatever figure
    # happens to be current, and release it even if saving fails.
    try:
        ax.figure.savefig('calibration_purchases_vs_holdout_purchases.png')
    finally:
        plt.close(ax.figure)
def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(
        self, transaction_data, bgf):
    """Check the 'time_since_last_purchase' calibration-vs-holdout plot.

    Fits ``bgf`` on the calibration columns of the summary built from
    ``transaction_data`` and verifies, on the returned axes, the y-values
    of the first two lines (holdout, then model predictions; tolerance
    0.01), the legend entries, the title and both axis labels.

    The figure is closed in a ``finally`` block so a failing assertion
    does not leave it open for later tests.
    """
    holdout_expected = [3.954, 3.431, 3.482, 3.484, 2.75, 2.289, 1.968]
    predictions_expected = [
        4.345, 2.993, 3.236, 2.677, 2.240, 2.608, 2.430
    ]
    labels = ['frequency_holdout', 'model_predictions']

    summary = utils.calibration_and_holdout_data(transaction_data, 'id',
                                                 'date', '2014-09-01',
                                                 '2014-12-31')
    bgf.fit(summary['frequency_cal'], summary['recency_cal'],
            summary['T_cal'])

    ax = plotting.plot_calibration_purchases_vs_holdout_purchases(
        bgf, summary, kind='time_since_last_purchase')

    try:
        lines = ax.lines
        legend = ax.legend_

        # get_data() returns (x, y); only the y-values are compared.
        holdout = lines[0].get_data()[1]
        predictions = lines[1].get_data()[1]

        assert_allclose(holdout, holdout_expected, atol=0.01)
        assert_allclose(predictions, predictions_expected, atol=0.01)
        assert_array_equal([e.get_text() for e in legend.get_texts()],
                           labels)
        assert_equal(
            ax.title.get_text(),
            "Actual Purchases in Holdout Period vs Predicted Purchases")
        assert_equal(ax.xaxis.get_label().get_text(),
                     "Time since user made last purchase")
        assert_equal(ax.yaxis.get_label().get_text(),
                     "Average of Purchases in Holdout Period")
    finally:
        plt.close(ax.figure)
def test_plot_calibration_purchases_vs_holdout_purchases(
        self, transaction_data, bgf):
    """Check the default calibration-vs-holdout purchases plot.

    Fits ``bgf`` on the calibration columns of the summary built from
    ``transaction_data`` and verifies, on the returned axes, the y-values
    of the first two lines (holdout, then model predictions; tolerance
    0.01), the legend entries, the title and both axis labels.

    The figure is closed in a ``finally`` block so a failing assertion
    does not leave it open for later tests.
    """
    holdout_expected = [0.161, 0.233, 0.348, 0.544, 0.710, 0.704, 1.606]
    predictions_expected = [
        0.270, 0.294, 0.402, 0.422, 0.706, 0.809, 1.019
    ]
    labels = ['frequency_holdout', 'model_predictions']

    summary = utils.calibration_and_holdout_data(transaction_data, 'id',
                                                 'date', '2014-09-01',
                                                 '2014-12-31')
    bgf.fit(summary['frequency_cal'], summary['recency_cal'],
            summary['T_cal'])

    ax = plotting.plot_calibration_purchases_vs_holdout_purchases(
        bgf, summary)

    try:
        lines = ax.lines
        legend = ax.legend_

        # get_data() returns (x, y); only the y-values are compared.
        holdout = lines[0].get_data()[1]
        predictions = lines[1].get_data()[1]

        assert_allclose(holdout, holdout_expected, atol=0.01)
        assert_allclose(predictions, predictions_expected, atol=0.01)
        assert_array_equal([e.get_text() for e in legend.get_texts()],
                           labels)
        assert_equal(
            ax.title.get_text(),
            "Actual Purchases in Holdout Period vs Predicted Purchases")
        assert_equal(ax.xaxis.get_label().get_text(),
                     "Purchases in calibration period")
        assert_equal(ax.yaxis.get_label().get_text(),
                     "Average of Purchases in Holdout Period")
    finally:
        plt.close(ax.figure)
from lifetimes.plotting import plot_period_transactions #used to validate the model plot_period_transactions(bgf) #another type of model validation summary_cal_holdout = calibration_and_holdout_data(df, 'CustomerID', 'InvoiceDate', calibration_period_end='2011-06-08', observation_period_end='2011-12-9' ) print(summary_cal_holdout.head()) from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal']) plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout) #predict the number of purchase made with t =days for single customer t = 30 individual = modeldata.loc[12380] bgf.predict(t, individual['frequency'], individual['recency'], individual['T']) from lifetimes.plotting import plot_history_alive import matplotlib.pyplot as plt fig = plt.figure(figsize=(12,8)) # plot setting width and height id = 14620 # id = 18074 id = 14606 days_since_birth = 365 sp_trans = df.loc[df['CustomerID'] == id]
def get_clv(oracle_conn_id, src_client_id, storage_bucket, ds, **context):
    """Compute customer lifetime value for one client and upload the results.

    Steps:
      1. Run ``context['templates_dict']['query']`` against the Oracle
         connection and load the result into a DataFrame (column names are
         lower-cased).
      2. Fit a BG/NBD model on the calibration part of the data and save
         four diagnostic charts as SVG files in the working directory.
      3. Fit a Gamma-Gamma model on returning customers and compute CLV
         for every customer; write the joined table to a CSV file.
      4. Upload the CSV and the four SVG charts to ``storage_bucket`` under
         ``<src_client_id>/<ds_nodash>/``.

    Parameters
    ----------
    oracle_conn_id : connection id handed to ``OracleHook``.
    src_client_id : client identifier; used in file names and object paths.
    storage_bucket : destination bucket name.
    ds : unused here; the date string is read from ``context['ds_nodash']``.
    **context : task context; must provide ``templates_dict['query']`` and
        ``ds_nodash``.

    Changes from the previous revision:
      * each chart is now uploaded from its own SVG file (previously every
        chart object received the contents of the CSV file);
      * the Oracle connection is closed once the query has been read;
      * unused function-local imports were dropped and the storage hook is
        created once.
    """
    import matplotlib.pyplot
    # Non-interactive mode: figures are only written to disk.
    matplotlib.pyplot.ioff()

    from lifetimes.utils import calibration_and_holdout_data
    from lifetimes.plotting import plot_frequency_recency_matrix
    from lifetimes.plotting import plot_probability_alive_matrix
    from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
    from lifetimes.plotting import plot_period_transactions
    from lifetimes.utils import summary_data_from_transaction_data
    from lifetimes import BetaGeoFitter
    from lifetimes import GammaGammaFitter
    import datetime
    import pandas as pd

    conn = OracleHook(oracle_conn_id=oracle_conn_id).get_conn()
    try:
        print(src_client_id, context)
        query = context['templates_dict']['query']
        data = pd.read_sql(query, con=conn)
    finally:
        # The connection is not needed after the query has been materialized.
        conn.close()
    data.columns = data.columns.str.lower()
    print(data.head())

    # Calculate RFM values
    calibration_end_date = datetime.datetime(2018, 5, 24)
    training_rfm = calibration_and_holdout_data(
        transactions=data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        calibration_period_end=calibration_end_date,
        freq='D',
        monetary_value_col='price_total')

    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(training_rfm['frequency_cal'], training_rfm['recency_cal'],
            training_rfm['T_cal'])
    print(bgf)

    # Chart file names share the "<ds_nodash><src_client_id>" prefix.
    ds_nodash = context.get("ds_nodash")
    chart_prefix = ds_nodash + str(src_client_id)
    plot_period_transactions_chart = (
        chart_prefix + '_plot_period_transactions_chart.svg')
    plot_frequency_recency_chart = (
        chart_prefix + '_plot_frequency_recency_matrix.svg')
    plot_probability_chart = (
        chart_prefix + '_plot_probability_alive_matrix.svg')
    plot_calibration_vs_holdout_chart = (
        chart_prefix + '_plot_calibration_vs_holdout_purchases.svg')

    # Matrix charts
    ax0 = plot_period_transactions(bgf, max_frequency=30)
    ax0.figure.savefig(plot_period_transactions_chart, format='svg')
    ax1 = plot_frequency_recency_matrix(bgf)
    ax1.figure.savefig(plot_frequency_recency_chart, format='svg')
    ax2 = plot_probability_alive_matrix(bgf)
    ax2.figure.savefig(plot_probability_chart, format='svg')
    ax3 = plot_calibration_purchases_vs_holdout_purchases(bgf,
                                                          training_rfm,
                                                          n=50)
    ax3.figure.savefig(plot_calibration_vs_holdout_chart, format='svg')

    full_rfm = summary_data_from_transaction_data(
        data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        monetary_value_col='price_total',
        datetime_format=None,
        observation_period_end=None,
        freq='D')

    # The Gamma-Gamma model is fit on customers with at least one repeat
    # purchase only.
    returning_full_rfm = full_rfm[full_rfm['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning_full_rfm['frequency'],
            returning_full_rfm['monetary_value'])

    customer_lifetime = 30  # expected number of months lifetime of a customer
    clv = ggf.customer_lifetime_value(
        bgf,  # the model to use to predict the number of future transactions
        full_rfm['frequency'],
        full_rfm['recency'],
        full_rfm['T'],
        full_rfm['monetary_value'],
        time=customer_lifetime,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    ).sort_values(ascending=False)

    full_rfm_with_value = full_rfm.join(clv)
    full_rfm_file = ds_nodash + "-src_client_id-" + str(
        src_client_id) + '-icabbi-test.csv'
    full_rfm_with_value.to_csv(full_rfm_file)

    # Upload every artifact from its OWN local file; the object name mirrors
    # the local file name under "<src_client_id>/<ds_nodash>/".
    storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id='google_conn_default')
    object_prefix = str(src_client_id) + "/" + ds_nodash + "/"
    local_files = (
        full_rfm_file,
        plot_period_transactions_chart,
        plot_frequency_recency_chart,
        plot_probability_chart,
        plot_calibration_vs_holdout_chart,
    )
    for local_file in local_files:
        storage_hook.upload(bucket=storage_bucket,
                            object=object_prefix + local_file,
                            filename=local_file)
val = np.sqrt(val) # Mean Absolute Error elif metric == 'mae': val = np.sum(np.abs(actuals - predicted)) / actuals.shape[0] else: val = None return val # score the model print('MSE: {0}'.format( score_model(combined_data["frequency_holdout"], predicted_freq, 'mse'))) # Modelleri karşılaştırmak için önemli olsa da, MSE metriğini herhangi bir modelin genel fit iyiliği açısından yorumlamak biraz daha zordur. # Modelimizin verilerimize ne kadar iyi fit olduğuna dair daha fazla bilgi sağlamak için, bazı gerçek ve tahmin edilen değerler arasındaki ilişkileri görselleştirelim. plot_calibration_purchases_vs_holdout_purchases(bgf, combined_data) plt.show() plot_period_transactions(bgf) plt.show() # Tahmin edilen frekans değeri combined_data'ya ekleme combined_data["frequency_predict"] = predicted_freq combined_data.head() ############################################################## # GAMMA GAMMA MODEL ############################################################## # Gamma Gamma'yı kullanabileceğimizden emin olmak için, frekans ve parasal değerlerin # ilişkili olup olmadığını kontrol etmemiz gerekir. (?)
# Score the holdout-period predictions with mean squared error.
mse = score_model(frequency_holdout_actual, frequency_holdout_predicted, 'mse')
print('MSE: {0}'.format(mse))

# COMMAND ----------

# MAGIC %md While important for comparing models, the MSE metric is a bit more challenging to interpret in terms of the overall goodness of fit of any individual model. To provide more insight into how well our model fits our data, let's visualize the relationships between some actual and predicted values.
# MAGIC
# MAGIC To get started, we can examine how purchase frequencies in the calibration period relate to actual (frequency_holdout) and predicted (model_predictions) frequencies in the holdout period:

# COMMAND ----------

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# The unpacked dict is equivalent to passing figsize=(8, 8) as a keyword.
plot_calibration_purchases_vs_holdout_purchases(model, input_pd, n=90, **{'figsize': (8, 8)})

# presumably the notebook helper that renders the current figure -- confirm
display()

# COMMAND ----------

# MAGIC %md What we see here is that a higher number of purchases in the calibration period predicts a higher average number of purchases in the holdout period but the actual values diverge sharply from model predictions when we consider customers with a large number of purchases (>60) in the calibration period. Thinking back to the charts in the data exploration section of this notebook, you might recall that there are very few customers with such a large number of purchases so that this divergence may be a result of a very limited number of instances at the higher end of the frequency range. More data may bring the predicted and actuals back together at this higher end of the curve. If this divergence persists, it may indicate a range of customer engagement frequency above which we cannot make reliable predictions.
# MAGIC
# MAGIC Using the same method call, we can visualize time since last purchase relative to the average number of purchases in the holdout period. This visualization illustrates that as time since the last purchase increases, the number of purchases in the holdout period decreases. In other words, those customers we haven't seen in a while aren't likely coming back anytime soon:
# MAGIC
# MAGIC NOTE As before, we will hide the code in the following cells to focus on the visualizations. Use **Show code** to see the associated Python logic.

# COMMAND ----------

plot_calibration_purchases_vs_holdout_purchases(
on='customer_id')  # closes a call that starts outside this span

# Binary flags derived from the model outputs:
#   wholesaler  -> 0 when predicted_cltv < 1000, otherwise 1
#   churn_group -> 0 when probability_alive < .5, otherwise 1
df_final['wholesaler'] = np.where(df_final['predicted_cltv'] < 1000, 0, 1)
df_final['churn_group'] = np.where(df_final['probability_alive'] < .5, 0, 1)
# Bare expression: has no effect outside a notebook/REPL
# (presumably this is a notebook cell -- confirm).
df_final

# Plots and Validation
plot_period_transactions(bgf_mod)

cal_hold = calibration_and_holdout_data(
    df,
    'customer_id',
    'date',
    calibration_period_end='2018-12-31',  #3 years calibration
    observation_period_end='2020-12-31',  #2 year holdout
    freq=frq)

# Plots the efficacy of the model using the hold-out period.
# Sets the default figure size via rcParams.
plt.rcParams['figure.figsize'] = (20, 10)

# A separate fitter is trained on the calibration columns only; `bgf_mod`
# (fitted outside this span) is left untouched and used for the other plots.
bgf = BetaGeoFitter()
bgf.fit(cal_hold['frequency_cal'], cal_hold['recency_cal'], cal_hold['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, cal_hold)

fig = plt.figure(figsize=(8, 6))
plot_frequency_recency_matrix(bgf_mod)

fig = plt.figure(figsize=(8, 6))
plot_probability_alive_matrix(bgf_mod)