def summaryOutput(self, discount_rate=0.12, months=12):
    """Fit a BG/NBD model, compute per-customer CLV and expected average
    transaction value (via the pre-fit Gamma-Gamma model on ``self.ggf``),
    write the per-customer table to CSV, and print summary averages.

    Parameters
    ----------
    discount_rate: float, default 0.12. Annual discount rate.
    months: int, default 12. Prediction horizon in months.
    """
    beta_model = BetaGeoFitter()
    # Expected average transaction value per customer from the Gamma-Gamma model.
    self.summary_monetary['avg_transaction_value'] = self.ggf.conditional_expected_average_profit(
        self.summary_monetary['frequency'],
        self.summary_monetary['monetary_value'])
    # Fit the BG/NBD purchase-frequency model on the RFM summary columns.
    beta_model.fit(self.summary_monetary['frequency'],
                   self.summary_monetary['recency'],
                   self.summary_monetary['T'])
    # NOTE(review): annual_rate / months / 30 produces a *daily*-looking rate,
    # yet customer_lifetime_value is called with time in months — confirm the
    # intended compounding period (annual/months alone would be monthly).
    disc_rate = discount_rate/months/30
    self.summary_monetary['clv'] = self.ggf.customer_lifetime_value(
        beta_model,  # the model to use to predict the number of future transactions
        self.summary_monetary['frequency'],
        self.summary_monetary['recency'],
        self.summary_monetary['T'],
        self.summary_monetary['monetary_value'],
        time=months,  # months
        discount_rate=disc_rate  # monthly discount rate ~ 12.7% annually
    )
    # Persist the per-customer CLV / average-transaction-value table.
    self.summary_monetary.to_csv("CLV_AVG_transactionValue_perCustomer.csv", index=False)
    # Summary stats: model-expected average profit vs observed average
    # monetary value of repeat buyers (frequency > 0).
    print("Expected conditional average profit: {}, Average profit: {}".format(
        self.ggf.conditional_expected_average_profit(
            self.summary_monetary['frequency'],
            self.summary_monetary['monetary_value']).mean(),
        self.summary_monetary[self.summary_monetary['frequency']>0]['monetary_value'].mean()))
def readBetaGeoFitterModel():
    """Load and return the persisted BG/NBD model from its pickle file."""
    model = BetaGeoFitter()
    model.load_model("BetaGeoFitterModel.pkl")
    return model
def train_metric(d, metric, plot=True, penalty=0):
    """Fit a BG/NBD model for one named metric and compare actual vs
    model-simulated frequency distributions.

    Parameters
    ----------
    d: DataFrame with ``<metric>_frequency``, ``<metric>_recency``,
        ``<metric>_T`` columns.
    metric: str, column-name prefix.
    plot: bool, if True draw a bar chart of the comparison.
    penalty: float, L2 penalizer coefficient for the fitter.

    Returns
    -------
    (combined_counts, bgf): comparison DataFrame (columns Actual/Model,
    first 28 frequency levels) and the fitted model.
    """
    frequency = metric + "_frequency"
    recency = metric + "_recency"
    T = metric + "_T"
    # Keep repeat-capable rows only. .copy() fixes the original's
    # SettingWithCopy hazard: mutating a filtered view could silently fail
    # and/or write into the caller's DataFrame.
    train = d[(d[frequency] > 0) & (d[recency] >= 0)].copy()
    # BG/NBD models *repeat* transactions, so drop the first purchase.
    train[frequency] = train[frequency] - 1
    bgf = BetaGeoFitter(penalizer_coef=penalty)
    bgf.fit(train[frequency], train[recency], train[T])
    # Simulate a same-sized dataset from the fitted parameters.
    n = bgf.data.shape[0]
    simulated_data = bgf.generate_new_data(size=n)
    model_counts = pd.DataFrame(
        bgf.data["frequency"].value_counts().sort_index().iloc[:28])
    simulated_counts = pd.DataFrame(
        simulated_data["frequency"].value_counts().sort_index().iloc[:28])
    combined_counts = model_counts.merge(simulated_counts,
                                         how="outer",
                                         left_index=True,
                                         right_index=True).fillna(0)
    combined_counts.columns = ["Actual", "Model"]
    if plot:
        combined_counts.plot.bar()
        display()
    return combined_counts, bgf
def single_customer_evaluation(time_units=243):
    """Predict the number of purchases of one randomly chosen customer
    (conditional_expected_number_of_purchases_up_to_time).

    Parameters
    ----------
    time_units: int, default=243. Number of days for prediction.

    Returns
    -------
    (frequency_predicted, frequency_holdout)
    """
    # Restore the calibration-period model from disk.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/calibration_model.pkl")
    # Pull the calibration/holdout summary and sample one customer.
    holdout_df = pd.read_csv("datasets/summary_cal_holdout.csv")
    customer = holdout_df.sample()
    predicted = model.predict(
        t=time_units,
        frequency=customer["frequency_cal"],
        recency=customer["recency_cal"],
        T=customer["T_cal"])
    return predicted, customer["frequency_holdout"]
def create_cltv_pred(dataframe, w=4, m=1):
    """Fit BG/NBD and Gamma-Gamma models and add CLTV predictions.

    Parameters
    ----------
    dataframe: DataFrame with frequency / recency_weekly / T_weekly /
        monetary_avg columns.
    w: int, week horizon for the BG/NBD expected-sales column.
    m: int, month horizon for the Gamma-Gamma CLTV prediction.

    Returns
    -------
    DataFrame with expected sales, expected average profit, CLTV score
    (scaled 1-100) and an A/B/C segment; raw recency/frequency/monetary
    columns are dropped.
    """
    # BGNBD. .copy() fixes the original's SettingWithCopyWarning: all the
    # column assignments below were made on a filtered view.
    dataframe = dataframe[dataframe["monetary_avg"] > 0].copy()
    dataframe["frequency"] = dataframe["frequency"].astype(int)
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(dataframe['frequency'],
            dataframe['recency_weekly'],
            dataframe['T_weekly'])
    dataframe[f'exp_sales_{w}_week'] = bgf.predict(w,
                                                   dataframe['frequency'],
                                                   dataframe['recency_weekly'],
                                                   dataframe['T_weekly'])
    # Gamagama - expected_average_profit
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(dataframe['frequency'], dataframe['monetary_avg'])
    dataframe[
        "expected_average_profit"] = ggf.conditional_expected_average_profit(
            dataframe['frequency'], dataframe['monetary_avg'])
    # CLTV Prediction
    cltv = ggf.customer_lifetime_value(bgf,
                                       dataframe['frequency'],
                                       dataframe['recency_weekly'],
                                       dataframe['T_weekly'],
                                       dataframe['monetary_avg'],
                                       time=m,
                                       freq="W",
                                       discount_rate=0.01)
    dataframe[f'cltv_p_{m}_month'] = cltv
    # Rescale CLTV to 1-100 for readability, then cut into 3 segments.
    scaler = MinMaxScaler(feature_range=(1, 100))
    dataframe['cltv_p_score'] = scaler.fit_transform(
        dataframe[[f'cltv_p_{m}_month']])
    # cltv_p Segment
    dataframe['cltv_p_segment'] = pd.qcut(dataframe['cltv_p_score'],
                                          3,
                                          labels=['C', 'B', 'A'])
    new_col = dataframe.columns[~dataframe.columns.
                                isin(['recency', 'frequency', 'monetary'])]
    dataframe = dataframe[new_col]
    return dataframe
def bgf(cd_data):
    """BG/NBD model fitted on the CDNOW summary data (one extra fitting pass)."""
    fitted = BetaGeoFitter()
    fitted.fit(cd_data['frequency'],
               cd_data['recency'],
               cd_data['T'],
               iterative_fitting=1)
    return fitted
def test_plot_incremental_transactions(self):
    """Test plotting incremental transactions with CDNOW example."""
    # Raw CDNOW sample: whitespace-separated, no header.
    transactions = load_dataset('CDNOW_sample.txt', header=None, sep='\s+')
    transactions.columns = [
        'id_total', 'id_sample', 'date', 'num_cd_purc', 'total_value'
    ]
    # 39 calibration weeks; plot runs out to 2*t.
    t = 39
    freq = 'W'
    transactions_summary = utils.summary_data_from_transaction_data(
        transactions, 'id_sample', 'date', datetime_format='%Y%m%d',
        observation_period_end='19970930', freq=freq)
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(transactions_summary['frequency'],
            transactions_summary['recency'],
            transactions_summary['T'])
    plt.figure()
    plotting.plot_incremental_transactions(bgf,
                                           transactions,
                                           'date',
                                           'id_sample',
                                           2 * t,
                                           t,
                                           freq=freq,
                                           xlabel='week',
                                           datetime_format='%Y%m%d')
    # Returned figure is compared against a stored baseline image.
    return plt.gcf()
def fitted_bg(example_summary_data):
    """BG/NBD model fitted on the example summary data, no extra refit passes."""
    model = BetaGeoFitter()
    model.fit(
        example_summary_data['frequency'],
        example_summary_data['recency'],
        example_summary_data['T'],
        iterative_fitting=0,
    )
    return model
def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Test set_index as date for cumulative transactions and bgf fitter.

    Get first 14 cdnow transactions dates and validate that date index,
    freq_multiplier = 1 working and compare with tested data for last 4 records.

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]
    """
    datetime_col = "date"
    customer_id_col = "id_sample"
    t = 14  # number of periods (days) to accumulate over
    datetime_format = "%Y%m%d"
    freq = "D"
    observation_period_end = "19970930"
    freq_multiplier = 1
    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        customer_id_col,
        datetime_col,
        datetime_format=datetime_format,
        freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end,
    )
    transactions_summary = transactions_summary.reset_index()
    model = BetaGeoFitter()
    model.fit(transactions_summary["frequency"],
              transactions_summary["recency"],
              transactions_summary["T"])
    # set_index_date=True gives the cumulative frame a period/date index.
    df_cum = utils.expected_cumulative_transactions(
        model,
        cdnow_transactions,
        datetime_col,
        customer_id_col,
        t,
        datetime_format,
        freq,
        set_index_date=True,
        freq_multiplier=freq_multiplier,
    )
    # Known-good values for the last 4 days of the window.
    dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"]
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]
    date_index = df_cum.iloc[-4:].index.to_timestamp().astype(str)
    actual = df_cum["actual"].iloc[-4:].values
    predicted = df_cum["predicted"].iloc[-4:].values.round(2)
    assert all(dates == date_index)
    assert_allclose(actual, actual_trans)
    assert_allclose(predicted, expected_trans, atol=1e-2)
def bgf_transactions(cdnow_transactions):
    """Weekly RFM summary of the CDNOW sample fitted with a lightly penalized BG/NBD."""
    rfm = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        'id_sample',
        'date',
        datetime_format='%Y%m%d',
        observation_period_end='19970930',
        freq='W')
    model = BetaGeoFitter(penalizer_coef=0.01)
    model.fit(rfm['frequency'], rfm['recency'], rfm['T'])
    return model
def fitted_bg(example_summary_data):
    """BG/NBD model fitted with two extra fitting passes and a tight tolerance."""
    model = BetaGeoFitter()
    model.fit(
        example_summary_data["frequency"],
        example_summary_data["recency"],
        example_summary_data["T"],
        iterative_fitting=2,
        tol=1e-6,
    )
    return model
def bgnbd_model(summary):
    """Instantiate and fit a BG/NBD model.

    Args:
        summary: RFM transaction data.

    Returns:
        bgnbd model fit to the data.
    """
    model = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)
    model.fit(summary['frequency'], summary['recency'], summary['T'])
    return model
def rfm_model(data, end_date, f, p):
    """Build an RFM summary (monetary value < 600) and fit a BG/NBD model on it.

    Returns (rfm_summary, fitted_model).
    """
    summary = lifetimes.utils.summary_data_from_transaction_data(
        data,
        'customer_id',
        'date',
        monetary_value_col='amount',
        observation_period_end=end_date,
        freq=f)
    # Trim extreme spenders before fitting.
    summary = summary[summary.monetary_value < 600]
    model = BetaGeoFitter(penalizer_coef=p)
    model.fit(summary['frequency'], summary['recency'], summary['T'])
    return summary, model
def evaluation_plots(plot_type): """ Evaluation Plots: - Tracking Cumulative Transactions - Tracking Daily Transactions - Frequency of Repeated Transactions - Calibration vs Holdout. Parameters ---------- plot_type: str. "tracking" - Tracking Cumulative and Tracking Daily Transactions. "repeated" - Frequency of Repeated Transactions. "calibration_holdout" - Calibration vs Holdout Purchases. """ # Loading Calibration Model. cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0) cal_bg_nbd.load_model(path="models/calibration_model.pkl") # Loading summary_cal_holdout dataset. summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv") # Loading Transactions. transactions = pd.read_csv("datasets/transactions.csv") if plot_type == "tracking": fig = plt.figure(figsize=(20, 4)) plot_cumulative_transactions(model=cal_bg_nbd, transactions=transactions, datetime_col="order_purchase_timestamp", customer_id_col="customer_unique_id", t=604, t_cal=512, freq="D", ax=fig.add_subplot(121)) plot_incremental_transactions(model=cal_bg_nbd, transactions=transactions, datetime_col="order_purchase_timestamp", customer_id_col="customer_unique_id", t=604, t_cal=512, freq="D", ax=fig.add_subplot(122)) elif plot_type == "repeated": plot_period_transactions(model=cal_bg_nbd) elif plot_type == "calibration_holdout": plot_calibration_purchases_vs_holdout_purchases( model=cal_bg_nbd, calibration_holdout_matrix=summary_cal_holdout) return
def upload():
    """Flask POST handler: ingest an uploaded sales CSV and rank customers
    by expected purchase activity ("livelyhood") over the next 30 days.

    Saves the upload under ./uploads, restricts to the trailing 12 months of
    activity, fits a BG/NBD model, and returns the ranked summary as an HTML
    table. Non-POST requests return None.
    """
    if request.method == 'POST':
        f = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads',
                                 secure_filename(f.filename))
        f.save(file_path)
        df = pd.read_csv(file_path)
        df['salesDate'] = pd.to_datetime(df['salesDate'])
        cols_of_interest = ['memberID', 'salesDate', 'sales']
        df = df[cols_of_interest]
        # Normalize member IDs to integer-looking strings.
        df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))
        # Keep only the trailing 12 months of activity.
        max_date = df['salesDate'].max()
        min_date = max_date - relativedelta(months=+12)
        df = df.loc[(df['salesDate'] >= min_date)
                    & (df['salesDate'] <= max_date)]
        max_order = df['salesDate'].max()
        data = summary_data_from_transaction_data(
            df,
            'memberID',
            'salesDate',
            monetary_value_col='sales',
            observation_period_end=max_order)
        bgf = BetaGeoFitter(penalizer_coef=0.0001)
        bgf.fit(data['frequency'], data['recency'], data['T'])
        # Expected number of purchases in the next 30 days, per customer.
        t = 30
        data[
            'customer_livelyhood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
                t, data['frequency'], data['recency'], data['T'])
        data.sort_values(by='customer_livelyhood',
                         ascending=False,
                         inplace=True)
        return data.to_html()
    return None
def trainBetaGeoFitterModel():
    """Train the BG/NBD model on the stored CLV summary data and persist it."""
    summary = readsummaryDataFromTransactionDataForCLV()
    # Fit the model on the RFM summary columns.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.fit(summary["frequency"], summary["recency"], summary["T"])
    # Persist the fitted model as a pickle file, then show fit diagnostics.
    saveBetaGeoFitterModel(model)
    print(model.summary)
def probability_alive(historical_rfm_data):
    """Predicted conditional probability that a customer is still "alive".

    Parameters
    ----------
    historical_rfm_data: Historical Frequency, Recency & T of an individual.

    Returns
    -------
    Conditional probability alive.
    """
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/customer_lifetime_estimator.pkl")
    return model.conditional_probability_alive(
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])
def estimate_clv_model(summary, model_penalizer=None):
    """Fit BG/NBD and Gamma-Gamma models on an RFM summary.

    Parameters
    ----------
    summary: DataFrame with frequency / recency / T / monetary_value columns.
    model_penalizer: float or None. L2 penalizer for the BG/NBD fit
        (defaults to 0).

    Returns
    -------
    [bgf, ggf]: fitted BetaGeoFitter and GammaGammaFitter.
    """
    # Set default values if they are not stated.
    if model_penalizer is None:
        model_penalizer = 0

    # Building the Model using BG/NBD.
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # There cannot be non-positive values in the monetary_value or frequency vector.
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0)
                                             & (summary['frequency'] > 0)]

    # Setting up Gamma Gamma model.
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # BUG FIX: the original `if not (len(x) == 0 for x in [...])` tested the
    # truthiness of a *generator object* (always True), so the refit never ran.
    # Refit BG/NBD on the filtered data when no input vector is empty.
    if not any(len(x) == 0 for x in (summary_with_value_and_returns['recency'],
                                     summary_with_value_and_returns['frequency'],
                                     summary_with_value_and_returns['T'])):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    return [bgf, ggf]
def root_mean_squared_error(time_units=243):
    """Calculates Root Mean Squared Error of all predictions.

    Parameters
    ----------
    time_units: int, default=243. Number of days for prediction.

    Yields
    ------
    summary_cal_holdout_preds.csv

    Returns
    ------
    rmse
    """
    # Restore the calibration-period model.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/calibration_model.pkl")
    # Load the calibration/holdout summary.
    holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
    actual = holdout["frequency_holdout"].copy()
    # Predict holdout-period frequencies from calibration RFM.
    predictions = model.predict(t=time_units,
                                frequency=holdout["frequency_cal"],
                                recency=holdout["recency_cal"],
                                T=holdout["T_cal"])
    # Persist the summary with predictions attached.
    holdout["frequency_predictions"] = predictions.copy()
    out_path = Path.cwd() / "datasets/summary_cal_holdout_preds.csv"
    holdout.to_csv(out_path, index=False)
    return mean_squared_error(actual, predictions, squared=False)
def _calibration_model():
    """Trains BG/NBD Calibration Model.

    Yields
    ------
    calibration_model.pkl
    """
    holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
    # Fit on the calibration-period RFM columns.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.fit(frequency=holdout["frequency_cal"],
              recency=holdout["recency_cal"],
              T=holdout["T_cal"],
              verbose=True)
    # Persist the fitted model.
    model.save_model(path=Path.cwd() / "models/calibration_model.pkl")
    return
def _clv_model():
    """Trains BG/NBD Model on entire RFM data, final fit.

    Yields
    ------
    customer_lifetime_estimator.pkl
    """
    rfm = pd.read_csv("datasets/summary.csv")
    # Final fit on the full RFM summary.
    estimator = BetaGeoFitter(penalizer_coef=0.0)
    estimator.fit(frequency=rfm["frequency"],
                  recency=rfm["recency"],
                  T=rfm["T"],
                  verbose=True)
    # Persist the fitted estimator.
    estimator.save_model(path=Path.cwd() / "models/customer_lifetime_estimator.pkl")
    return
def number_of_purchases(historical_rfm_data, time_units=30):
    """Predicted conditional expected number of purchases.

    Parameters
    ----------
    historical_rfm_data: Historical Frequency, Recency & T of an individual.
    time_units: int, default=30. Number of days for predictions.

    Returns
    -------
    Expected number of purchases.
    """
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/customer_lifetime_estimator.pkl")
    return model.predict(t=time_units,
                         frequency=historical_rfm_data["frequency"],
                         recency=historical_rfm_data["recency"],
                         T=historical_rfm_data["T"])
def load_data_and_model():
    """Load the pickled calibration model and the cal/holdout summary dataset."""
    estimator = BetaGeoFitter(penalizer_coef=0.0)
    estimator.load_model("../models/calibration_model.pkl")
    holdout = pd.read_csv("../datasets/summary_cal_holdout.csv")
    return estimator, holdout
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):
    """Build a per-client CLV table from an activity stream.

    Fits BG/NBD (search frequency) and Gamma-Gamma (revenue) models, predicts
    searches over the next 14 days, alive probability, and CLV over
    ``clv_prediction_time`` months, then merges in historical totals.

    Parameters
    ----------
    data: DataFrame with client_id / activity_date / Searches / Revenue.
    clv_prediction_time: int or None, months of CLV to predict (default 12).
    model_penalizer: float or None, BG/NBD L2 penalizer (default 0).

    Returns
    -------
    DataFrame indexed by client_id with frequency/recency/value/CLV columns.
    """
    # Set default values if they are not stated.
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Remove non-search sessions.
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Aggregate the activity stream into RFM metrics; 'activity_date' plays
    # the role of the transaction date.
    summary = summary_data_from_transaction_data(
        data,
        'client_id',
        'activity_date',
        'Revenue',
        observation_period_end=max_date)

    # Building the Model using BG/NBD.
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Expected searches from each individual over the next t days.
    t = 14
    summary[
        'predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, summary['frequency'], summary['recency'], summary['T'])

    # Conditional alive probability.
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)

    # There cannot be non-positive values in the monetary_value or frequency vector.
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0)
                                             & (summary['frequency'] > 0)]
    if any(
            len(x) == 0 for x in (summary_with_value_and_returns['recency'],
                                  summary_with_value_and_returns['frequency'],
                                  summary_with_value_and_returns['T'])):
        logger.debug(data['client_id'])

    # Setting up Gamma Gamma model.
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # BUG FIX: the original `if not (len(x) == 0 for x in [...])` tested a
    # generator object's truthiness (always True), so this refit never ran.
    if not any(len(x) == 0
               for x in (summary_with_value_and_returns['recency'],
                         summary_with_value_and_returns['frequency'],
                         summary_with_value_and_returns['T'])):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Customer lifetime value using the Gamma-Gamma model.
    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  # the model to use to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Converting to dataframe keyed by client_id.
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })
    df_cpv = df_cpv.set_index('client_id')

    # Merge with original summary.
    df_merged = pd.merge(summary,
                         df_cpv,
                         left_index=True,
                         right_index=True,
                         how='outer')

    # Historical totals per client. List selection replaces the tuple form
    # (`['Searches', 'Revenue']` on a GroupBy), which is removed in modern pandas.
    data_hist = data.groupby(['client_id'])[['Searches', 'Revenue']].apply(
        lambda x: x.astype(float).sum())

    df_final = pd.merge(df_merged,
                        data_hist,
                        left_index=True,
                        right_index=True,
                        how='outer')

    # Prevent NaN on the pred_values column; .loc replaces the original
    # chained assignment (SettingWithCopy hazard).
    df_final.loc[df_final.frequency == 0, 'pred_values'] = 0.0

    # Combine historical and predicted customer value.
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Days since last active.
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Users inactive over 14 days are "Expired", otherwise "Active".
    df_final['user_status'] = np.where(df_final['last_active'] > 14, 'Expired',
                                       'Active')

    # Calculation date = max submission date.
    df_final['calc_date'] = max_date.date()

    # Rename columns as appropriate.
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non-returning customers from having 100% alive probability.
    df_final.loc[df_final.frequency == 0, 'alive_probability'] = 0.0

    return df_final
customer’s purchases divided by the total number of purchases. Note that the denominator here is different than the frequency described above. """ data = summary_data_from_transaction_data( df, customer_id, date_col, monetary_value_col='Sales', ) # observation_period_end='2011-12-9') # default period end date is the date when the last transaction happened ### Basic Frequency/Recency analysis using the BG/NBD model ### """ BG/NBD is an attractive alternative to the Pareto/NBD, which costs less computation and yields similar results. """ bgf = BetaGeoFitter(penalizer_coef=0.0) bgf.fit(data['frequency'], data['recency'], data['T']) print(bgf) # For small samples sizes, the parameters can get implausibly large, so by adding an l2 penalty the likelihood, # we can control how large these parameters can be. This is implemented as setting as positive penalizer_coef in the # initialization of the model. In typical applications, penalizers on the order of 0.001 to 0.1 are effective. # Model fit plot_period_transactions(bgf) # Calibration summary_cal_holdout = calibration_and_holdout_data( df, customer_id, date_col, calibration_period_end='2011-06-08', observation_period_end='2011-12-9')
def test_everything(X_train, y_train, X_test, y_test):
    '''
    1) test whether Full AdaBoost model performs better than BG/NBD
    2) test whether AdaBoost model trained on same variables performs better
    3) test Adaboost splitted in 8 RFM groups vs AdaBoost at once vs AdaBoost at RFM
    4) alternative test/train split
    '''
    #####################
    ##  FULL ADABOOST  ##
    #####################
    print_annotation('FULL ADABOOST')
    ada = AdaBoostClassifier()
    # Grid: 4 estimator counts x 4 learning rates, selected by F1.
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3,
                       scoring='f1').fit(X_train, y_train)
    # print(clf.best_params_)
    y_pred_full_ada = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred_full_ada))
    print(classification_report(y_test, y_pred_full_ada))

    ########################
    ##  PARTIAL ADABOOST  ##
    ########################
    # Same search, restricted to the three variables BG/NBD uses.
    print_annotation('PARTIAL ADABOOST')
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3, scoring='f1') \
        .fit(X_train[['txn_total', 'recency_true', 'T']], y_train)
    y_pred_part_ada = clf.predict(X_test[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test, y_pred_part_ada))
    print(classification_report(y_test, y_pred_part_ada))

    ##################
    ###   BG/NBD   ###
    ##################
    print_annotation('BG/NBD')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    # Recency/T divided by 7: model is fit on weekly units.
    bgf.fit(X_train['txn_total'], X_train['recency_true'] / 7, X_train['T'] / 7)
    t = 52  # predict one year (52 weeks) ahead
    y_pred_bgnbd = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
            t, X_test['txn_total'], X_test['recency_true'] / 7, X_test['T'] / 7
        )
    # Sweep churn thresholds: predicted purchases below threshold => churned.
    for threshold in np.linspace(0.7, 1.8, 4):
        threshold = round(threshold, 2)
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf = y_pred_bgnbd < threshold
        print('churn rate: ' + str(sum(y_pred_bgnbd_tf) / len(y_pred_bgnbd_tf)))
        print(confusion_matrix(y_test, y_pred_bgnbd_tf))
        print(classification_report(y_test, y_pred_bgnbd_tf))

    #############################
    ###   ALTERNATIVE SPLIT   ###
    #############################
    print('_' * 25)
    print('_,-*-,' * 4)
    print('_' * 25)
    # Re-run all three comparisons on a fresh split carved out of the test set.
    print_annotation('FULL ADABOOST alt split')
    X_train_alt, X_test_alt, y_train_alt, y_test_alt = \
        train_test_split(X_test, y_test, test_size=0.33, random_state=42)
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3,
                       scoring='f1').fit(X_train_alt, y_train_alt)
    # print(clf.best_params_)
    y_pred_ada_alt = clf.predict(X_test_alt)
    print(confusion_matrix(y_test_alt, y_pred_ada_alt))
    print(classification_report(y_test_alt, y_pred_ada_alt))

    ######################################
    print_annotation('PARTIAL ADABOOST alt split')
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3, scoring='f1') \
        .fit(X_train_alt[['txn_total', 'recency_true', 'T']], y_train_alt)
    y_pred_part_ada_alt = clf.predict(
        X_test_alt[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test_alt, y_pred_part_ada_alt))
    print(classification_report(y_test_alt, y_pred_part_ada_alt))

    ######################################
    print_annotation('BD/NBD alt split')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(X_train_alt['txn_total'], X_train_alt['recency_true'] / 7,
            X_train_alt['T'] / 7)
    t = 52
    y_pred_bgnbd_ALT = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
            t, X_test_alt['txn_total'], X_test_alt['recency_true'] / 7,
            X_test_alt['T'] / 7
        )
    # Wider threshold sweep (no rounding here, unlike the first pass).
    for threshold in np.linspace(0.2, 2.5, 6):
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf_alt = y_pred_bgnbd_ALT < threshold
        print('churn rate: ' +
              str(sum(y_pred_bgnbd_tf_alt) / len(y_pred_bgnbd_tf_alt)))
        print(confusion_matrix(y_test_alt, y_pred_bgnbd_tf_alt))
        print(classification_report(y_test_alt, y_pred_bgnbd_tf_alt))
import os import pandas as pd import pytest import matplotlib matplotlib.use('AGG') # use a non-interactive backend from matplotlib import pyplot as plt from lifetimes import plotting from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter from lifetimes.datasets import load_cdnow, load_transaction_data from lifetimes import utils bgf = BetaGeoFitter() cd_data = load_cdnow() bgf.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'], iterative_fitting=1) @pytest.mark.plottest class TestPlotting(): @pytest.mark.mpl_image_compare(tolerance=30) def test_plot_period_transactions(self): plt.figure() plotting.plot_period_transactions(bgf) return plt.gcf() @pytest.mark.mpl_image_compare(tolerance=30) def test_plot_period_transactions_parento(self):
def create_cltv_p(dataframe): today_date = dt.datetime(2011, 12, 11) # recency kullanıcıya özel dinamik. rfm = dataframe.groupby('Customer ID').agg({ 'InvoiceDate': [ lambda date: (date.max() - date.min()).days, lambda date: (today_date - date.min()).days ], 'Invoice': lambda num: num.nunique(), 'TotalPrice': lambda TotalPrice: TotalPrice.sum() }) rfm.columns = rfm.columns.droplevel(0) # recency_cltv_p rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary'] # basitleştirilmiş monetary_avg rfm["monetary"] = rfm["monetary"] / rfm["frequency"] rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True) # BGNBD için WEEKLY RECENCY VE WEEKLY T'nin HESAPLANMASI # recency_weekly_cltv_p rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7 rfm["T_weekly"] = rfm["T"] / 7 # KONTROL rfm = rfm[rfm["monetary_avg"] > 0] # recency filtre (daha saglıklı cltvp hesabı için) rfm = rfm[(rfm['frequency'] > 1)] rfm["frequency"] = rfm["frequency"].astype(int) # BGNBD bgf = BetaGeoFitter(penalizer_coef=0.01) bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly']) # exp_sales_1_month rfm["exp_sales_1_month"] = bgf.predict(4, rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly']) # exp_sales_3_month rfm["exp_sales_3_month"] = bgf.predict(12, rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly']) # expected_average_profit ggf = GammaGammaFitter(penalizer_coef=0.01) ggf.fit(rfm['frequency'], rfm['monetary_avg']) rfm["expected_average_profit"] = ggf.conditional_expected_average_profit( rfm['frequency'], rfm['monetary_avg']) # 6 aylık cltv_p cltv = ggf.customer_lifetime_value(bgf, rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'], rfm['monetary_avg'], time=6, freq="W", discount_rate=0.01) rfm["cltv_p"] = cltv # minmaxscaler scaler = MinMaxScaler(feature_range=(1, 100)) scaler.fit(rfm[["cltv_p"]]) rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]]) # rfm.fillna(0, inplace=True) # cltv_p_segment rfm["cltv_p_segment"] = 
pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"]) # recency_cltv_p, recency_weekly_cltv_p rfm = rfm[[ "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p", "T_weekly", "exp_sales_1_month", "exp_sales_3_month", "expected_average_profit", "cltv_p", "cltv_p_segment" ]] return rfm
def create_cltv_p(dataframe): today_date = dt.datetime(2011, 12, 11) # recency user-specific rfm = dataframe.groupby('Customer ID').agg({'InvoiceDate': [lambda date: (date.max() - date.min()).days, # "recency_cltv_p" lambda date: (today_date - date.min()).days], # "T" 'Invoice': lambda num: num.nunique(), # "frequency" 'TotalPrice': lambda TotalPrice: TotalPrice.sum()}) # "monetary" rfm.columns = rfm.columns.droplevel(0) # recency_cltv_p rfm.columns = ["recency_cltv_p", "T", "frequency", "monetary"] # Simplified monetary_avg (since Gamma-Gamma model requires this way) rfm["monetary"] = rfm["monetary"] / rfm["frequency"] rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True) # Calculating WEEKLY RECENCY VE WEEKLY T for BG/NBD MODEL # recency_weekly_cltv_p rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7 rfm["T_weekly"] = rfm["T"] / 7 # CHECK IT OUT! Monetary avg must be positive rfm = rfm[rfm["monetary_avg"] > 0] # recency filter rfm = rfm[(rfm["frequency"] > 1)] rfm["frequency"] = rfm["frequency"].astype(int) # converting it to integer just in case! 
# Establishing the BGNBD Model bgf = BetaGeoFitter(penalizer_coef=0.01) bgf.fit(rfm["frequency"], rfm["recency_weekly_cltv_p"], rfm["T_weekly"]) # exp_sales_1_month rfm["exp_sales_1_month"] = bgf.predict(4, rfm["frequency"], rfm["recency_weekly_cltv_p"], rfm["T_weekly"]) # exp_sales_3_month rfm["exp_sales_3_month"] = bgf.predict(12, rfm["frequency"], rfm["recency_weekly_cltv_p"], rfm["T_weekly"]) # Establishing Gamma-Gamma Model calculates=> Expected Average Profit ggf = GammaGammaFitter(penalizer_coef=0.01) ggf.fit(rfm["frequency"], rfm["monetary_avg"]) rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(rfm["frequency"], rfm["monetary_avg"]) # CLTV Pred for 6 months cltv = ggf.customer_lifetime_value(bgf, rfm["frequency"], rfm["recency_weekly_cltv_p"], rfm["T_weekly"], rfm["monetary_avg"], time=6, freq="W", discount_rate=0.01) rfm["cltv_p"] = cltv # Minmaxscaler scaler = MinMaxScaler(feature_range=(1, 100)) scaler.fit(rfm[["cltv_p"]]) rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]]) # rfm.fillna(0, inplace=True) # cltv_p_segment rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"]) # recency_cltv_p, recency_weekly_cltv_p rfm = rfm[["recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p", "T_weekly", "exp_sales_1_month", "exp_sales_3_month", "expected_average_profit", "cltv_p", "cltv_p_segment"]] return rfm
def create_cltv_p(dataframe):
    """Compute per-customer RFM metrics from invoices and return BG/NBD +
    Gamma-Gamma CLTV predictions with a 1-100 score and A/B/C segment.
    """
    # Fixed analysis date (dataset ends 2011-12-09).
    today_date = dt.datetime(2011, 12, 11)
    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,  # recency
            lambda date: (today_date - date.min()).days   # T (customer age)
        ],
        'Invoice': lambda num: num.nunique(),             # frequency
        'TotalPrice': lambda price: price.sum()           # monetary
    })
    rfm.columns = rfm.columns.droplevel(0)
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']
    # Average monetary value per purchase (Gamma-Gamma requirement).
    rfm['monetary'] = rfm['monetary'] / rfm['frequency']
    rfm.rename(columns={'monetary': 'monetary_avg'}, inplace=True)
    # Weekly units for the BG/NBD model.
    rfm["recency_weekly_cltv_p"] = rfm['recency_cltv_p'] / 7
    rfm['T_weekly'] = rfm['T'] / 7
    # Positive spend, repeat customers only.
    rfm = rfm[rfm['monetary_avg'] > 0]
    rfm = rfm[(rfm['frequency'] > 1)]
    rfm['frequency'] = rfm['frequency'].astype(int)
    #BGNBD
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'])
    # Expected sales over 4 and 12 weeks (~1 and 3 months).
    rfm["exp_sales_1_month"] = bgf.predict(4, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    rfm["exp_sales_3_month"] = bgf.predict(12, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    #Gamma Gamma
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])
    # 6-month CLTV prediction.
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm['frequency'],
                                       rfm['recency_weekly_cltv_p'],
                                       rfm['T_weekly'],
                                       rfm['monetary_avg'],
                                       time=6,
                                       freq='W',
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv
    # Rescale the CLTV score into [1, 100], then tercile into segments (A = best).
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])
    # Keep derived columns only.
    rfm = rfm[[
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment"
    ]]
    return rfm