def summaryOutput(self, discount_rate=0.12, months=12):
    '''
    Fit a BG/NBD model, then compute per-customer CLV and expected profit.

    Reads ``self.summary_monetary`` (columns ``frequency``, ``recency``, ``T``
    and ``monetary_value``) and ``self.ggf``, which is presumably an already
    fitted Gamma-Gamma model -- verify against the caller.

    Parameters
    ----------
    discount_rate : float
        Discount rate over the whole prediction horizon (default 12%).
    months : int
        Prediction horizon in months; also used to turn ``discount_rate``
        into the per-month rate passed to ``customer_lifetime_value``.

    Side effects
    ------------
    Adds ``avg_transaction_value`` and ``clv`` columns to
    ``self.summary_monetary``, writes the frame to
    ``CLV_AVG_transactionValue_perCustomer.csv`` and prints summary averages.
    '''
    summary = self.summary_monetary
    beta_model = BetaGeoFitter()

    # Expected average transaction value per customer (Gamma-Gamma model).
    expected_profit = self.ggf.conditional_expected_average_profit(
        summary['frequency'],
        summary['monetary_value'])
    summary['avg_transaction_value'] = expected_profit

    # Fit the BG/NBD model that predicts future transaction counts.
    beta_model.fit(summary['frequency'], summary['recency'], summary['T'])

    # customer_lifetime_value discounts once per month, so it needs a MONTHLY
    # rate. The previous extra division by 30 produced a daily rate and made
    # the discounting almost a no-op.
    monthly_discount_rate = discount_rate / months
    summary['clv'] = self.ggf.customer_lifetime_value(
        beta_model,  # the model used to predict the number of future transactions
        summary['frequency'],
        summary['recency'],
        summary['T'],
        summary['monetary_value'],
        time=months,  # months
        discount_rate=monthly_discount_rate)

    # Persist the per-customer table with the new columns.
    summary.to_csv("CLV_AVG_transactionValue_perCustomer.csv", index=False)

    # Summary stats; the observed average only covers repeat customers.
    repeat_customers = summary[summary['frequency'] > 0]
    print("Expected conditional average profit: {}, Average profit: {}".format(
        expected_profit.mean(),
        repeat_customers['monetary_value'].mean()))
def readBetaGeoFitterModel():
    """Return a ``BetaGeoFitter`` restored from ``BetaGeoFitterModel.pkl``."""
    restored_model = BetaGeoFitter()
    restored_model.load_model("BetaGeoFitterModel.pkl")
    return restored_model
Ejemplo n.º 3
0
def train_metric(d, metric, plot=True, penalty=0):
    """
    Fit a BG/NBD model for one metric and compare actual vs. simulated counts.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain ``<metric>_frequency``, ``<metric>_recency`` and
        ``<metric>_T`` columns.
    metric : str
        Prefix selecting which column triple to use.
    plot : bool
        When true, draw a bar chart of the comparison and call ``display()``
        (defined outside this block; presumably a notebook helper).
    penalty : float
        ``penalizer_coef`` passed to ``BetaGeoFitter``.

    Returns
    -------
    (combined_counts, bgf)
        ``combined_counts`` has columns ``Actual`` and ``Model`` holding the
        first 28 frequency buckets of the fitted data and of data simulated
        from the model; ``bgf`` is the fitted model.
    """
    frequency = metric + "_frequency"
    recency = metric + "_recency"
    T = metric + "_T"

    # Take an explicit copy of the filtered rows: assigning to a column of a
    # filtered slice raises SettingWithCopyWarning and is ambiguous in pandas.
    train = d[(d[frequency] > 0) & (d[recency] >= 0)].copy()
    # Shift counts down by one (presumably total purchases -> repeat purchases).
    train[frequency] = train[frequency] - 1

    bgf = BetaGeoFitter(penalizer_coef=penalty)
    bgf.fit(train[frequency], train[recency], train[T])
    n = bgf.data.shape[0]
    simulated_data = bgf.generate_new_data(size=n)

    model_counts = pd.DataFrame(
        bgf.data["frequency"].value_counts().sort_index().iloc[:28])
    simulated_counts = pd.DataFrame(
        simulated_data["frequency"].value_counts().sort_index().iloc[:28])
    # Outer join on the frequency value so buckets missing on one side become 0.
    combined_counts = model_counts.merge(simulated_counts,
                                         how="outer",
                                         left_index=True,
                                         right_index=True).fillna(0)
    combined_counts.columns = ["Actual", "Model"]
    if plot:
        combined_counts.plot.bar()
        display()
    return combined_counts, bgf
def single_customer_evaluation(time_units=243):
    """
    Predict the number of purchases for one randomly sampled customer.

    Loads the pickled calibration model and the calibration/holdout summary
    from disk, samples a single row and predicts that row's purchases.

    Parameters
    ----------
        time_units: int, default=243.
            Number of days for prediction.

    Returns
    -------
        (frequency_predicted, frequency_holdout)
    """
    # Restore the calibration model from disk.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/calibration_model.pkl")

    # Draw one customer at random from the calibration/holdout summary.
    customer = pd.read_csv("datasets/summary_cal_holdout.csv").sample()

    predicted = model.predict(t=time_units,
                              frequency=customer["frequency_cal"],
                              recency=customer["recency_cal"],
                              T=customer["T_cal"])
    observed = customer["frequency_holdout"]
    return predicted, observed
def create_cltv_pred(dataframe, w=4, m=1):
    """
    Fit BG/NBD and Gamma-Gamma models and add CLTV predictions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``frequency``, ``recency_weekly``, ``T_weekly`` and
        ``monetary_avg`` columns. The caller's frame is not modified.
    w : int
        Number of weeks for the BG/NBD expected-sales prediction.
    m : int
        Number of months for the Gamma-Gamma CLTV prediction.

    Returns
    -------
    pandas.DataFrame
        Rows with ``monetary_avg > 0`` plus the columns
        ``exp_sales_{w}_week``, ``expected_average_profit``,
        ``cltv_p_{m}_month``, ``cltv_p_score`` (scaled to 1-100) and
        ``cltv_p_segment`` (C/B/A terciles). Columns named ``recency``,
        ``frequency`` or ``monetary`` are dropped from the result.
    """
    # Copy the filtered rows so the column assignments below operate on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    dataframe = dataframe[dataframe["monetary_avg"] > 0].copy()
    dataframe["frequency"] = dataframe["frequency"].astype(int)

    # BG/NBD: expected number of sales in the next `w` weeks.
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(dataframe['frequency'], dataframe['recency_weekly'],
            dataframe['T_weekly'])

    dataframe[f'exp_sales_{w}_week'] = bgf.predict(w, dataframe['frequency'],
                                                   dataframe['recency_weekly'],
                                                   dataframe['T_weekly'])

    # Gamma-Gamma: expected average profit per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(dataframe['frequency'], dataframe['monetary_avg'])
    dataframe["expected_average_profit"] = ggf.conditional_expected_average_profit(
        dataframe['frequency'], dataframe['monetary_avg'])

    # CLTV over `m` months; recency/T are expressed in weeks, hence freq="W".
    cltv = ggf.customer_lifetime_value(bgf,
                                       dataframe['frequency'],
                                       dataframe['recency_weekly'],
                                       dataframe['T_weekly'],
                                       dataframe['monetary_avg'],
                                       time=m,
                                       freq="W",
                                       discount_rate=0.01)

    dataframe[f'cltv_p_{m}_month'] = cltv

    # Rescale the CLTV to a 1-100 score.
    scaler = MinMaxScaler(feature_range=(1, 100))
    dataframe['cltv_p_score'] = scaler.fit_transform(
        dataframe[[f'cltv_p_{m}_month']])

    # Split the score into three equally sized segments, A being the highest.
    dataframe['cltv_p_segment'] = pd.qcut(dataframe['cltv_p_score'],
                                          3,
                                          labels=['C', 'B', 'A'])

    # Drop the raw RFM inputs from the returned frame.
    new_col = dataframe.columns[~dataframe.columns.
                                isin(['recency', 'frequency', 'monetary'])]
    dataframe = dataframe[new_col]

    return dataframe
Ejemplo n.º 6
0
def bgf(cd_data):
    """Return a BG/NBD model fitted to ``cd_data`` with ``iterative_fitting=1``."""
    fitter = BetaGeoFitter()
    fitter.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'],
               iterative_fitting=1)
    return fitter
Ejemplo n.º 7
0
    def test_plot_incremental_transactions(self):
        """Test plotting incremental transactions with CDNOW example.

        Returns the current matplotlib figure (presumably so an image
        comparison plugin can check it -- no decorator is visible here).
        """
        # Raw string: '\s' is an invalid escape sequence in a normal literal.
        transactions = load_dataset('CDNOW_sample.txt', header=None, sep=r'\s+')
        transactions.columns = [
            'id_total', 'id_sample', 'date', 'num_cd_purc', 'total_value'
        ]
        t = 39
        freq = 'W'

        # Summarise the transactions per customer up to the end of the
        # observation period.
        transactions_summary = utils.summary_data_from_transaction_data(
            transactions,
            'id_sample',
            'date',
            datetime_format='%Y%m%d',
            observation_period_end='19970930',
            freq=freq)

        bgf = BetaGeoFitter(penalizer_coef=0.01)
        bgf.fit(transactions_summary['frequency'],
                transactions_summary['recency'], transactions_summary['T'])

        plt.figure()
        # Plot over 2 * t periods, with t passed as the calibration length.
        plotting.plot_incremental_transactions(bgf,
                                               transactions,
                                               'date',
                                               'id_sample',
                                               2 * t,
                                               t,
                                               freq=freq,
                                               xlabel='week',
                                               datetime_format='%Y%m%d')
        return plt.gcf()
Ejemplo n.º 8
0
def fitted_bg(example_summary_data):
    """Return a BG/NBD model fitted to ``example_summary_data`` with ``iterative_fitting=0``."""
    model = BetaGeoFitter()
    frequency = example_summary_data['frequency']
    recency = example_summary_data['recency']
    age = example_summary_data['T']
    model.fit(frequency, recency, age, iterative_fitting=0)
    return model
Ejemplo n.º 9
0
def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Check ``expected_cumulative_transactions`` with a date index.

    Fits a BG/NBD model on the CDNOW transactions, computes the cumulative
    transactions for the first 14 days with ``set_index_date=True`` and
    ``freq_multiplier=1``, then compares the last four rows with known values:

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]
    """
    date_column = "date"
    id_column = "id_sample"
    date_format = "%Y%m%d"
    period_unit = "D"
    multiplier = 1
    n_periods = 14

    # Per-customer summary with a flat (reset) index.
    rfm = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        id_column,
        date_column,
        datetime_format=date_format,
        freq=period_unit,
        freq_multiplier=multiplier,
        observation_period_end="19970930",
    ).reset_index()

    fitter = BetaGeoFitter()
    fitter.fit(rfm["frequency"], rfm["recency"], rfm["T"])

    cumulative = utils.expected_cumulative_transactions(
        fitter,
        cdnow_transactions,
        date_column,
        id_column,
        n_periods,
        date_format,
        period_unit,
        set_index_date=True,
        freq_multiplier=multiplier,
    )

    # Reference values for the last four days.
    dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"]
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    last_rows = cumulative.iloc[-4:]
    date_index = last_rows.index.to_timestamp().astype(str)
    actual = last_rows["actual"].values
    predicted = last_rows["predicted"].values.round(2)

    assert all(dates == date_index)
    assert_allclose(actual, actual_trans)
    assert_allclose(predicted, expected_trans, atol=1e-2)
Ejemplo n.º 10
0
def bgf_transactions(cdnow_transactions):
    """Return a BG/NBD model fitted to a weekly summary of ``cdnow_transactions``."""
    weekly_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        'id_sample',
        'date',
        datetime_format='%Y%m%d',
        observation_period_end='19970930',
        freq='W',
    )

    fitter = BetaGeoFitter(penalizer_coef=0.01)
    fitter.fit(weekly_summary['frequency'], weekly_summary['recency'],
               weekly_summary['T'])
    return fitter
Ejemplo n.º 11
0
def fitted_bg(example_summary_data):
    """Return a BG/NBD model fitted with ``iterative_fitting=2`` and ``tol=1e-6``."""
    model = BetaGeoFitter()
    frequency = example_summary_data["frequency"]
    recency = example_summary_data["recency"]
    age = example_summary_data["T"]
    model.fit(frequency, recency, age, iterative_fitting=2, tol=1e-6)
    return model
Ejemplo n.º 12
0
def bgnbd_model(summary):
    """Instantiate and fit a BG/NBD model.

    Args:
      summary: RFM transaction data with ``frequency``, ``recency`` and ``T``.
    Returns:
      BG/NBD model (penalised with ``PENALIZER_COEF``) fit to the data.
    """
    model = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)
    frequency, recency, age = (summary[col] for col in ('frequency', 'recency', 'T'))
    model.fit(frequency, recency, age)
    return model
Ejemplo n.º 13
0
def rfm_model(data, end_date, f, p):
    """
    Build an RFM summary from transactions and fit a BG/NBD model to it.

    ``end_date`` is the observation period end, ``f`` the period frequency
    and ``p`` the penalizer coefficient. Customers whose ``monetary_value``
    is 600 or more are excluded before fitting.

    Returns ``(rfm_summary, fitted_model)``.
    """
    summary = lifetimes.utils.summary_data_from_transaction_data(
        data,
        'customer_id',
        'date',
        monetary_value_col='amount',
        observation_period_end=end_date,
        freq=f,
    )
    below_cap = summary.monetary_value < 600
    summary = summary[below_cap]

    model = BetaGeoFitter(penalizer_coef=p)
    model.fit(summary['frequency'], summary['recency'], summary['T'])
    return summary, model
def evaluation_plots(plot_type):
    """
    Draw one of the model evaluation plots.

    The calibration model, the calibration/holdout summary and the raw
    transactions are always loaded from disk, whichever plot is requested.

    Parameters
    ----------
        plot_type: str.
            "tracking" - Tracking Cumulative and Tracking Daily Transactions.
            "repeated" - Frequency of Repeated Transactions.
            "calibration_holdout" - Calibration vs Holdout Purchases.
            Any other value draws nothing.
    """
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model(path="models/calibration_model.pkl")

    holdout_summary = pd.read_csv("datasets/summary_cal_holdout.csv")
    raw_transactions = pd.read_csv("datasets/transactions.csv")

    if plot_type == "tracking":
        # Both tracking plots share everything except the target axes.
        tracking_args = dict(model=model,
                             transactions=raw_transactions,
                             datetime_col="order_purchase_timestamp",
                             customer_id_col="customer_unique_id",
                             t=604,
                             t_cal=512,
                             freq="D")
        figure = plt.figure(figsize=(20, 4))
        plot_cumulative_transactions(ax=figure.add_subplot(121), **tracking_args)
        plot_incremental_transactions(ax=figure.add_subplot(122), **tracking_args)
    elif plot_type == "repeated":
        plot_period_transactions(model=model)
    elif plot_type == "calibration_holdout":
        plot_calibration_purchases_vs_holdout_purchases(
            model=model, calibration_holdout_matrix=holdout_summary)
Ejemplo n.º 15
0
def upload():
    """
    Handle an uploaded sales CSV and return a ranked customer table as HTML.

    On POST the uploaded file is saved under ``uploads/`` next to this module,
    restricted to the 12 months before its latest ``salesDate``, summarised
    per ``memberID`` and scored with a BG/NBD model. The returned HTML table
    is sorted by ``customer_livelyhood``, the expected number of purchases in
    the next 30 periods.

    Returns ``None`` for any other request method.
    NOTE(review): a ``None`` return is presumably rejected by the web
    framework -- confirm which methods the route accepts.
    """
    if request.method != 'POST':
        return None

    f = request.files['file']

    # secure_filename sanitises the client-supplied name before it is used
    # to build a path on disk.
    basepath = os.path.dirname(__file__)
    file_path = os.path.join(basepath, 'uploads',
                             secure_filename(f.filename))
    f.save(file_path)
    df = pd.read_csv(file_path)

    df['salesDate'] = pd.to_datetime(df['salesDate'])

    # Work on an explicit copy of the needed columns so the assignment below
    # does not target a slice of the original frame.
    cols_of_interest = ['memberID', 'salesDate', 'sales']
    df = df[cols_of_interest].copy()

    # Render member ids as integer-looking strings.
    df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))

    # Keep only the 12 months leading up to the latest sale.
    max_date = df['salesDate'].max()
    min_date = max_date - relativedelta(months=+12)
    df = df.loc[(df['salesDate'] >= min_date)
                & (df['salesDate'] <= max_date)]

    max_order = df['salesDate'].max()
    data = summary_data_from_transaction_data(
        df,
        'memberID',
        'salesDate',
        monetary_value_col='sales',
        observation_period_end=max_order)

    bgf = BetaGeoFitter(penalizer_coef=0.0001)
    bgf.fit(data['frequency'], data['recency'], data['T'])

    # Expected purchases over the next 30 periods.
    t = 30
    data['customer_livelyhood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, data['frequency'], data['recency'], data['T'])

    data.sort_values(by='customer_livelyhood',
                     ascending=False,
                     inplace=True)

    return data.to_html()
Ejemplo n.º 16
0
def trainBetaGeoFitterModel():
    """Fit a BG/NBD model on the CLV summary data, save it and print its summary."""
    rfm_summary = readsummaryDataFromTransactionDataForCLV()

    # Train the model without regularisation.
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.fit(rfm_summary["frequency"], rfm_summary["recency"],
              rfm_summary["T"])

    # Persist the fitted model (pickle file) and report the fitted parameters.
    saveBetaGeoFitterModel(model)
    print(model.summary)
def probability_alive(historical_rfm_data):
    """
    Conditional probability that a customer is still alive.

    The pickled customer lifetime model is loaded from disk on every call.

    Parameters
    ----------
        historical_rfm_data: Historical Frequency, Recency & T of an individual

    Returns
    -------
        Conditional Probability Alive.
    """
    estimator = BetaGeoFitter(penalizer_coef=0.0)
    estimator.load_model(path="models/customer_lifetime_estimator.pkl")

    return estimator.conditional_probability_alive(
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"],
    )
Ejemplo n.º 18
0
def estimate_clv_model(summary, model_penalizer=None):
  """Fit the BG/NBD and Gamma-Gamma models used for CLV estimation.

  Args:
    summary: RFM frame with ``frequency``, ``recency``, ``T`` and
      ``monetary_value`` columns.
    model_penalizer: penalizer coefficient for the BG/NBD model; ``None``
      means 0.

  Returns:
    ``[bgf, ggf]``. The Gamma-Gamma model is fit on customers with positive
    frequency and monetary value; when any such customer exists the BG/NBD
    model is refit on that same subset.
  """
  # Set default values if they are not stated
  if model_penalizer is None:
    model_penalizer = 0

  # Building the Model using BG/NBD
  bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
  bgf.fit(summary['frequency'], summary['recency'], summary['T'])

  # There cannot be non-positive values in the monetary_value or frequency vector
  summary_with_value_and_returns = summary[(summary['monetary_value'] > 0) & (summary['frequency'] > 0)]

  # Setting up Gamma Gamma model
  ggf = GammaGammaFitter(penalizer_coef=0)
  ggf.fit(summary_with_value_and_returns['frequency'], summary_with_value_and_returns['monetary_value'])

  # Refit the BG/NBD model on the same subset unless it is empty. The old
  # test `not (generator expression)` was always False because a generator
  # object is truthy, so this refit never ran.
  if len(summary_with_value_and_returns) > 0:
    bgf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['recency'],
            summary_with_value_and_returns['T'])

  return [bgf, ggf]
def root_mean_squared_error(time_units=243):
    """
    Calculates Root Mean Squared Error of all predictions.

    Loads the calibration model and the calibration/holdout summary, predicts
    every customer's purchases and compares them with the holdout frequency.

    Parameters
    ----------
        time_units: int, default=243.
            Number of days for prediction.

    Side effects
    ------------
        Writes datasets/summary_cal_holdout_preds.csv (the summary plus a
        ``frequency_predictions`` column) under the current working directory.

    Returns
    -------
        rmse
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
    frequency_holdout = summary_cal_holdout["frequency_holdout"].copy()

    # Predictions.
    frequency_predictions = cal_bg_nbd.predict(
        t=time_units,
        frequency=summary_cal_holdout["frequency_cal"],
        recency=summary_cal_holdout["recency_cal"],
        T=summary_cal_holdout["T_cal"])

    # Adding Predictions to Summary dataset.
    summary_cal_holdout["frequency_predictions"] = frequency_predictions.copy()
    file_path = Path.cwd() / "datasets/summary_cal_holdout_preds.csv"
    summary_cal_holdout.to_csv(file_path, index=False)

    # Take the square root explicitly: the `squared=False` flag is deprecated
    # and absent from newer scikit-learn releases.
    rmse = mean_squared_error(frequency_holdout, frequency_predictions) ** 0.5
    return rmse
Ejemplo n.º 20
0
def _calibration_model():
    """
    Train the BG/NBD calibration model and pickle it.

    Reads datasets/summary_cal_holdout.csv and writes
    models/calibration_model.pkl under the current working directory.
    """
    cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Fit on the calibration-period columns only.
    calibration_fitter = BetaGeoFitter(penalizer_coef=0.0)
    calibration_fitter.fit(
        frequency=cal_holdout["frequency_cal"],
        recency=cal_holdout["recency_cal"],
        T=cal_holdout["T_cal"],
        verbose=True,
    )

    # Persist the fitted model.
    calibration_fitter.save_model(path=Path.cwd() / "models/calibration_model.pkl")
Ejemplo n.º 21
0
def _clv_model():
    """
    Train the final BG/NBD model on the entire RFM data and pickle it.

    Reads datasets/summary.csv and writes
    models/customer_lifetime_estimator.pkl under the current working directory.
    """
    rfm = pd.read_csv("datasets/summary.csv")

    # Final fit on the full summary (not the calibration split).
    estimator = BetaGeoFitter(penalizer_coef=0.0)
    estimator.fit(
        frequency=rfm["frequency"],
        recency=rfm["recency"],
        T=rfm["T"],
        verbose=True,
    )

    # Persist the fitted model.
    estimator.save_model(path=Path.cwd() / "models/customer_lifetime_estimator.pkl")
def number_of_purchases(historical_rfm_data, time_units=30):
    """
    Predicted conditional expected number of purchases.

    The pickled customer lifetime model is loaded from disk on every call.

    Parameters
    ----------
        historical_rfm_data: Historical Frequency, Recency & T of an individual

        time_units: int, default=30.
            Number of days for predictions.
    Returns
    -------
        expected number of purchases.
    """
    estimator = BetaGeoFitter(penalizer_coef=0.0)
    estimator.load_model(path="models/customer_lifetime_estimator.pkl")

    return estimator.predict(
        t=time_units,
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"],
    )
Ejemplo n.º 23
0
def load_data_and_model():
    """Load the pickled calibration model and the calibration/holdout summary.

    Returns ``(model, summary_cal_holdout)``; both paths are relative to the
    parent of the working directory.
    """
    calibration_model = BetaGeoFitter(penalizer_coef=0.0)
    calibration_model.load_model("../models/calibration_model.pkl")
    holdout_summary = pd.read_csv("../datasets/summary_cal_holdout.csv")
    return calibration_model, holdout_summary
Ejemplo n.º 24
0
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):
    """
    Build a per-client table of historical and predicted customer value.

    Parameters
    ----------
    data : pandas.DataFrame
        Activity stream with ``client_id``, ``activity_date``, ``Searches``
        and ``Revenue`` columns. ``activity_date`` values must support
        ``.date()`` on their maximum (e.g. pandas timestamps).
    clv_prediction_time : int, optional
        CLV horizon in months; ``None`` means 12. The output column is named
        ``predicted_clv_12_months`` whatever value is passed.
    model_penalizer : float, optional
        Penalizer coefficient for the BG/NBD model; ``None`` means 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by client id with the columns ``frequency``, ``recency``,
        ``customer_age``, ``avg_session_value``,
        ``predicted_searches_14_days``, ``alive_probability``,
        ``predicted_clv_12_months``, ``historical_searches``,
        ``historical_clv``, ``total_clv``, ``days_since_last_active``,
        ``user_status`` and ``calc_date``.
    """
    # Set default values if they are not stated
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Remove non search sessions
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Aggregate the activity stream into frequency / recency / T /
    # monetary_value per client, observed up to the latest activity date.
    summary = summary_data_from_transaction_data(
        data,
        'client_id',
        'activity_date',
        'Revenue',
        observation_period_end=max_date)

    # Building the Model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Expected number of searches per client over the next t days.
    t = 14
    summary['predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, summary['frequency'], summary['recency'], summary['T'])

    # Conditional alive probability (calc_alive_prob is defined elsewhere).
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)

    # There cannot be non-positive values in the monetary_value or frequency vector
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0)
                                             & (summary['frequency'] > 0)]
    has_returning_customers = len(summary_with_value_and_returns) > 0

    # Log the client ids when no returning customer with value is left.
    if not has_returning_customers:
        logger.debug(data['client_id'])

    # Setting up Gamma Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the same subset unless it is empty. The old
    # test `not (generator expression)` was always False because a generator
    # object is truthy, so this refit never ran.
    if has_returning_customers:
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Customer lifetime value from the Gamma Gamma model over the horizon.
    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  # the model to use to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Converting to a dataframe indexed by client_id
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })
    df_cpv = df_cpv.set_index('client_id')

    # Merge with original summary
    df_merged = pd.merge(summary,
                         df_cpv,
                         left_index=True,
                         right_index=True,
                         how='outer')

    # Historical totals per client. The columns are selected with a list:
    # the old tuple-style selection is rejected by newer pandas versions.
    data_hist = data.groupby(
        ['client_id'])[['Searches',
                        'Revenue']].apply(lambda x: x.astype(float).sum())

    # Merge with original summary
    df_final = pd.merge(df_merged,
                        data_hist,
                        left_index=True,
                        right_index=True,
                        how='outer')

    # Prevent NaN on the predicted value column. `.loc` replaces the former
    # chained assignment, which may silently write to a temporary copy.
    df_final.loc[df_final['frequency'] == 0, 'pred_values'] = 0.0

    # Create column that combines historical and predicted customer value
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Number of days since the client was last active
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Label users inactive over 14 days as "Expired", otherwise "Active"
    df_final['user_status'] = np.where(df_final['last_active'] > 14, 'Expired',
                                       'Active')

    # Date of calculation, set to the max activity date
    df_final['calc_date'] = max_date.date()

    # Rename columns; this relies on the column order built up above.
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non returning customers from having 100% alive probability
    df_final.loc[df_final['frequency'] == 0, 'alive_probability'] = 0.0

    return df_final
Ejemplo n.º 25
0
customer’s purchases divided by the total number of purchases. Note that the denominator here is different than 
the frequency described above. 
"""
# Aggregate the raw transactions in `df` into one summary row per customer.
data = summary_data_from_transaction_data(
    df,
    customer_id,
    date_col,
    monetary_value_col='Sales',
)
# observation_period_end is not passed (e.g. '2011-12-9'); per the original
# note, the default period end is the date of the last transaction.

### Basic Frequency/Recency analysis using the BG/NBD model ###
"""
BG/NBD is an attractive alternative to the Pareto/NBD, which costs less computation and yields similar results.
"""
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)
# For small sample sizes the parameters can get implausibly large; adding an L2 penalty to the likelihood
# controls how large they can become. This is done by passing a positive penalizer_coef when the model is
# created. In typical applications, penalizers on the order of 0.001 to 0.1 are effective.

# Model fit check: plot period transactions for the fitted model.
plot_period_transactions(bgf)  # Calibration

# Split the same transactions into a calibration and a holdout period.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    customer_id,
    date_col,
    calibration_period_end='2011-06-08',
    observation_period_end='2011-12-9')
Ejemplo n.º 26
0
def _report_adaboost_grid_search(X_fit, y_fit, X_eval, y_eval):
    """Grid-search AdaBoost on F1, then print its confusion matrix and report."""
    param_grid = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=4)],
        'learning_rate': [x for x in np.linspace(start=0.1, stop=1, num=4)],
    }
    clf = GridSearchCV(AdaBoostClassifier(), param_grid, verbose=False,
                       n_jobs=3, scoring='f1').fit(X_fit, y_fit)
    y_pred = clf.predict(X_eval)
    print(confusion_matrix(y_eval, y_pred))
    print(classification_report(y_eval, y_pred))


def _report_bgnbd_thresholds(X_fit, X_eval, y_eval, thresholds):
    """Fit BG/NBD on weekly recency/T and print churn metrics for each threshold."""
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    # recency_true and T are divided by 7 before fitting and predicting.
    bgf.fit(X_fit['txn_total'], X_fit['recency_true'] / 7, X_fit['T'] / 7)

    t = 52
    expected_purchases = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, X_eval['txn_total'], X_eval['recency_true'] / 7, X_eval['T'] / 7)

    for threshold in thresholds:
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        # A customer is flagged as churned when fewer purchases than the
        # threshold are expected.
        churned = expected_purchases < threshold
        print('churn rate: ' + str(sum(churned) / len(churned)))
        print(confusion_matrix(y_eval, churned))
        print(classification_report(y_eval, churned))


def test_everything(X_train, y_train, X_test, y_test):
    '''
    Compare AdaBoost classifiers with BG/NBD for churn prediction.

    1) test whether Full AdaBoost model performs better than BG/NBD
    2) test whether AdaBoost model trained on same
        variables performs better
    3) test Adaboost splitted in 8 RFM groups
         vs AdaBoost at once vs AdaBoost at RFM
         (not implemented in this function)
    4) alternative test/train split: the test set is split again (67/33,
        random_state=42) and the three models are retrained on it

    Everything is reported through print(); nothing is returned.
    '''
    rfm_columns = ['txn_total', 'recency_true', 'T']

    #####################
    ##  FULL ADABOOST  ##
    #####################
    print_annotation('FULL ADABOOST')
    _report_adaboost_grid_search(X_train, y_train, X_test, y_test)

    ########################
    ##  PARTIAL ADABOOST  ##
    ########################
    print_annotation('PARTIAL ADABOOST')
    _report_adaboost_grid_search(X_train[rfm_columns], y_train,
                                 X_test[rfm_columns], y_test)

    ##################
    ###   BG/NBD   ###
    ##################
    print_annotation('BG/NBD')
    # Thresholds are rounded to 2 decimals here, but not in the alt split.
    _report_bgnbd_thresholds(
        X_train, X_test, y_test,
        [round(threshold, 2) for threshold in np.linspace(0.7, 1.8, 4)])

    #############################
    ###   ALTERNATIVE SPLIT   ###
    #############################
    print('_' * 25)
    print('_,-*-,' * 4)
    print('_' * 25)
    print_annotation('FULL ADABOOST alt split')

    X_train_alt, X_test_alt, y_train_alt, y_test_alt = \
        train_test_split(X_test, y_test, test_size=0.33, random_state=42)

    _report_adaboost_grid_search(X_train_alt, y_train_alt,
                                 X_test_alt, y_test_alt)

    print_annotation('PARTIAL ADABOOST alt split')
    _report_adaboost_grid_search(X_train_alt[rfm_columns], y_train_alt,
                                 X_test_alt[rfm_columns], y_test_alt)

    print_annotation('BD/NBD alt split')
    _report_bgnbd_thresholds(X_train_alt, X_test_alt, y_test_alt,
                             np.linspace(0.2, 2.5, 6))
Ejemplo n.º 27
0
import os
import pandas as pd
import pytest

import matplotlib
matplotlib.use('AGG')  # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

# Module-level model used by the plotting tests below: a BetaGeoFitter
# fitted once, at import time, on the CDNOW dataset loaded from
# lifetimes.datasets.
bgf = BetaGeoFitter()
cd_data = load_cdnow()
# Fit on the frequency / recency / T columns of the CDNOW summary data.
bgf.fit(cd_data['frequency'],
        cd_data['recency'],
        cd_data['T'],
        iterative_fitting=1)


@pytest.mark.plottest
class TestPlotting():
    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        """Plot period transactions for the module-level ``bgf`` model.

        Returns the current matplotlib figure; the test is decorated with
        ``mpl_image_compare(tolerance=30)``.
        """
        # Open a new figure before plotting.
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
Ejemplo n.º 28
0
def create_cltv_p(dataframe, today_date=None):
    """Build a per-customer CLTV prediction table from transaction rows.

    Transactions are aggregated per ``Customer ID``; a BG/NBD model
    (``BetaGeoFitter``) and a Gamma-Gamma model (``GammaGammaFitter``) are
    fitted on the aggregate and combined into a 6-month customer lifetime
    value. That value is rescaled to the 1-100 range and cut into three
    quantile-based segments labelled ``C`` (lowest), ``B`` and ``A``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction-level data. The columns ``Customer ID``, ``InvoiceDate``,
        ``Invoice`` and ``TotalPrice`` are read. ``InvoiceDate`` is assumed
        to hold datetimes (``.days`` is taken on date differences).
    today_date : datetime-like, optional
        Observation date used to compute the customer age ``T``. Defaults to
        ``dt.datetime(2011, 12, 11)``, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per retained customer with the columns ``recency_cltv_p``,
        ``T``, ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` and ``cltv_p_segment``.
    """
    if today_date is None:
        # Historical default, kept so existing callers get identical results.
        today_date = dt.datetime(2011, 12, 11)

    # Recency is customer-specific: days between first and last purchase.
    # T is the customer's age: days between first purchase and today_date.
    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,
            lambda date: (today_date - date.min()).days,
        ],
        'Invoice': lambda num: num.nunique(),
        'TotalPrice': lambda total_price: total_price.sum(),
    })

    # Assigning flat names replaces the two-level header produced by agg(),
    # so no separate droplevel() is needed.
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Simplified monetary_avg: total spend divided by number of invoices.
    rfm['monetary_avg'] = rfm.pop('monetary') / rfm['frequency']

    # The BG/NBD model is fitted on weekly recency and weekly T.
    rfm['recency_weekly_cltv_p'] = rfm['recency_cltv_p'] / 7
    rfm['T_weekly'] = rfm['T'] / 7

    # Keep customers with a positive average spend and more than one
    # invoice. Take an explicit copy: assigning columns on a filtered slice
    # would otherwise raise pandas' SettingWithCopyWarning.
    keep = (rfm['monetary_avg'] > 0) & (rfm['frequency'] > 1)
    rfm = rfm.loc[keep].copy()

    # NOTE(review): the lifetimes docs define frequency as the number of
    # *repeat* purchases (total minus one); the raw invoice count is used
    # here - confirm this is intended.
    rfm['frequency'] = rfm['frequency'].astype(int)

    # BG/NBD: expected number of purchases.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'])

    # Time is expressed in weeks here, so 4 ~ one month and 12 ~ three.
    rfm['exp_sales_1_month'] = bgf.predict(4, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    rfm['exp_sales_3_month'] = bgf.predict(12, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])

    # Gamma-Gamma: expected average profit per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm['expected_average_profit'] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])

    # 6-month CLTV; freq="W" tells lifetimes that T is measured in weeks.
    rfm['cltv_p'] = ggf.customer_lifetime_value(bgf,
                                                rfm['frequency'],
                                                rfm['recency_weekly_cltv_p'],
                                                rfm['T_weekly'],
                                                rfm['monetary_avg'],
                                                time=6,
                                                freq="W",
                                                discount_rate=0.01)

    # Rescale the CLTV to the 1-100 range.
    scaler = MinMaxScaler(feature_range=(1, 100))
    rfm['cltv_p'] = scaler.fit_transform(rfm[['cltv_p']]).ravel()

    # Three quantile-based segments, lowest third labelled "C".
    rfm['cltv_p_segment'] = pd.qcut(rfm['cltv_p'], 3, labels=["C", "B", "A"])

    return rfm[[
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment"
    ]]
Ejemplo n.º 29
0
def create_cltv_p(dataframe, today_date=None):
    """Compute a 6-month CLTV prediction and segment for every customer.

    Steps: aggregate transactions per ``Customer ID``, fit a BG/NBD model
    (``BetaGeoFitter``) for expected purchases and a Gamma-Gamma model
    (``GammaGammaFitter``) for expected average profit, combine both into a
    customer lifetime value, min-max scale it to 1-100 and split it into
    three quantile-based segments (``C`` lowest, ``A`` highest).

    Args:
        dataframe: Transaction-level ``pandas.DataFrame``. The columns
            ``Customer ID``, ``InvoiceDate``, ``Invoice`` and ``TotalPrice``
            are read; ``InvoiceDate`` is assumed to be a datetime column.
        today_date: Observation date used for the customer age ``T``.
            ``None`` (the default) keeps the previously hard-coded
            ``dt.datetime(2011, 12, 11)``.

    Returns:
        ``pandas.DataFrame`` with the columns ``recency_cltv_p``, ``T``,
        ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` and ``cltv_p_segment``.
    """
    if today_date is None:
        # Same date the function always used; keeps old call sites stable.
        today_date = dt.datetime(2011, 12, 11)

    per_customer = dataframe.groupby("Customer ID")
    rfm = per_customer.agg({
        "InvoiceDate": [
            lambda date: (date.max() - date.min()).days,    # recency_cltv_p
            lambda date: (today_date - date.min()).days,    # T
        ],
        "Invoice": lambda num: num.nunique(),               # frequency
        "TotalPrice": lambda total_price: total_price.sum(),  # monetary
    })
    # Direct assignment flattens the MultiIndex header; droplevel() first
    # would be redundant.
    rfm.columns = ["recency_cltv_p", "T", "frequency", "monetary"]

    # The Gamma-Gamma model works on the average value per transaction.
    rfm["monetary"] = rfm["monetary"] / rfm["frequency"]
    rfm = rfm.rename(columns={"monetary": "monetary_avg"})

    # Weekly recency and weekly T for the BG/NBD model.
    rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Monetary average must be positive and the customer needs more than
    # one invoice. The explicit copy() avoids SettingWithCopyWarning when
    # columns are assigned on the filtered frame below.
    valid = (rfm["monetary_avg"] > 0) & (rfm["frequency"] > 1)
    rfm = rfm[valid].copy()

    # NOTE(review): lifetimes documents frequency as repeat purchases
    # (total purchases minus one), whereas this is the raw invoice count -
    # verify that the offset is intentional.
    rfm["frequency"] = rfm["frequency"].astype(int)

    # BG/NBD model: expected number of transactions.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm["frequency"],
            rfm["recency_weekly_cltv_p"],
            rfm["T_weekly"])

    # Horizons are in weeks because the model was fitted on weekly data.
    for column, weeks in (("exp_sales_1_month", 4), ("exp_sales_3_month", 12)):
        rfm[column] = bgf.predict(weeks,
                                  rfm["frequency"],
                                  rfm["recency_weekly_cltv_p"],
                                  rfm["T_weekly"])

    # Gamma-Gamma model: expected average profit.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm["frequency"], rfm["monetary_avg"])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm["frequency"], rfm["monetary_avg"])

    # CLTV prediction for 6 months; freq="W" declares T as weekly.
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm["frequency"],
                                       rfm["recency_weekly_cltv_p"],
                                       rfm["T_weekly"],
                                       rfm["monetary_avg"],
                                       time=6,
                                       freq="W",
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv

    # Min-max scale the CLTV to the 1-100 range in a single step.
    scaler = MinMaxScaler(feature_range=(1, 100))
    rfm["cltv_p"] = scaler.fit_transform(rfm[["cltv_p"]]).ravel()

    # Quantile-based segments: bottom third "C", top third "A".
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    output_columns = [
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment",
    ]
    return rfm[output_columns]
Ejemplo n.º 30
0
def create_cltv_p(dataframe, today_date=None):
    """Return per-customer CLTV predictions, scaled scores and segments.

    The transaction rows are grouped by ``Customer ID`` into recency, age
    (``T``), frequency and average monetary value. A ``BetaGeoFitter`` and a
    ``GammaGammaFitter`` are fitted on that summary and used to derive
    expected sales for 4 and 12 weeks, the expected average profit and a
    6-month customer lifetime value, which is then min-max scaled to 1-100
    and binned into three quantile segments (``C`` < ``B`` < ``A``).

    :param dataframe: transaction-level ``pandas.DataFrame``; the columns
        ``Customer ID``, ``InvoiceDate``, ``Invoice`` and ``TotalPrice`` are
        read. ``InvoiceDate`` is assumed to contain datetimes.
    :param today_date: observation date for the customer age ``T``; when
        omitted, the former hard-coded ``dt.datetime(2011, 12, 11)`` is used.
    :return: ``pandas.DataFrame`` with the columns ``recency_cltv_p``, ``T``,
        ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` and ``cltv_p_segment``.
    """
    if today_date is None:
        # Preserve the original behaviour for callers passing one argument.
        today_date = dt.datetime(2011, 12, 11)

    aggregations = {
        'InvoiceDate': [
            # Days between the customer's first and last purchase.
            lambda date: (date.max() - date.min()).days,
            # Days between the first purchase and the observation date.
            lambda date: (today_date - date.min()).days,
        ],
        'Invoice': lambda num: num.nunique(),
        'TotalPrice': lambda price: price.sum(),
    }
    rfm = dataframe.groupby('Customer ID').agg(aggregations)

    # Flat names overwrite the two-level header, so droplevel() is skipped.
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Average spend per invoice replaces the total spend.
    rfm['monetary'] = rfm['monetary'] / rfm['frequency']
    rfm = rfm.rename(columns={'monetary': 'monetary_avg'})

    # Day counts converted to weeks for the BG/NBD model.
    rfm['recency_weekly_cltv_p'] = rfm['recency_cltv_p'] / 7
    rfm['T_weekly'] = rfm['T'] / 7

    # Filter once and copy, so the column assignments below operate on an
    # independent frame instead of triggering SettingWithCopyWarning.
    has_spend = rfm['monetary_avg'] > 0
    is_repeat = rfm['frequency'] > 1
    rfm = rfm.loc[has_spend & is_repeat].copy()

    # NOTE(review): per the lifetimes docs, frequency should count repeat
    # purchases (total minus one); here it is the plain invoice count -
    # confirm with the model owner.
    rfm['frequency'] = rfm['frequency'].astype(int)

    frequency = rfm['frequency']
    recency_weekly = rfm['recency_weekly_cltv_p']
    age_weekly = rfm['T_weekly']
    monetary_avg = rfm['monetary_avg']

    # BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(frequency, recency_weekly, age_weekly)

    # The prediction horizon is given in weeks (the unit of T_weekly).
    rfm['exp_sales_1_month'] = bgf.predict(4, frequency, recency_weekly,
                                           age_weekly)
    rfm['exp_sales_3_month'] = bgf.predict(12, frequency, recency_weekly,
                                           age_weekly)

    # Gamma-Gamma
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(frequency, monetary_avg)
    rfm['expected_average_profit'] = ggf.conditional_expected_average_profit(
        frequency, monetary_avg)

    # 6-month lifetime value; freq='W' marks T as being measured in weeks.
    rfm['cltv_p'] = ggf.customer_lifetime_value(bgf,
                                                frequency,
                                                recency_weekly,
                                                age_weekly,
                                                monetary_avg,
                                                time=6,
                                                freq='W',
                                                discount_rate=0.01)

    # Fit and apply the 1-100 scaling in one call.
    scaler = MinMaxScaler(feature_range=(1, 100))
    rfm['cltv_p'] = scaler.fit_transform(rfm[['cltv_p']]).ravel()

    # Terciles of the scaled score; "C" is the lowest third.
    rfm['cltv_p_segment'] = pd.qcut(rfm['cltv_p'], 3, labels=["C", "B", "A"])

    return rfm[[
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment"
    ]]