import pandas
from dateutil.parser import parse
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data


def calc_clv(clv_recs, end, months=12):
    df = pandas.DataFrame(clv_recs)
    df = df[['player_id', 'start_date', 'theo_win']]
    df['theo_win'] = df['theo_win'].astype(float)
    
    end_date = parse(end)
    summary = summary_data_from_transaction_data(df, 
                                                 'player_id', 
                                                 'start_date', 
                                                 monetary_value_col='theo_win', 
                                                 observation_period_end=end_date)
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])
    
    # Fit the Gamma-Gamma model on returning customers only (frequency > 0);
    # zero-frequency customers have zero monetary value, which breaks the fit.
    returning = summary[summary['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning['frequency'], returning['monetary_value'])
    
    ggf_clv = ggf.customer_lifetime_value(
        bgf, #the model to use to predict the number of future transactions
        summary['frequency'],
        summary['recency'],
        summary['T'],
        summary['monetary_value'],
        time=months, 
        discount_rate=0.0
    )
    clv_df = pandas.DataFrame(ggf_clv)
    clv_df = clv_df.dropna()
    # Floor negative CLV estimates at zero (set only the 'clv' column, not the whole row).
    clv_df.loc[clv_df['clv'] < 0, 'clv'] = 0.0
    summary=summary.merge(clv_df, left_index=True, right_index=True, how='inner')

    return summary
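
# A hypothetical usage sketch (assumed file and record layout, not from the
# source): calc_clv expects an iterable of transaction dicts keyed by
# player_id, start_date and theo_win, plus an observation-period-end date.
import json

with open('clv_records.json') as fh:  # assumed input file
    records = json.load(fh)
clv_summary = calc_clv(records, end='2020-12-31', months=12)
print(clv_summary[['frequency', 'recency', 'monetary_value', 'clv']].head())
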
def summaryOutput(self, discount_rate=0.12, months=12):
    '''
    Fit a beta-geometric (BG/NBD) model to predict purchase counts, and use the
    Gamma-Gamma model to calculate expected profit per customer.
    Write CLV and profits to csv, and print averages to screen.
    '''
    beta_model = BetaGeoFitter()
    # Calculate average transaction value.
    self.summary_monetary['avg_transaction_value'] = self.ggf.conditional_expected_average_profit(
        self.summary_monetary['frequency'],
        self.summary_monetary['monetary_value'])
    # Fit the BG/NBD model.
    beta_model.fit(self.summary_monetary['frequency'],
                   self.summary_monetary['recency'],
                   self.summary_monetary['T'])
    # Convert the annual discount rate to a monthly rate: 0.12 / 12 = 0.01 per
    # month (~12.7% effective annually). The original divided by months and by
    # 30 as well, which produced an implausibly small rate.
    disc_rate = discount_rate / 12
    self.summary_monetary['clv'] = self.ggf.customer_lifetime_value(
        beta_model,  # the model used to predict the number of future transactions
        self.summary_monetary['frequency'],
        self.summary_monetary['recency'],
        self.summary_monetary['T'],
        self.summary_monetary['monetary_value'],
        time=months,  # months
        discount_rate=disc_rate  # monthly discount rate
    )
    # Write per-customer calculations to csv.
    self.summary_monetary.to_csv("CLV_AVG_transactionValue_perCustomer.csv", index=False)
    # Print summary stats.
    print("Expected conditional average profit: {}, Average profit: {}".format(
        self.ggf.conditional_expected_average_profit(
            self.summary_monetary['frequency'],
            self.summary_monetary['monetary_value']).mean(),
        self.summary_monetary[self.summary_monetary['frequency'] > 0]['monetary_value'].mean()))
Example 3
def train_metric(d, metric, plot=True, penalty=0):
    frequency = metric + "_frequency"
    recency = metric + "_recency"
    T = metric + "_T"
    # Keep repeat customers, and copy so the original frame is not mutated.
    train = d[(d[frequency] > 0) & (d[recency] >= 0)].copy()
    train[frequency] = train[frequency] - 1

    bgf = BetaGeoFitter(penalizer_coef=penalty)
    bgf.fit(train[frequency], train[recency], train[T])
    n = bgf.data.shape[0]
    simulated_data = bgf.generate_new_data(size=n)

    model_counts = pd.DataFrame(
        bgf.data["frequency"].value_counts().sort_index().iloc[:28])
    simulated_counts = pd.DataFrame(
        simulated_data["frequency"].value_counts().sort_index().iloc[:28])
    combined_counts = model_counts.merge(simulated_counts,
                                         how="outer",
                                         left_index=True,
                                         right_index=True).fillna(0)
    combined_counts.columns = ["Actual", "Model"]
    if plot:
        combined_counts.plot.bar()
        display()  # notebook display helper (e.g. Databricks)
    return combined_counts, bgf
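
# A hypothetical usage sketch (assumed column layout, not from the source):
# train_metric expects per-customer columns named <metric>_frequency,
# <metric>_recency and <metric>_T, e.g. for a "deposit" metric.
import pandas as pd

customers = pd.read_csv('customer_metrics.csv')  # assumed input file
combined, deposit_bgf = train_metric(customers, 'deposit', plot=False, penalty=0.01)
print(combined.head())
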
def single_customer_evaluation(time_units=243):
    """
    Predicts Number of Purchases of a randomly chosen customer from the dataset.
    (conditional_expected_number_of_purchases_up_to_time)

    Parameters
    ----------
        time_units: int, default=243.
            Number of days for prediction.

    Returns
    -------
        (frequency_predicted, frequency_holdout)
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Randomly sample single customer.
    individual = summary_cal_holdout.sample()
    frequency_prediction = cal_bg_nbd.predict(
        t=time_units,
        frequency=individual["frequency_cal"],
        recency=individual["recency_cal"],
        T=individual["T_cal"])
    frequency_holdout = individual["frequency_holdout"]

    return frequency_prediction, frequency_holdout
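
# A hypothetical usage sketch: compare one random customer's predicted and
# observed holdout purchases (paths follow the example's own layout).
predicted, observed = single_customer_evaluation(time_units=243)
print(float(predicted.iloc[0]), float(observed.iloc[0]))
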
Example 5
    def test_plot_incremental_transactions(self):
        """Test plotting incremental transactions with CDNOW example."""
        transactions = load_dataset('CDNOW_sample.txt', header=None, sep=r'\s+')
        transactions.columns = [
            'id_total', 'id_sample', 'date', 'num_cd_purc', 'total_value'
        ]
        t = 39
        freq = 'W'

        transactions_summary = utils.summary_data_from_transaction_data(
            transactions,
            'id_sample',
            'date',
            datetime_format='%Y%m%d',
            observation_period_end='19970930',
            freq=freq)

        bgf = BetaGeoFitter(penalizer_coef=0.01)
        bgf.fit(transactions_summary['frequency'],
                transactions_summary['recency'], transactions_summary['T'])

        plt.figure()
        plotting.plot_incremental_transactions(bgf,
                                               transactions,
                                               'date',
                                               'id_sample',
                                               2 * t,
                                               t,
                                               freq=freq,
                                               xlabel='week',
                                               datetime_format='%Y%m%d')
        return plt.gcf()
Example 6
def bgf(cd_data):
    bgf_model = BetaGeoFitter()
    bgf_model.fit(cd_data['frequency'],
                  cd_data['recency'],
                  cd_data['T'],
                  iterative_fitting=1)
    return bgf_model
Example 7
def fitted_bg(example_summary_data):
    bg = BetaGeoFitter()
    bg.fit(example_summary_data['frequency'],
           example_summary_data['recency'],
           example_summary_data['T'],
           iterative_fitting=0)
    return bg
def readBetaGeoFitterModel():

    betaGeoFitterModel = BetaGeoFitter()

    betaGeoFitterModel.load_model("BetaGeoFitterModel.pkl")

    return betaGeoFitterModel
def create_cltv_pred(dataframe, w=4, m=1):
    """
    Gamagama and BGNBD model and prediction
    Parameters
    ----------
    dataframe
    w: int, week information for BGNBD model
    m: int, month information for gamama model

    Returns
    Dataframe
    -------

    """
    # BGNBD

    dataframe = dataframe[dataframe["monetary_avg"] > 0]
    dataframe["frequency"] = dataframe["frequency"].astype(int)

    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(dataframe['frequency'], dataframe['recency_weekly'],
            dataframe['T_weekly'])

    dataframe[f'exp_sales_{w}_week'] = bgf.predict(w, dataframe['frequency'],
                                                   dataframe['recency_weekly'],
                                                   dataframe['T_weekly'])

    # Gamma-Gamma - expected_average_profit
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(dataframe['frequency'], dataframe['monetary_avg'])
    dataframe[
        "expected_average_profit"] = ggf.conditional_expected_average_profit(
            dataframe['frequency'], dataframe['monetary_avg'])

    # CLTV Prediction
    cltv = ggf.customer_lifetime_value(bgf,
                                       dataframe['frequency'],
                                       dataframe['recency_weekly'],
                                       dataframe['T_weekly'],
                                       dataframe['monetary_avg'],
                                       time=m,
                                       freq="W",
                                       discount_rate=0.01)

    dataframe[f'cltv_p_{m}_month'] = cltv

    scaler = MinMaxScaler(feature_range=(1, 100))
    dataframe['cltv_p_score'] = scaler.fit_transform(
        dataframe[[f'cltv_p_{m}_month']])

    # cltv_p Segment
    dataframe['cltv_p_segment'] = pd.qcut(dataframe['cltv_p_score'],
                                          3,
                                          labels=['C', 'B', 'A'])

    new_col = dataframe.columns[~dataframe.columns.
                                isin(['recency', 'frequency', 'monetary'])]
    dataframe = dataframe[new_col]

    return dataframe
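
# A hypothetical usage sketch (assumed pre-aggregated input, not from the
# source): the frame must already carry frequency, monetary_avg,
# recency_weekly and T_weekly columns.
import pandas as pd

rfm = pd.read_csv('rfm_weekly.csv')  # assumed RFM file
cltv_df = create_cltv_pred(rfm, w=4, m=1)
print(cltv_df[['exp_sales_4_week', 'cltv_p_1_month', 'cltv_p_segment']].head())
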
Example 10
def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Test set_index as date for cumulative transactions and bgf fitter.

    Get first 14 cdnow transactions dates and validate that date index,
    freq_multiplier = 1 working and compare with tested data for last 4 records.

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    """
    datetime_col = "date"
    customer_id_col = "id_sample"
    t = 14
    datetime_format = "%Y%m%d"
    freq = "D"
    observation_period_end = "19970930"
    freq_multiplier = 1

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        customer_id_col,
        datetime_col,
        datetime_format=datetime_format,
        freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end,
    )

    transactions_summary = transactions_summary.reset_index()

    model = BetaGeoFitter()
    model.fit(transactions_summary["frequency"],
              transactions_summary["recency"], transactions_summary["T"])

    df_cum = utils.expected_cumulative_transactions(
        model,
        cdnow_transactions,
        datetime_col,
        customer_id_col,
        t,
        datetime_format,
        freq,
        set_index_date=True,
        freq_multiplier=freq_multiplier,
    )

    dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"]
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    date_index = df_cum.iloc[-4:].index.to_timestamp().astype(str)
    actual = df_cum["actual"].iloc[-4:].values
    predicted = df_cum["predicted"].iloc[-4:].values.round(2)

    assert all(dates == date_index)
    assert_allclose(actual, actual_trans)
    assert_allclose(predicted, expected_trans, atol=1e-2)
Example 11
def bgf_transactions(cdnow_transactions):
    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, 'id_sample', 'date', datetime_format='%Y%m%d',
        observation_period_end='19970930', freq='W')

    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(transactions_summary['frequency'],
            transactions_summary['recency'], transactions_summary['T'])
    return bgf
Example 12
def fitted_bg(example_summary_data):
    bg = BetaGeoFitter()
    bg.fit(
        example_summary_data["frequency"],
        example_summary_data["recency"],
        example_summary_data["T"],
        iterative_fitting=2,
        tol=1e-6,
    )
    return bg
Example 13
def bgnbd_model(summary):
    """Instantiate and fit a BG/NBD model.

  Args:
    summary: RFM transaction data
  Returns:
    bgnbd model fit to the data
  """
    bgf = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])
    return bgf
Example 14
def rfm_model(data, end_date, f, p):
    rfm1 = lifetimes.utils.summary_data_from_transaction_data(
        data,
        'customer_id',
        'date',
        monetary_value_col='amount',
        observation_period_end=end_date,
        freq=f)
    rfm1 = rfm1[rfm1.monetary_value < 600]
    bgf = BetaGeoFitter(penalizer_coef=p)
    bgf.fit(rfm1['frequency'], rfm1['recency'], rfm1['T'])
    return rfm1, bgf
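
# A hypothetical usage sketch (assumed columns customer_id, date and amount,
# not from the source):
import pandas as pd

tx = pd.read_csv('transactions.csv', parse_dates=['date'])  # assumed file
rfm1, bgf = rfm_model(tx, end_date=tx['date'].max(), f='W', p=0.01)
print(bgf.summary)
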
def evaluation_plots(plot_type):
    """
    Evaluation Plots:
    - Tracking Cumulative Transactions
    - Tracking Daily Transactions
    - Frequency of Repeated Transactions
    - Calibration vs Holdout.

    Parameters
    ----------
        plot_type: str.
            "tracking" - Tracking Cumulative and Tracking Daily Transactions.
            "repeated" - Frequency of Repeated Transactions.
            "calibration_holdout" - Calibration vs Holdout Purchases.
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Loading Transactions.
    transactions = pd.read_csv("datasets/transactions.csv")

    if plot_type == "tracking":
        fig = plt.figure(figsize=(20, 4))
        plot_cumulative_transactions(model=cal_bg_nbd,
                                     transactions=transactions,
                                     datetime_col="order_purchase_timestamp",
                                     customer_id_col="customer_unique_id",
                                     t=604,
                                     t_cal=512,
                                     freq="D",
                                     ax=fig.add_subplot(121))

        plot_incremental_transactions(model=cal_bg_nbd,
                                      transactions=transactions,
                                      datetime_col="order_purchase_timestamp",
                                      customer_id_col="customer_unique_id",
                                      t=604,
                                      t_cal=512,
                                      freq="D",
                                      ax=fig.add_subplot(122))
    elif plot_type == "repeated":
        plot_period_transactions(model=cal_bg_nbd)

    elif plot_type == "calibration_holdout":
        plot_calibration_purchases_vs_holdout_purchases(
            model=cal_bg_nbd, calibration_holdout_matrix=summary_cal_holdout)
    return
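
# A hypothetical usage sketch: each call draws one evaluation view from the
# saved calibration model and datasets referenced above.
evaluation_plots("tracking")
evaluation_plots("repeated")
evaluation_plots("calibration_holdout")
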
Example 16
def upload():
    if request.method == 'POST':
        f = request.files['file']

        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads',
                                 secure_filename(f.filename))
        f.save(file_path)
        df = pd.read_csv(file_path)

        df['salesDate'] = pd.to_datetime(df['salesDate'])

        cols_of_interest = ['memberID', 'salesDate', 'sales']
        df = df[cols_of_interest]

        df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))

        max_date = df['salesDate'].max()
        min_date = max_date - relativedelta(months=+12)

        df = df.loc[(df['salesDate'] >= min_date)
                    & (df['salesDate'] <= max_date)]

        min_order = df['salesDate'].min()
        max_order = df['salesDate'].max()
        data = summary_data_from_transaction_data(
            df,
            'memberID',
            'salesDate',
            monetary_value_col='sales',
            observation_period_end=max_order)

        d2 = data.sort_values('frequency', ascending=False)

        bgf = BetaGeoFitter(penalizer_coef=0.0001)
        bgf.fit(data['frequency'], data['recency'], data['T'])

        t = 30
        data[
            'customer_livelihood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
                t, data['frequency'], data['recency'], data['T'])

        data.sort_values(by='customer_livelihood',
                         ascending=False,
                         inplace=True)

        return data.to_html()
    return None
Example 17
def trainBetaGeoFitterModel():

    summaryDataFromTransactionDataForCLV = readsummaryDataFromTransactionDataForCLV()

    #training model
    betaGeoFitterModel = BetaGeoFitter(penalizer_coef=0.0)

    betaGeoFitterModel.fit(summaryDataFromTransactionDataForCLV["frequency"],
                           summaryDataFromTransactionDataForCLV["recency"],
                           summaryDataFromTransactionDataForCLV["T"])

    #saving the model in pickle file
    saveBetaGeoFitterModel(betaGeoFitterModel)

    print(betaGeoFitterModel.summary)
Example 18
    def fit(self,months=96):
        """
        Computes CLV estimates for the next n months and stores results in self.results
        INPUT
            months (int) number of months to predict, default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter() # see lifetimes module documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'], self.data['T'])
        # 8 years = 96 months
        self.data['predicted_purchases'] = self.bgf.conditional_expected_number_of_purchases_up_to_time(
                months,
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        self.ggf = GammaGammaFitter(penalizer_coef = 0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])
        # predict next transaction
        self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
                frequency = self.data['frequency'],
                monetary_value = self.data['monetary_value'])
        
        ### ESTIMATE CLV
        self.data['clv_estimation'] = self.data['predicted_trans_profit'] * self.data['predicted_purchases']
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation',ascending=False)
        # store results
        self.results.to_csv(self.outfile2,index=False)
def probability_alive(historical_rfm_data):
    """
    Predicted Conditional Probability Alive.

    Parameters
    ----------
        historical_rfm_data: Historical Frequency, Recency & T of an individual

    Returns
    -------
        Conditional Probability Alive.
    """
    clv_model = BetaGeoFitter(penalizer_coef=0.0)
    clv_model.load_model(path="models/customer_lifetime_estimator.pkl")

    alive_probability = clv_model.conditional_probability_alive(
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])
    return alive_probability
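
# A hypothetical usage sketch (made-up RFM values): score a single customer.
import pandas as pd

one_customer = pd.DataFrame({'frequency': [4], 'recency': [120.0], 'T': [365.0]})
print(probability_alive(one_customer))
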
def root_mean_squared_error(time_units=243):
    """
    Calculates Root Mean Squared Error of all predictions.

    Parameters
    ----------
        time_units: int, default=243.
            Number of days for prediction.

    Yields
    ------
        summary_cal_holdout_preds.csv.

    Returns
    -------
        rmse
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
    frequency_holdout = summary_cal_holdout["frequency_holdout"].copy()

    # Predictions.
    frequency_predictions = cal_bg_nbd.predict(
        t=time_units,
        frequency=summary_cal_holdout["frequency_cal"],
        recency=summary_cal_holdout["recency_cal"],
        T=summary_cal_holdout["T_cal"])

    # Adding Predictions to Summary dataset.
    summary_cal_holdout["frequency_predictions"] = frequency_predictions.copy()
    file_path = Path.cwd() / "datasets/summary_cal_holdout_preds.csv"
    summary_cal_holdout.to_csv(file_path, index=False)

    rmse = mean_squared_error(frequency_holdout,
                              frequency_predictions,
                              squared=False)
    return rmse
def number_of_purchases(historical_rfm_data, time_units=30):
    """
    Predicted Conditional Expected Number of Purchases.

    Parameters
    ----------
        historical_rfm_data: Historical Frequency, Recency & T of an individual

        time_units: int, default=30.
            Number of days for predictions.
    Returns
    -------
        expected number of purchases.
    """
    clv_model = BetaGeoFitter(penalizer_coef=0.0)
    clv_model.load_model(path="models/customer_lifetime_estimator.pkl")
    frequency_predictions = clv_model.predict(
        t=time_units,
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])
    return frequency_predictions
Example 22
def estimate_clv_model(summary, model_penalizer=None):
  #set default values if they are not stated
  if model_penalizer is None:
    model_penalizer = 0

  # Building the Model using BG/NBD
  bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
  bgf.fit(summary['frequency'], summary['recency'], summary['T'])

  # There cannot be non-positive values in the monetary_value or frequency vector
  summary_with_value_and_returns = summary[(summary['monetary_value']>0) & (summary['frequency']>0)]
  # Setting up Gamma Gamma model
  ggf = GammaGammaFitter(penalizer_coef = 0)
  ggf.fit(summary_with_value_and_returns['frequency'], summary_with_value_and_returns['monetary_value']) 

  # Refit the BG/NBD model on the filtered data when none of frequency, recency or T is a zero-length vector
  if not any(len(x) == 0 for x in [summary_with_value_and_returns['recency'], summary_with_value_and_returns['frequency'], summary_with_value_and_returns['T']]):
    bgf.fit(summary_with_value_and_returns['frequency'],summary_with_value_and_returns['recency'],summary_with_value_and_returns['T'])

  return [bgf, ggf]
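
# A hypothetical usage sketch (assumed summary file, not from the source):
# the frame must carry frequency, recency, T and monetary_value columns.
import pandas as pd

summary = pd.read_csv('rfm_summary.csv')  # assumed RFM summary file
bgf_model, ggf_model = estimate_clv_model(summary, model_penalizer=0.01)
print(bgf_model.summary, ggf_model.summary)
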
def _calibration_model():
    """
    Trains BG/NBD Calibration Model.

    Yields
    ------
        calibration_model.pkl
    """
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Training Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.fit(frequency=summary_cal_holdout["frequency_cal"],
                   recency=summary_cal_holdout["recency_cal"],
                   T=summary_cal_holdout["T_cal"],
                   verbose=True)

    # Saving Model.
    file_path = Path.cwd() / "models/calibration_model.pkl"
    cal_bg_nbd.save_model(path=file_path)
    return
def _clv_model():
    """
    Trains BG/NBD Model on entire RFM data, final fit.

    Yields
    ------
        customer_lifetime_estimator.pkl
    """
    summary = pd.read_csv("datasets/summary.csv")

    # Training the final CLV model on the full RFM summary.
    clv = BetaGeoFitter(penalizer_coef=0.0)
    clv.fit(frequency=summary["frequency"],
            recency=summary["recency"],
            T=summary["T"],
            verbose=True)

    # Saving Model.
    file_path = Path.cwd() / "models/customer_lifetime_estimator.pkl"
    clv.save_model(path=file_path)
    return
Example 25
customer’s purchases divided by the total number of purchases. Note that the denominator here is different from
the frequency described above.
"""
data = summary_data_from_transaction_data(
    df,
    customer_id,
    date_col,
    monetary_value_col='Sales',
)
# observation_period_end='2011-12-9') # default period end date is the date when the last transaction happened

### Basic Frequency/Recency analysis using the BG/NBD model ###
"""
BG/NBD is an attractive alternative to the Pareto/NBD, which costs less computation and yields similar results.
"""
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)
# For small sample sizes, the parameters can get implausibly large, so by adding an L2 penalty to the likelihood
# we can control how large these parameters can be. This is implemented by setting a positive penalizer_coef in the
# initialization of the model. In typical applications, penalizers on the order of 0.001 to 0.1 are effective.
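
# An illustrative sketch of the penalizer advice above (added here, not part
# of the original script): refit with a small L2 penalty and compare parameters.
bgf_penalized = BetaGeoFitter(penalizer_coef=0.01)
bgf_penalized.fit(data['frequency'], data['recency'], data['T'])
print(bgf_penalized.summary)  # r, alpha, a, b shrink toward zero as the penalty grows
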

# Model fit
plot_period_transactions(bgf)  # Calibration

summary_cal_holdout = calibration_and_holdout_data(
    df,
    customer_id,
    date_col,
    calibration_period_end='2011-06-08',
    observation_period_end='2011-12-9')
def test_everything(X_train, y_train, X_test, y_test):
    '''
    1) test whether the full AdaBoost model performs better than BG/NBD
    2) test whether an AdaBoost model trained on the same
        variables performs better
    3) test AdaBoost split into 8 RFM groups
         vs AdaBoost all at once vs AdaBoost at RFM
    4) alternative test/train split
    '''
    #####################
    ##  FULL ADABOOST  ##
    #####################
    print_annotation('FULL ADABOOST')
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]

    random_grid = {'n_estimators': n_estimators,
                'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False,
                    n_jobs=3, scoring='f1').fit(X_train, y_train)

    # print(clf.best_params_)
    y_pred_full_ada = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred_full_ada))
    print(classification_report(y_test, y_pred_full_ada))


    ########################
    ##  PARTIAL ADABOOST  ##
    ########################
    print_annotation('PARTIAL ADABOOST')
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]

    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False,
                       n_jobs=3, scoring='f1') \
                       .fit(X_train[['txn_total', 'recency_true', 'T']], y_train)
    y_pred_part_ada = clf.predict(X_test[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test, y_pred_part_ada))
    print(classification_report(y_test, y_pred_part_ada))


    ##################
    ###   BG/NBD   ###
    ##################
    print_annotation('BG/NBD')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(X_train['txn_total'], X_train['recency_true'] / 7,
        X_train['T'] / 7)
    
    t = 52
    y_pred_bgnbd = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
                t, X_test['txn_total'], 
                X_test['recency_true'] / 7,
                X_test['T'] / 7
            )
    for threshold in np.linspace(0.7, 1.8, 4):
        threshold = round(threshold, 2)
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf = y_pred_bgnbd < threshold
        print('churn rate: ' + str(sum(y_pred_bgnbd_tf) / len(y_pred_bgnbd_tf)))
        print(confusion_matrix(y_test, y_pred_bgnbd_tf))
        print(classification_report(y_test, y_pred_bgnbd_tf))


    #############################
    ###   ALTERNATIVE SPLIT   ###
    #############################
    print('_' * 25)
    print('_,-*-,' * 4)
    print('_' * 25)
    print_annotation('FULL ADABOOST alt split')

    X_train_alt, X_test_alt, y_train_alt, y_test_alt = \
        train_test_split(X_test, y_test, test_size=0.33, random_state=42)
    
    
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]

    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False,
                       n_jobs=3, scoring='f1').fit(X_train_alt, y_train_alt)

    # print(clf.best_params_)
    y_pred_ada_alt = clf.predict(X_test_alt)
    print(confusion_matrix(y_test_alt, y_pred_ada_alt))
    print(classification_report(y_test_alt, y_pred_ada_alt))
    ######################################

    print_annotation('PARTIAL ADABOOST alt split')
    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]

    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}
    clf = GridSearchCV(ada, random_grid, verbose=False,
                       n_jobs=3, scoring='f1') \
        .fit(X_train_alt[['txn_total', 'recency_true', 'T']], y_train_alt)
    y_pred_part_ada_alt = clf.predict(
        X_test_alt[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test_alt, y_pred_part_ada_alt))
    print(classification_report(y_test_alt, y_pred_part_ada_alt))


    ######################################
    print_annotation('BG/NBD alt split')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(X_train_alt['txn_total'], X_train_alt['recency_true'] / 7,
            X_train_alt['T'] / 7)

    t = 52
    y_pred_bgnbd_ALT = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
            t, X_test_alt['txn_total'],
            X_test_alt['recency_true'] / 7,
            X_test_alt['T'] / 7
        )
    for threshold in np.linspace(0.2, 2.5, 6):
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf_alt = y_pred_bgnbd_ALT < threshold
        print('churn rate: ' + str(sum(y_pred_bgnbd_tf_alt) / len(y_pred_bgnbd_tf_alt)))
        print(confusion_matrix(y_test_alt, y_pred_bgnbd_tf_alt))
        print(classification_report(y_test_alt, y_pred_bgnbd_tf_alt))
yr_pred = clf.predict(X_test)
print(confusion_matrix(y_test, yr_pred))
print(classification_report(y_test, yr_pred))

X_test['churn'] = y_test2
X_test['pred_8m'] = y_pred
X_test['pred_1m'] = yr_pred

#%%
X_test.to_csv('matrix.csv')

#%%
from lifetimes import BetaGeoFitter

# similar API to scikit-learn and lifelines.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(X_train['txn_total'], X_train['recency_true']/7,
        X_train['T']/7)
print(bgf)

%matplotlib inline
from lifetimes.plotting import plot_frequency_recency_matrix

plot_frequency_recency_matrix(bgf)

#%%
from lifetimes.plotting import plot_probability_alive_matrix

f=plot_probability_alive_matrix(bgf)

t=52
Example 28
# removal of test records and negative value
df.drop(df[df["RECENCY"] > df["T"]].index, inplace=True)
#df.drop(df[df["MONETARY_VALUE"] <= 10.00].index, inplace = True)

# ==========================================================================
# Data check
# ==========================================================================
# Order distribution by frequency
df["FREQUENCY"].plot(kind="hist", bins=50)

# ==========================================================================
# BG/NBD model
# ==========================================================================

bgf = BetaGeoFitter(penalizer_coef=0.01)
bgf.fit(df["FREQUENCY"], df["RECENCY"], df["T"])

bgf.summary

plotting.plot_frequency_recency_matrix(bgf)
plotting.plot_probability_alive_matrix(bgf)

# Repeat transaction model check
plotting.plot_period_transactions(bgf)

# ==========================================================================
# Ranking reps from best to worst
# ==========================================================================

t = 1
Example 29
def load_data_and_model():
    """Loads Customer Lifetime Estimator Model"""
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model("../models/calibration_model.pkl")
    summary_cal_holdout = pd.read_csv("../datasets/summary_cal_holdout.csv")
    return model, summary_cal_holdout
Example 30
import lifetimes
from lifetimes import BetaGeoFitter
from lifetimes.plotting import plot_frequency_recency_matrix
from lifetimes.plotting import plot_probability_alive_matrix

import pandas as pd

data = pd.read_csv('lifetimes')


bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])

print(bgf)

plot_frequency_recency_matrix(bgf)

#plot_probability_alive_matrix(bgf)
Example 31
import os
import pandas as pd
import pytest

import matplotlib

matplotlib.use("AGG")  # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

bgf = BetaGeoFitter()
cd_data = load_cdnow()
bgf.fit(cd_data["frequency"], cd_data["recency"], cd_data["T"], iterative_fitting=0)


@pytest.mark.plottest
class TestPlotting:
    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
        pnbd = ParetoNBDFitter()
        pnbd.fit(cd_data["frequency"], cd_data["recency"], cd_data["T"], iterative_fitting=0)
Example 32
class CLV(object):
    """
    INPUT
        pmg_num (int) the product market group number, default = 1
        outfile1 (str) the filename indicating where to store the raw data before analysis, default = '../data/clvtrainingset01.csv'
        outfile2 (str) the filename containing the results, default = '../data/clv01.csv'
        date_range (list) the start date and end date of the years to analyze, default = ['2008-09-01','2016-09-01']
    attributes other than those listed above
        self.data (DataFrame) a pandas DataFrame object of the data to be used for analysis
        self.bgf (from lifetimes) a statistical model object from the lifetimes package
        self.ggf (from lifetimes) a statistical model object from the lifetimes package
        self.results (DataFrame) a pandas DataFrame object of the results of analysis
    """
    def __init__(self,pmg_num=1,outfile1='../data/clvtrainingset01.csv',outfile2='../data/clv01.csv',date_range=['2008-09-01','2016-09-01']):
        self.pmg_num = pmg_num
        # outfile1 stores a clean version of the raw data used for analysis; this is important for reproducibility
        self.outfile1 = outfile1
        # outfile2 stores the clv estimation results
        self.outfile2 = outfile2
        self.date_range = date_range
        self.data = None
        self.bgf = None
        self.ggf = None
        self.results = None

    def get_data_from_server(self,cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data
        INPUT
            cmd (str) the default sql query is below

            The default query has been replaced. The original query was an 8 line select command.
        """
        # server name
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name) # used to access the database
        c = connection.cursor() # generate cursor object
        
        # Grab transaction data from Postgres
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (self.pmg_num,self.date_range[0],self.date_range[1])
        
        c.execute(cmd) # execute the sql command
        
        # list to store the query data
        transaction_data = []
        
        # create a dictionary to convert customer ids to name
        to_name = dict(np.genfromtxt('../data/names.csv',dtype=str,delimiter='\t'))
        
        for row in c:
            cust, rsv_date, sales = row # pull data from each row of the query data
            cust_id = str(int(cust))
            name = to_name[cust_id]
            # check to see if customer is inactive
            if use(name):
                rsv_date1_readable = rsv_date.strftime('%Y-%m-%d') # date formatting
                sales_float = float(sales) # convert to float; represents the transaction amount
                transaction_data.append({"id":cust, "date":rsv_date, "sales":sales_float}) # add dictionary of data to list
        
        # convert to dataframe
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])
        # store results
        df.to_csv(self.outfile1,index=False)
        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')

    def get_data_from_file(self,filename,**kwargs):
        df = pd.read_csv(filename,**kwargs)
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')

    def fit(self,months=96):
        """
        Computes CLV estimates for the next n months and stores results in self.results
        INPUT
            months (int) number of months to predict, default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter() # see lifetimes module documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'], self.data['T'])
        # 8 years = 96 months
        self.data['predicted_purchases'] = self.bgf.conditional_expected_number_of_purchases_up_to_time(
                months,
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        self.ggf = GammaGammaFitter(penalizer_coef = 0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])
        # predict next transaction
        self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
                frequency = self.data['frequency'],
                monetary_value = self.data['monetary_value'])
        
        ### ESTIMATE CLV
        self.data['clv_estimation'] = self.data['predicted_trans_profit'] * self.data['predicted_purchases']
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation',ascending=False)
        # store results
        self.results.to_csv(self.outfile2,index=False)

    def plot_matrices(self):
        """
        plots three matrices:
            probability alive matrix: displays the probability that a customer is active
            frequency recency matrix: displays frequency and recency with color corresponding
                                        to monetary value
            period transactions: displays predicted and actual transaction values over time
            (check documentation in lifetimes for more details)
        """
        plot_probability_alive_matrix(self.bgf,cmap='viridis')
        plot_frequency_recency_matrix(self.bgf,cmap='viridis')
        plot_period_transactions(self.bgf)
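
# A hypothetical usage sketch (assumed CSV with id, date and sales columns,
# matching what get_data_from_file passes to summary_data_from_transaction_data):
clv = CLV(pmg_num=1, date_range=['2008-09-01', '2016-09-01'])
clv.get_data_from_file('../data/clvtrainingset01.csv', parse_dates=['date'])
clv.fit(months=96)
clv.plot_matrices()
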
Example 33
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):

    #set default values if they are not stated
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Reformat csv as a Pandas dataframe
    #data = pd.read_csv(csv_file)

    #Remove non search sessions
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Use "summary_data_from_transaction_data" to aggregate the activity stream into the appropriate metrics.
    # The model requires an 'activity_date' column name.  For our purpose this is synonymous with submission_date.
    summary = summary_data_from_transaction_data(
        data,
        'client_id',
        'activity_date',
        'Revenue',
        observation_period_end=max_date)

    # Building the Model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Conditional expected purchases
    # These are the expected purchases expected from each individual given the time specified

    # t = days in to future
    t = 14
    summary[
        'predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, summary['frequency'], summary['recency'], summary['T'])

    #Conditional Alive Probability
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)
    #print summary['alive_prob']

    # There cannot be non-positive values in the monetary_value or frequency vector
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0)
                                             & (summary['frequency'] > 0)]

    # There cannot be zero length vectors in one of frequency, recency or T
    #summary_with_value_and_returns =
    #print summary_with_value_and_returns[
    #    (len(summary_with_value_and_returns['recency'])>0) &
    #    (len(summary_with_value_and_returns['frequency'])>0) &
    #    (len(summary_with_value_and_returns['T'])>0)
    #]

    if any(
            len(x) == 0 for x in [
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['T']
            ]):
        logger.debug(data['client_id'])

    # Setting up Gamma Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Output average profit per transaction by client ID
    ggf_output = ggf.conditional_expected_average_profit(
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the filtered data when none of frequency, recency or T is a zero-length vector
    if not any(len(x) == 0 for x in [
            summary_with_value_and_returns['recency'],
            summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['T']
    ]):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Getting Customer lifetime value using the Gamma Gamma output
    # NOTE: the time can be adjusted, but is currently set to 12 months

    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  #the model to use to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Converting to dataframe
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })

    # Setting client_id as index
    df_cpv = df_cpv.set_index('client_id')

    # Merge with original summary
    df_merged = pd.merge(summary,
                         df_cpv,
                         left_index=True,
                         right_index=True,
                         how='outer')

    # Historical CLV
    data_hist = data.groupby('client_id')[['Searches', 'Revenue']].apply(
        lambda x: x.astype(float).sum())

    # Merge with original summary
    df_final = pd.merge(df_merged,
                        data_hist,
                        left_index=True,
                        right_index=True,
                        how='outer')

    # Prevent NaN in the pred_values column for customers with no repeat purchases
    df_final.loc[df_final.frequency == 0, 'pred_values'] = 0.0

    # Create column that combines historical and predicted customer value
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Create column which calculates in days the number of days since they were last active
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Create a column which labels users inactive over 14 days as "Expired" ELSE "Active"
    df_final['user_status'] = np.where(df_final['last_active'] > 14, 'Expired',
                                       'Active')

    # Add column with date of calculation
    # Set calc_date to max submission date
    df_final['calc_date'] = max_date.date()  #pd.Timestamp('today').date()

    # Rename columns as appropriate
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non-returning customers from having 100% alive probability
    df_final.loc[df_final.frequency == 0, 'alive_probability'] = 0.0

    return df_final
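
# A hypothetical usage sketch (assumed activity-stream layout, not from the
# source): columns client_id, activity_date, Searches and Revenue.
import pandas as pd

activity = pd.read_csv('activity_stream.csv', parse_dates=['activity_date'])
clv_table = generate_clv_table(activity, clv_prediction_time=12, model_penalizer=0.01)
print(clv_table[['total_clv', 'user_status']].head())
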
Example 34
def fitted_bg(example_summary_data):
    bg = BetaGeoFitter()
    bg.fit(example_summary_data['frequency'], example_summary_data['recency'], example_summary_data['T'], iterative_fitting=0)
    return bg
Example 35
import os
import pandas as pd
import pytest

import matplotlib
matplotlib.use('AGG')  # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

bgf = BetaGeoFitter()
cd_data = load_cdnow()
bgf.fit(cd_data['frequency'],
        cd_data['recency'],
        cd_data['T'],
        iterative_fitting=1)


@pytest.mark.plottest
class TestPlotting():
    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
Example 36
    recency.rename(columns={"step": "recency"}, inplace=True)
    frequency.rename(columns={"step": "frequency"}, inplace=True)
    T.rename(columns={"step": "T"}, inplace=True)
    monetary.rename(columns={"amount": "monetary_value"}, inplace=True)

    df_rfm = pd.concat([recency, T, monetary, frequency], axis=1)
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(frequency=df_rfm["frequency"],
            monetary_value=df_rfm["monetary_value"])

    df_rfm["expected_monetary_value"] = df_rfm.apply(
        lambda row: ggf.conditional_expected_average_profit(
            row["frequency"], row["monetary_value"]),
        axis=1)

    bgf = BetaGeoFitter(penalizer_coef=1)
    bgf.fit(frequency=df_rfm["frequency"],
            recency=df_rfm["recency"],
            T=df_rfm["T"])

    df_rfm[
        "pred_nb_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t=180,
            frequency=df_rfm["frequency"],
            recency=df_rfm["recency"],
            T=df_rfm["T"])

    df_rfm["pred_revenue"] = df_rfm.apply(
        lambda row: row["pred_nb_purchases"] * row["expected_monetary_value"],
        axis=1)
Example 37
import os
import pandas as pd
import pytest

import matplotlib
matplotlib.use('AGG') # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

bgf = BetaGeoFitter()
cd_data = load_cdnow()
bgf.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'], iterative_fitting=1)

@pytest.mark.plottest
class TestPlotting():
    
    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
        pnbd = ParetoNBDFitter()
        pnbd.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'], iterative_fitting=1)