Example No. 1
def test_summary_data_from_transaction_data_returns_correct_results(transaction_level_data):
    today = '2015-02-07'
    actual = utils.summary_data_from_transaction_data(transaction_level_data, 'id', 'date', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 5., 6.],
                             [2, 0., 0., 37.],
                             [3, 2., 4., 37.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
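This test (and several below) depends on a transaction_level_data fixture that the listing does not show. A fixture consistent with the expected output (frequency, recency and T in days, observation period ending '2015-02-07') would look roughly like the sketch below, assuming the usual import pytest / import pandas as pd; it is inferred from the expected values, not copied from the source repository:

@pytest.fixture()
def transaction_level_data():
    d = [
        [1, '2015-02-01'],
        [1, '2015-02-06'],
        [2, '2015-01-01'],
        [3, '2015-01-01'],
        [3, '2015-01-02'],
        [3, '2015-01-05'],
    ]
    return pd.DataFrame(d, columns=['id', 'date'])

Customer 1 would then have one repeat purchase (frequency 1), 5 days between first and last purchase (recency), and 6 days from first purchase to the observation period end (T), matching the first expected row.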
Example No. 2
def calc_clv(clv_recs, end, months=12):
    df = pandas.DataFrame(clv_recs)
    df = df[['player_id', 'start_date', 'theo_win']]
    df['theo_win'] = df['theo_win'].astype(float)
    
    end_date = parse(end)
    summary = summary_data_from_transaction_data(df, 
                                                 'player_id', 
                                                 'start_date', 
                                                 monetary_value_col='theo_win', 
                                                 observation_period_end=end_date)
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # The Gamma-Gamma model should only be fit on customers with at least
    # one repeat purchase; monetary_value is 0 when frequency is 0.
    returning = summary[summary['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0.0)
    ggf.fit(returning['frequency'], returning['monetary_value'])
    
    ggf_clv = ggf.customer_lifetime_value(
        bgf, #the model to use to predict the number of future transactions
        summary['frequency'],
        summary['recency'],
        summary['T'],
        summary['monetary_value'],
        time=months, 
        discount_rate=0.0
    )
    clv_df = pandas.DataFrame(ggf_clv)
    clv_df = clv_df.dropna()
    clv_df.loc[clv_df['clv'] < 0, 'clv'] = 0.0
    summary = summary.merge(clv_df, left_index=True, right_index=True, how='inner')

    return summary
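A hypothetical call, with toy records used only to show the expected input shape (player_id, start_date and theo_win keys, theo_win arriving as strings); a real run needs a full transaction history, since the fitters will not converge on a handful of rows:

clv_recs = [
    {'player_id': 1, 'start_date': '2017-01-05', 'theo_win': '12.50'},
    {'player_id': 1, 'start_date': '2017-03-20', 'theo_win': '8.00'},
    {'player_id': 2, 'start_date': '2017-02-11', 'theo_win': '20.00'},
]
summary_with_clv = calc_clv(clv_recs, end='2017-12-31', months=12)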
Example No. 3
    def test_purchase_predictions_do_not_differ_much_if_looking_at_hourly_or_daily_frequencies(
            self):
        transaction_data = load_transaction_data(parse_dates=['date'])
        daily_summary = utils.summary_data_from_transaction_data(
            transaction_data,
            'id',
            'date',
            observation_period_end=max(transaction_data.date),
            freq='D')
        hourly_summary = utils.summary_data_from_transaction_data(
            transaction_data,
            'id',
            'date',
            observation_period_end=max(transaction_data.date),
            freq='h')
        thirty_days = 30
        hours_in_day = 24
        mbfg = estimation.ModifiedBetaGeoFitter()

        np.random.seed(0)
        mbfg.fit(daily_summary['frequency'], daily_summary['recency'],
                 daily_summary['T'])
        thirty_day_prediction_from_daily_data = mbfg.expected_number_of_purchases_up_to_time(
            thirty_days)

        np.random.seed(0)
        mbfg.fit(hourly_summary['frequency'], hourly_summary['recency'],
                 hourly_summary['T'])
        thirty_day_prediction_from_hourly_data = mbfg.expected_number_of_purchases_up_to_time(
            thirty_days * hours_in_day)

        npt.assert_almost_equal(thirty_day_prediction_from_daily_data,
                                thirty_day_prediction_from_hourly_data)
Example No. 4
def test_summary_data_from_transaction_data_will_choose_the_correct_first_order_to_drop_in_monetary_transactions(
):
    # this is the correct behaviour. See https://github.com/CamDavidsonPilon/lifetimes/issues/85
    # and test_summary_statistics_are_indentical_to_hardies_paper_confirming_correct_aggregations
    cust = pd.Series([2, 2, 2])
    dates_ordered = pd.to_datetime(
        pd.Series([
            '2014-03-14 00:00:00', '2014-04-09 00:00:00', '2014-05-21 00:00:00'
        ]))
    sales = pd.Series([10, 20, 25])
    transaction_data = pd.DataFrame({
        'date': dates_ordered,
        'id': cust,
        'sales': sales
    })
    summary_ordered_data = utils.summary_data_from_transaction_data(
        transaction_data, 'id', 'date', 'sales')

    dates_unordered = pd.to_datetime(
        pd.Series([
            '2014-04-09 00:00:00', '2014-03-14 00:00:00', '2014-05-21 00:00:00'
        ]))
    sales = pd.Series([20, 10, 25])
    transaction_data = pd.DataFrame({
        'date': dates_unordered,
        'id': cust,
        'sales': sales
    })
    summary_unordered_data = utils.summary_data_from_transaction_data(
        transaction_data, 'id', 'date', 'sales')

    assert_frame_equal(summary_ordered_data, summary_unordered_data)
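    # monetary_value averages repeat purchases only: the first order (10)
    # is dropped, so (20 + 25) / 2 == 22.5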
    assert summary_ordered_data['monetary_value'].loc[2] == 22.5
Example No. 5
def test_summary_data_from_transaction_data_works_with_string_customer_ids(transaction_level_data):
    d = [
        ['X', '2015-02-01'],
        ['X', '2015-02-06'],
        ['Y', '2015-01-01'],
        ['Y', '2015-01-01'],
        ['Y', '2015-01-02'],
        ['Y', '2015-01-05'],
    ]
    df = pd.DataFrame(d, columns=['id', 'date'])
    utils.summary_data_from_transaction_data(df, 'id', 'date')
Example No. 6
def test_summary_data_from_transaction_data_works_with_string_customer_ids(
        transaction_level_data):
    d = [
        ["X", "2015-02-01"],
        ["X", "2015-02-06"],
        ["Y", "2015-01-01"],
        ["Y", "2015-01-01"],
        ["Y", "2015-01-02"],
        ["Y", "2015-01-05"],
    ]
    df = pd.DataFrame(d, columns=["id", "date"])
    utils.summary_data_from_transaction_data(df, "id", "date")
Example No. 7
def test_summary_data_from_transaction_data():
    transactions = pd.read_csv(
        'lifetimes/datasets/glovo_example_transactions.csv')
    actual = utils.summary_data_from_transaction_data(
        transactions,
        customer_id_col='customer_id',
        datetime_col='date',
        observation_period_end=datetime(2019, 2, 19).date(),
        freq='D',
        monetary_value_col='gtv_eur',
        money_first_transaction=True,
        save=False)
    expected_columns = [
        'customer_id', 'frequency', 'recency', 'T', 'orders_per_period',
        'monetary_value', 'margin'
    ]
    expected = pd.DataFrame([[213, 0., 0., 1435., 1., 5.5, 82.6446281],
                             [240, 0., 0., 1429., 1., 28.99, 15.67938788],
                             [272, 0., 0., 1431., 1., 11.9, 38.19709702],
                             [382, 0., 0., 1451., 1., 17.9, 25.39360081],
                             [438, 0., 0., 1433., 1., 25.67, 17.70726352],
                             [501, 0., 0., 1434., 1., 50., 9.09090909],
                             [587, 0., 0., 1428., 1., 5.5, 82.6446281],
                             [688, 0., 0., 1431., 1., 11.1, 20.47502048],
                             [885, 0., 0., 1434., 1., 8., 56.81818182]],
                            columns=expected_columns).set_index('customer_id')
    assert_frame_equal(actual, expected)
Example No. 8
def test_beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b,
                                                     observation_period_end,
                                                     freq, size):
    np.random.seed(188898)
    transaction_data = beta_geometric_nbd_model_transactional_data(
        T=T,
        r=r,
        alpha=alpha,
        a=a,
        b=b,
        observation_period_end=observation_period_end,
        freq=freq,
        size=size)
    actual = summary_data_from_transaction_data(
        transactions=transaction_data,
        customer_id_col="customer_id",
        datetime_col="date",
        observation_period_end=observation_period_end,
        freq=freq,
    )
    np.random.seed(188898)
    expected = beta_geometric_nbd_model(T=T,
                                        r=r,
                                        alpha=alpha,
                                        a=a,
                                        b=b,
                                        size=size)[[
                                            "frequency", "recency", "T"
                                        ]]
    expected["recency"] = expected["recency"].apply(np.ceil)
    expected = expected.reset_index(drop=True)
    actual = actual.reset_index(drop=True)
    assert expected.equals(actual)
Example No. 9
    def test_plot_incremental_transactions(self):
        """Test plotting incremental transactions with CDNOW example."""
        transactions = load_dataset('CDNOW_sample.txt', header=None, sep=r'\s+')
        transactions.columns = [
            'id_total', 'id_sample', 'date', 'num_cd_purc', 'total_value'
        ]
        t = 39
        freq = 'W'

        transactions_summary = utils.summary_data_from_transaction_data(
            transactions,
            'id_sample',
            'date',
            datetime_format='%Y%m%d',
            observation_period_end='19970930',
            freq=freq)

        bgf = BetaGeoFitter(penalizer_coef=0.01)
        bgf.fit(transactions_summary['frequency'],
                transactions_summary['recency'], transactions_summary['T'])

        plt.figure()
        plotting.plot_incremental_transactions(bgf,
                                               transactions,
                                               'date',
                                               'id_sample',
                                               2 * t,
                                               t,
                                               freq=freq,
                                               xlabel='week',
                                               datetime_format='%Y%m%d')
        return plt.gcf()
Example No. 10
def df_cum_transactions(cdnow_transactions):
    datetime_col = 'date'
    customer_id_col = 'id_sample'
    t = 25 * 7
    datetime_format = '%Y%m%d'
    freq = 'D'
    observation_period_end = '19970930'
    freq_multiplier = 7
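    # freq='D' with freq_multiplier=7 works in weekly periods while keeping daily precision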

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, customer_id_col, datetime_col,
        datetime_format=datetime_format, freq=freq, freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end)

    transactions_summary = transactions_summary.reset_index()

    model = ParetoNBDFitter()
    model.fit(transactions_summary['frequency'],
              transactions_summary['recency'],
              transactions_summary['T'])

    df_cum = utils.expected_cumulative_transactions(
        model, cdnow_transactions, datetime_col, customer_id_col, t,
        datetime_format, freq, set_index_date=False, freq_multiplier=freq_multiplier)
    return df_cum
Example No. 11
def test_summary_data_from_transaction_data_returns_correct_results(transaction_level_data):
    today = '2015-02-07'
    actual = utils.summary_data_from_transaction_data(transaction_level_data, 'id', 'date', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 5., 6.],
                             [2, 0., 0., 37.],
                             [3, 2., 4., 37.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 12
def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Test set_index as date for cumulative transactions and bgf fitter.

    Get first 14 cdnow transactions dates and validate that date index,
    freq_multiplier = 1 working and compare with tested data for last 4 records.

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    """
    datetime_col = "date"
    customer_id_col = "id_sample"
    t = 14
    datetime_format = "%Y%m%d"
    freq = "D"
    observation_period_end = "19970930"
    freq_multiplier = 1

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        customer_id_col,
        datetime_col,
        datetime_format=datetime_format,
        freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end,
    )

    transactions_summary = transactions_summary.reset_index()

    model = BetaGeoFitter()
    model.fit(transactions_summary["frequency"],
              transactions_summary["recency"], transactions_summary["T"])

    df_cum = utils.expected_cumulative_transactions(
        model,
        cdnow_transactions,
        datetime_col,
        customer_id_col,
        t,
        datetime_format,
        freq,
        set_index_date=True,
        freq_multiplier=freq_multiplier,
    )

    dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"]
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    date_index = df_cum.iloc[-4:].index.to_timestamp().astype(str)
    actual = df_cum["actual"].iloc[-4:].values
    predicted = df_cum["predicted"].iloc[-4:].values.round(2)

    assert all(dates == date_index)
    assert_allclose(actual, actual_trans)
    assert_allclose(predicted, expected_trans, atol=1e-2)
Example No. 13
    def test_purchase_predictions_do_not_differ_much_if_looking_at_hourly_or_daily_frequencies(self):
        transaction_data = load_transaction_data(parse_dates=['date'])
        daily_summary = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', observation_period_end=max(transaction_data.date), freq='D')
        hourly_summary = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', observation_period_end=max(transaction_data.date), freq='h')
        thirty_days = 30
        hours_in_day = 24
        mbfg = estimation.ModifiedBetaGeoFitter()

        np.random.seed(0)
        mbfg.fit(daily_summary['frequency'], daily_summary['recency'], daily_summary['T'])
        thirty_day_prediction_from_daily_data = mbfg.expected_number_of_purchases_up_to_time(thirty_days)

        np.random.seed(0)
        mbfg.fit(hourly_summary['frequency'], hourly_summary['recency'], hourly_summary['T'])
        thirty_day_prediction_from_hourly_data = mbfg.expected_number_of_purchases_up_to_time(thirty_days * hours_in_day)

        npt.assert_almost_equal(thirty_day_prediction_from_daily_data, thirty_day_prediction_from_hourly_data)
Example No. 14
def test_summary_data_from_transaction_data_squashes_period_purchases_to_one_purchase(
):
    transactions = pd.DataFrame([[1, '2015-01-01'], [1, '2015-01-01']],
                                columns=['id', 't'])
    actual = utils.summary_data_from_transaction_data(transactions,
                                                      'id',
                                                      't',
                                                      freq='W')
    assert actual.loc[1]['frequency'] == 1. - 1.
Example No. 15
def test_summary_data_from_transaction_data_returns_correct_results(
        transaction_level_data):
    today = "2015-02-07"
    actual = utils.summary_data_from_transaction_data(
        transaction_level_data, "id", "date", observation_period_end=today)
    expected = pd.DataFrame(
        [[1, 1.0, 5.0, 6.0], [2, 0.0, 0.0, 37.0], [3, 2.0, 4.0, 37.0]],
        columns=["id", "frequency", "recency", "T"]).set_index("id")
    assert_frame_equal(actual, expected)
Example No. 16
    def summary_create(self, df):
        '''
        Subset df on sales data, create transaction summary
        '''
        sales = subset_data(df, 'OrderType', 1)
        # make sure all sales are kosher - keep only positive order totals
        sales = sales[sales.OrderTotal > 0]
        self.transaction_data = sales[['OrderDate', 'CustomerNo']]
        return summary_data_from_transaction_data(self.transaction_data, 'CustomerNo', 'OrderDate', observation_period_end='2017-02-08')
Example No. 17
def test_summary_data_from_transaction_data_squashes_period_purchases_to_one_purchase(
):
    transactions = pd.DataFrame([[1, "2015-01-01"], [1, "2015-01-01"]],
                                columns=["id", "t"])
    actual = utils.summary_data_from_transaction_data(transactions,
                                                      "id",
                                                      "t",
                                                      freq="W")
    assert actual.loc[1]["frequency"] == 1.0 - 1.0
Example No. 18
def test_summary_data_from_transaction_data_with_specific_datetime_format(transaction_level_data):
    transaction_level_data['date'] = transaction_level_data['date'].map(lambda x: x.replace('-',''))
    format = '%Y%m%d'
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(transaction_level_data, 'id', 'date', observation_period_end=today, datetime_format=format)
    expected = pd.DataFrame([[1, 1., 5., 6.],
                             [2, 0., 0., 37.],
                             [3, 2., 4., 37.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 19
def bgf_transactions(cdnow_transactions):
    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, 'id_sample', 'date', datetime_format='%Y%m%d',
        observation_period_end='19970930', freq='W')

    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(transactions_summary['frequency'],
            transactions_summary['recency'], transactions_summary['T'])
    return bgf
Example No. 20
def test_summary_data_from_transaction_data_with_specific_datetime_format(transaction_level_data):
    transaction_level_data['date'] = transaction_level_data['date'].map(lambda x: x.replace('-',''))
    format = '%Y%m%d'
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(transaction_level_data, 'id', 'date', observation_period_end=today, datetime_format=format)
    expected = pd.DataFrame([[1, 1., 5., 6.],
                             [2, 0., 0., 37.],
                             [3, 2., 4., 37.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 21
def test_summary_date_from_transaction_data_with_specific_non_daily_frequency(large_transaction_level_data):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data, 'id', 'date', observation_period_end=today, freq='W')
    expected = pd.DataFrame([[1, 1., 5., 5.],
                             [2, 0., 0., 5.],
                             [3, 1., 1., 5.],
                             [4, 1., 3., 3.],
                             [5, 0., 0., 3.],
                             [6, 0., 0., 0.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 22
def test_summary_date_from_transaction_with_monetary_values(large_transaction_level_data_with_monetary_value):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data_with_monetary_value, 'id', 'date', monetary_value_col='monetary_value', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 36., 37., 2],
                             [2, 0.,  0., 37., 0],
                             [3, 2.,  4., 37., 3],
                             [4, 2., 20., 22., 3],
                             [5, 2.,  2., 22., 4.5],
                             [6, 0.,  0.,  5., 0]], columns=['id', 'frequency', 'recency', 'T', 'monetary_value']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 23
def test_summary_date_from_transaction_data_with_specific_non_daily_frequency(large_transaction_level_data):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data, 'id', 'date', observation_period_end=today, freq='W')
    expected = pd.DataFrame([[1, 1., 5., 5.],
                             [2, 0., 0., 5.],
                             [3, 1., 1., 5.],
                             [4, 1., 3., 3.],
                             [5, 0., 0., 3.],
                             [6, 0., 0., 0.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 24
def test_summary_date_from_transaction_with_monetary_values(large_transaction_level_data_with_monetary_value):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data_with_monetary_value, 'id', 'date', monetary_value_col='monetary_value', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 36., 37., 2],
                             [2, 0.,  0., 37., 0],
                             [3, 2.,  4., 37., 3],
                             [4, 2., 20., 22., 3],
                             [5, 2.,  2., 22., 4.5],
                             [6, 0.,  0.,  5., 0]], columns=['id', 'frequency', 'recency', 'T', 'monetary_value']).set_index('id')
    assert_frame_equal(actual, expected)
Example No. 25
def summaryDataFromTransactionData(
        clvWithBG_NBDGammaGammModelProcessedDataset):

    summaryDataFromTransactionDataForCLV = summary_data_from_transaction_data(
        clvWithBG_NBDGammaGammModelProcessedDataset,
        "CustomerID",
        "InvoiceDate",
        monetary_value_col="Total_Sales",
        observation_period_end="2011-12-9")

    return summaryDataFromTransactionDataForCLV
Example No. 26
    def summary_trans_create(self, df):
        '''
        Subset df on sales data, return transaction summary with monetary spend
        '''
        sales = subset_data(df, 'OrderType', 1)
        sales = sales[sales.OrderTotal > 0]
        transaction_data_monetary = sales[['OrderDate', 'CustomerNo', 'OrderTotal']]
        self.summary_monetary = summary_data_from_transaction_data(transaction_data_monetary, 'CustomerNo', 'OrderDate', 'OrderTotal', observation_period_end='2017-02-08')
        # keep customers with more than one purchase
        self.return_customers = self.summary_monetary[self.summary_monetary['frequency'] > 0]
        return self.return_customers
Example No. 27
def test_summary_data_from_transaction_data_works_with_int_customer_ids_and_doesnt_coerce_to_float(transaction_level_data):
    d = [
        [1, '2015-02-01'],
        [1, '2015-02-06'],
        [1, '2015-01-01'],
        [2, '2015-01-01'],
        [2, '2015-01-02'],
        [2, '2015-01-05'],
    ]
    df = pd.DataFrame(d, columns=['id', 'date'])
    actual = utils.summary_data_from_transaction_data(df, 'id', 'date')
    assert actual.index.dtype == 'int64'
Example No. 28
def test_summary_data_from_transaction_data_works_with_int_customer_ids_and_doesnt_coerce_to_float(
        transaction_level_data):
    d = [
        [1, "2015-02-01"],
        [1, "2015-02-06"],
        [1, "2015-01-01"],
        [2, "2015-01-01"],
        [2, "2015-01-02"],
        [2, "2015-01-05"],
    ]
    df = pd.DataFrame(d, columns=["id", "date"])
    actual = utils.summary_data_from_transaction_data(df, "id", "date")
    assert actual.index.dtype == "int64"
Example No. 29
def predictSpending(customerId):
    # initialize the data dictionary that will be returned
    data = {"success": False, "result": {"customerId": "", "y": 0.0}}

    # ensure the customer ID was properly uploaded to our endpoint
    if customerId:
        print("* get data")
        data = pandas.read_csv("sample_transactions.csv")
        #data = pandas.read_json(baseURL + "/api/transactions")
        #data = data.drop(columns="_id")

        print("* prepare data")
        # prepare and shaping the data
        # columns -
        #   customerId
        # 	frequency : number of repeat purchase transactions
        #	recency: time (in days) between first purchase and latest purchase
        #	T: time (in days) between first purchase and end of the period under study
        #	monetary_value: average transactions amount
        today = pandas.to_datetime(datetime.date.today())
        summaryData = summary_data_from_transaction_data(
            data,
            "customerId",
            "transactionDate",
            monetary_value_col="transactionAmount",
            observation_period_end=today)
        # filter the customer data that has no transaction
        analysisData = summaryData[summaryData["frequency"] > 0]

        # get the stat of the particular customer
        customer = analysisData.loc[customerId]

        # load model
        ggf_loaded = GammaGammaFitter()
        ggf_loaded.load_model('ggf.pkl')

        # estimate the average transaction amount
        predict = ggf_loaded.conditional_expected_average_profit(
            customer["frequency"], customer["monetary_value"])

        # add the input and predicted output to the return data;
        # cast to float so the numpy scalar serializes cleanly as JSON
        data = {
            "success": True,
            "result": {
                "customerId": customerId,
                "y": float(predict)
            }
        }

    # return the data dictionary as a JSON response
    return flask.jsonify(data)
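A minimal wiring sketch for this Flask view; the route path and app object here are hypothetical, and the flask import, GammaGammaFitter, and the trained ggf.pkl are assumed from the snippet's own context:

app = flask.Flask(__name__)

@app.route("/predict/<customerId>")
def predict_spending_route(customerId):
    return predictSpending(customerId)

if __name__ == "__main__":
    app.run()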
Example No. 30
def add_rfm_features(features, calib_invoices, period_end):
    features = features.copy()
    rfm_features = summary_data_from_transaction_data(
        transactions=calib_invoices,
        customer_id_col='CustomerID',
        datetime_col='InvoiceDate',
        monetary_value_col='Revenue',
        observation_period_end=period_end,
        freq='D').reset_index()  # CustomerID back to a column so merge(on=...) works
    rfm_features['T_Minus_Recency'] = rfm_features['T'] - rfm_features['recency']
    features = features.merge(rfm_features, how='left', on='CustomerID')

    return features
Example No. 31
def test_summary_statistics_are_indentical_to_hardies_paper_confirming_correct_aggregations():
    # see http://brucehardie.com/papers/rfm_clv_2005-02-16.pdf
    # RFM and CLV: Using Iso-value Curves for Customer Base Analysis
    df = pd.read_csv('lifetimes/datasets/CDNOW_sample.txt', sep=r'\s+', header=None, names=['_id', 'id', 'date', 'cds_bought', 'spent'])
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df_train = df[df['date'] < '1997-10-01']
    summary = utils.summary_data_from_transaction_data(df_train, 'id', 'date', 'spent')
    results = summary[summary['frequency'] > 0]['monetary_value'].describe()
    assert np.round(results.loc['mean']) == 35
    assert np.round(results.loc['std']) == 30
    assert np.round(results.loc['min']) == 3
    assert np.round(results.loc['50%']) == 27
    assert np.round(results.loc['max']) == 300
    assert np.round(results.loc['count']) == 946
Example No. 32
def get_rfm_features(features, cohort_invoices):
    cohort_invoices = cohort_invoices.copy()
    features = features.copy()

    rfm_features = summary_data_from_transaction_data(
        transactions=cohort_invoices,
        customer_id_col='CustomerID',
        datetime_col='InvoiceDate',
        monetary_value_col='Revenue',
        freq='D').reset_index()
    features = features.merge(rfm_features, how='left', on='CustomerID')
    features['T_Minus_Recency'] = rfm_features['T'] - rfm_features['recency']

    return features
Example No. 33
def test_summary_data_from_transaction_data_with_specific_datetime_format(
        transaction_level_data):
    transaction_level_data["date"] = transaction_level_data["date"].map(
        lambda x: x.replace("-", ""))
    format = "%Y%m%d"
    today = "20150207"
    actual = utils.summary_data_from_transaction_data(
        transaction_level_data,
        "id",
        "date",
        observation_period_end=today,
        datetime_format=format)
    expected = pd.DataFrame(
        [[1, 1.0, 5.0, 6.0], [2, 0.0, 0.0, 37.0], [3, 2.0, 4.0, 37.0]],
        columns=["id", "frequency", "recency", "T"]).set_index("id")
    assert_frame_equal(actual, expected)
Example No. 34
    def get_data_from_server(self,cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data
        INPUT
            cmd (str) the default sql query is below

            The default query has been replaced. The original query was an 8 line select command.
        """
        # server name
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name) # use to access the database
        c = connection.cursor() # generate cursor object
        
        # Grab transaction data from Postgres
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (self.pmg_num,self.date_range[0],self.date_range[1])
        
        c.execute(cmd) # execute the sql command
        
        # list to store the query data
        transaction_data = []
        
        # create a dictionary to convert customer ids to name
        to_name = dict(np.genfromtxt('../data/names.csv',dtype=str,delimiter='\t'))
        
        for row in c:
            cust, rsv_date, sales = row # pull data from each row of the query data
            cust_id = str(int(cust))
            name = to_name[cust_id]
            # check to see if customer is inactive
            if use(name):
                rsv_date_readable = rsv_date.strftime('%Y-%m-%d') # date formatting
                sales_float = float(sales) # convert to float; represents the transaction amount
                transaction_data.append({"id": cust, "date": rsv_date_readable, "sales": sales_float}) # add dictionary of data to list
        
        # convert to dataframe
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])
        # store results
        df.to_csv(self.outfile1,index=False)
        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')
Example No. 35
def test_summary_date_from_transaction_data_with_specific_non_daily_frequency(
        large_transaction_level_data):
    today = "20150207"
    actual = utils.summary_data_from_transaction_data(
        large_transaction_level_data,
        "id",
        "date",
        observation_period_end=today,
        freq="W")
    expected = pd.DataFrame(
        [
            [1, 1.0, 5.0, 5.0],
            [2, 0.0, 0.0, 5.0],
            [3, 1.0, 1.0, 5.0],
            [4, 1.0, 3.0, 3.0],
            [5, 0.0, 0.0, 3.0],
            [6, 0.0, 0.0, 0.0],
        ],
        columns=["id", "frequency", "recency", "T"],
    ).set_index("id")
    assert_frame_equal(actual, expected)
Example No. 36
def rfm_model(df, customer_column, date_column, monetary_column):
    """Return an RFM score for each customer using the Lifetimes RFM model.
    This score is calculated across the whole DataFrame, so if you have a
    customer with numerous orders, it will calculate one value and apply
    it across all orders and won't calculate the figure historically.

    Args:
        :param df: Pandas DataFrame
        :param monetary_column: Column containing monetary value of order
        :param date_column: Column containing date
        :param customer_column: Column containing customer

    Returns:
        New DataFrame containing RFM data by customer.
        T is equal to days since first order and end of period.
        Customers with 1 order will be assigned 0 for RFM scores.
    """

    # Build the per-customer frequency / recency / T / monetary_value summary
    rfm_df = summary_data_from_transaction_data(
        df, customer_column, date_column, monetary_value_col=monetary_column)
    return rfm_df
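A hypothetical call, assuming an orders DataFrame with illustrative column names:

orders = pd.DataFrame({
    'customer': [1, 1, 2],
    'order_date': pd.to_datetime(['2019-01-02', '2019-03-15', '2019-02-01']),
    'order_total': [20.0, 35.0, 15.0],
})
rfm = rfm_model(orders, 'customer', 'order_date', 'order_total')
# customer 1: frequency 1, monetary_value 35.0; customer 2: frequency 0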
Example No. 37
def test_summary_statistics_are_indentical_to_hardies_paper_confirming_correct_aggregations(
):
    # see http://brucehardie.com/papers/rfm_clv_2005-02-16.pdf
    # RFM and CLV: Using Iso-value Curves for Customer Base Analysis
    df = pd.read_csv(
        "lifetimes/datasets/CDNOW_sample.txt",
        sep=r"\s+",
        header=None,
        names=["_id", "id", "date", "cds_bought", "spent"],
    )
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    df_train = df[df["date"] < "1997-10-01"]
    summary = utils.summary_data_from_transaction_data(df_train, "id", "date",
                                                       "spent")
    results = summary[summary["frequency"] > 0]["monetary_value"].describe()

    assert np.round(results.loc["mean"]) == 35
    assert np.round(results.loc["std"]) == 30
    assert np.round(results.loc["min"]) == 3
    assert np.round(results.loc["50%"]) == 27
    assert np.round(results.loc["max"]) == 300
    assert np.round(results.loc["count"]) == 946
Example No. 38
def test_summary_data_from_transaction_data_squashes_period_purchases_to_one_purchase():
    transactions = pd.DataFrame([[1, '2015-01-01'], [1, '2015-01-01']], columns=['id', 't'])
    actual = utils.summary_data_from_transaction_data(transactions, 'id', 't', freq='W')
    assert actual.loc[1]['frequency'] == 1. - 1.
Example No. 39
def example_summary_data(example_transaction_data):
    return utils.summary_data_from_transaction_data(example_transaction_data, 'id', 'date', observation_period_end=max(example_transaction_data.date))
Example No. 40
    def get_data_from_file(self, filename, **kwargs):
        df = pd.read_csv(filename, **kwargs)
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')