Example no. 1
import os

import pandas as pd
from dateutil.relativedelta import relativedelta
from flask import request
from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data
from werkzeug.utils import secure_filename


def upload():
    if request.method == 'POST':
        f = request.files['file']

        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads',
                                 secure_filename(f.filename))
        f.save(file_path)
        df = pd.read_csv(file_path)

        df['salesDate'] = pd.to_datetime(df['salesDate'])

        cols_of_interest = ['memberID', 'salesDate', 'sales']
        df = df[cols_of_interest]

        df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))

        max_date = df['salesDate'].max()
        min_date = max_date - relativedelta(months=+12)

        df = df.loc[(df['salesDate'] >= min_date)
                    & (df['salesDate'] <= max_date)]

        min_order = df['salesDate'].min()
        max_order = df['salesDate'].max()
        data = summary_data_from_transaction_data(
            df,
            'memberID',
            'salesDate',
            monetary_value_col='sales',
            observation_period_end=max_order)

        d2 = data.sort_values('frequency', ascending=False)

        bgf = BetaGeoFitter(penalizer_coef=0.0001)
        bgf.fit(data['frequency'], data['recency'], data['T'])

        t = 30
        data['customer_livelihood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, data['frequency'], data['recency'], data['T'])
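        # customer_livelihood holds the expected number of repeat purchases
        # per member over the next t = 30 days.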

        data.sort_values(by='customer_livelihood',
                         ascending=False,
                         inplace=True)

        return data.to_html()
    return None
Example no. 2
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):

    # Set default values if they are not provided
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Reformat csv as a Pandas dataframe
    #data = pd.read_csv(csv_file)

    #Remove non search sessions
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Using "summary_data_from_transaction_data" function to agregate the activity stream into the appropriate metrics
    # Model requires 'activity_date' column name.  For our purpose this is synonymous with submission_date.
    summary = summary_data_from_transaction_data(
        data,
        'client_id',
        'activity_date',
        'Revenue',
        observation_period_end=max_date)
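    # The summary frame has one row per client_id with the columns the models expect:
    # 'frequency' (number of repeat transactions), 'recency' (customer age at last purchase),
    # 'T' (customer age at the end of the observation period) and 'monetary_value'
    # (average value of the repeat transactions).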

    # Building the Model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Conditional expected purchases:
    # the number of purchases expected from each individual over the specified time horizon

    # t = days in to future
    t = 14
    summary[
        'predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, summary['frequency'], summary['recency'], summary['T'])

    #Conditional Alive Probability
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
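    # calc_alive_prob is a project helper (not shown here); it presumably wraps the
    # fitted model's conditional_probability_alive for each customer row.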
    summary['alive_prob'] = summary['alive_prob'].astype(float)
    #print summary['alive_prob']

    # There cannot be non-positive values in the monetary_value or frequency vector
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0)
                                             & (summary['frequency'] > 0)]

    # There cannot be zero length vectors in one of frequency, recency or T
    #summary_with_value_and_returns =
    #print summary_with_value_and_returns[
    #    (len(summary_with_value_and_returns['recency'])>0) &
    #    (len(summary_with_value_and_returns['frequency'])>0) &
    #    (len(summary_with_value_and_returns['T'])>0)
    #]

    if any(
            len(x) == 0 for x in [
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['T']
            ]):
        logger.debug(data['client_id'])

    # Setting up Gamma Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Output average profit per transaction by client ID
    ggf_output = ggf.conditional_expected_average_profit(
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the filtered data if frequency, recency and T are not zero-length vectors
    if not any(len(x) == 0 for x in [
            summary_with_value_and_returns['recency'],
            summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['T']
    ]):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Getting Customer lifetime value using the Gamma Gamma output
    # NOTE: the time can be adjusted, but is currently set to 12 months

    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  #the model to use to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Converting to dataframe
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })

    # Setting client_id as index
    df_cpv = df_cpv.set_index('client_id')

    # Merge with original summary
    df_merged = pd.merge(summary,
                         df_cpv,
                         left_index=True,
                         right_index=True,
                         how='outer')

    # Historical CLV
    data_hist = data.groupby('client_id')[[
        'Searches', 'Revenue'
    ]].apply(lambda x: x.astype(float).sum())

    # Merge with original summary
    df_final = pd.merge(df_merged,
                        data_hist,
                        left_index=True,
                        right_index=True,
                        how='outer')

    # Prevent NaN in the pred_values column
    df_final.loc[df_final.frequency == 0, 'pred_values'] = 0.0

    # Create column that combines historical and predicted customer value
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Create a column with the number of days since the customer was last active
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Label users inactive for over 14 days as "Expired", otherwise "Active"
    df_final['user_status'] = np.where(df_final['last_active'] > 14, 'Expired',
                                       'Active')

    # Add column with date of calculation
    # Set calc_date to max submission date
    df_final['calc_date'] = max_date.date()  #pd.Timestamp('today').date()

    # Rename columns as appropriate
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non-returning customers from having 100% alive probability
    df_final.loc[df_final.frequency == 0, 'alive_probability'] = 0.0

    return df_final
Example no. 3

# The snippet is truncated here; bgf is assumed to be a BetaGeoFitter constructed
# above (its penalizer value is not shown), fitted on weekly units:
bgf.fit(X_train['txn_total'], X_train['recency_true']/7,
        X_train['T']/7)
print(bgf)

%matplotlib inline
from lifetimes.plotting import plot_frequency_recency_matrix

plot_frequency_recency_matrix(bgf)

#%%
from lifetimes.plotting import plot_probability_alive_matrix

f = plot_probability_alive_matrix(bgf)

t = 52
X_train['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, X_train['txn_total'], X_train['recency_true']/7,
    X_train['T']/7)
#%%
from lifetimes.plotting import plot_period_transactions
f = plot_period_transactions(bgf)

#%%
X_train.sort_values('predicted_purchases')
#%%
# X_train.sort_values(by='predicted_purchases').head(5)
#%%
import matplotlib.pyplot as plt

f = plt.figure()
Example no. 4
bgf.fit(df["FREQUENCY"], df["RECENCY"], df["T"])

bgf.summary

plotting.plot_frequency_recency_matrix(bgf)
plotting.plot_probability_alive_matrix(bgf)

# Repeat transaction model check
plotting.plot_period_transactions(bgf)

# ==========================================================================
# Ranking reps from best to worst
# ==========================================================================

t = 1
df["predicted_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, df["FREQUENCY"], df["RECENCY"], df["T"])
df.sort_values(by="predicted_purchases").tail(10)

# ==========================================================================
# Gamma Gamma Model
# The model assumes there is no relationship between monetary value and purchase frequency
# ==========================================================================

df[["MONETARY_VALUE", "FREQUENCY"]].corr()

ggf = GammaGammaFitter(penalizer_coef=0)
ggf.fit(df["FREQUENCY"], df["MONETARY_VALUE"])

ggf.conditional_expected_average_profit(df["FREQUENCY"],
                                        df["MONETARY_VALUE"]).head(10)
Example no. 5
    monetary.rename(columns={"amount": "monetary_value"}, inplace=True)

    df_rfm = pd.concat([recency, T, monetary, frequency], axis=1)
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(frequency=df_rfm["frequency"],
            monetary_value=df_rfm["monetary_value"])

    df_rfm["expected_monetary_value"] = df_rfm.apply(
        lambda row: ggf.conditional_expected_average_profit(
            row["frequency"], row["monetary_value"]),
        axis=1)

    bgf = BetaGeoFitter(penalizer_coef=1)
    bgf.fit(frequency=df_rfm["frequency"],
            recency=df_rfm["recency"],
            T=df_rfm["T"])

    df_rfm[
        "pred_nb_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t=180,
            frequency=df_rfm["frequency"],
            recency=df_rfm["recency"],
            T=df_rfm["T"])

    df_rfm["pred_revenue"] = df_rfm.apply(
        lambda row: row["pred_nb_purchases"] * row["expected_monetary_value"],
        axis=1)

    df_rfm.sort_values(by="pred_revenue", inplace=True)
    df_rfm.to_csv("{}/clv.csv".format(output_data_dir))
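
    # Note: pred_nb_purchases * expected_monetary_value is an undiscounted revenue proxy.
    # A minimal alternative sketch using the library's discounted helper (the 6-month
    # horizon and 1% monthly discount rate are assumptions, not part of this script):
    df_rfm["clv_6m"] = ggf.customer_lifetime_value(
        bgf,
        df_rfm["frequency"],
        df_rfm["recency"],
        df_rfm["T"],
        df_rfm["monetary_value"],
        time=6,             # months
        discount_rate=0.01  # monthly
    )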
Example no. 6
class CLV(object):
    """
    INPUT
        pmg_num (int) the product market group number, default = 1
        outfile1 (str) the filename indicating where to store the raw data before analysis, default = '../data/clvtrainingset01.csv'
        outfile2 (str) the filename containing the results, default = '../data/clv01.csv'
        date_range (list) the start date and end date of the years to analyze, default = ['2008-09-01','2016-09-01']
    ATTRIBUTES (other than the inputs listed above)
        self.data (DataFrame) a pandas DataFrame object of the data to be used for analysis
        self.bgf (from lifetimes) a statistical model object from the lifetimes package
        self.ggf (from lifetimes) a statistical model object from the lifetimes package
        self.results (DataFrame) a pandas DataFrame object of the results of analysis
    """
    def __init__(self,pmg_num=1,outfile1='../data/clvtrainingset01.csv',outfile2='../data/clv01.csv',date_range=['2008-09-01','2016-09-01']):
        self.pmg_num = pmg_num
        # outfile1 stores a clean version of the raw data used for analysis; this is important for reproducibility
        self.outfile1 = outfile1
        # outfile2 stores the clv estimation results
        self.outfile2 = outfile2
        self.date_range = date_range
        self.data = None
        self.bgf = None
        self.ggf = None
        self.results = None

    def get_data_from_server(self,cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data
        INPUT
            cmd (str) the default sql query is below

            The default query has been replaced. The original query was an 8 line select command.
        """
        # server name
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name) # use to access the database
        c = connection.cursor() # generate cursor object
        
        # Grab transaction data from Postgres
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (self.pmg_num,self.date_range[0],self.date_range[1])
        
        c.execute(cmd) # execute the sql command
        
        # list to store the query data
        transaction_data = []
        
        # create a dictionary to convert customer ids to name
        to_name = dict(np.genfromtxt('../data/names.csv',dtype=str,delimiter='\t'))
        
        for row in c:
            cust, rsv_date, sales = row # pull data from each row of the query data
            cust_id = str(int(cust))
            name = to_name[cust_id]
            # check to see if customer is inactive
            if use(name):
                rsv_date1_readable = rsv_date.strftime('%Y-%m-%d') # date formatting
                sales_float = float(sales) # convert to float; represents the transaction amount
                transaction_data.append({"id":cust, "date":rsv_date, "sales":sales_float}) # add dictionary of data to list
        
        # convert to dataframe
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])
        # store results
        df.to_csv(self.outfile1,index=False)
        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')

    def get_data_from_file(self,filename,**kwargs):
        df = pd.read_csv(filename,**kwargs)
        self.data = summary_data_from_transaction_data(df, 'id', 'date', 'sales', observation_period_end=self.date_range[1], freq='M')

    def fit(self,months=96):
        """
        Computes CLV estimates for the next n months and stores results in self.results
        INPUT
            months (int) number of months to predict, default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter() # see lifetimes module documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'], self.data['T'])
        # 8 years = 96 months
        self.data['predicted_purchases'] = self.bgf.conditional_expected_number_of_purchases_up_to_time(
                months,
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        self.ggf = GammaGammaFitter(penalizer_coef = 0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])
        # predict next transaction
        self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
                frequency = self.data['frequency'],
                monetary_value = self.data['monetary_value'])
        
        ### ESTIMATE CLV
        self.data['clv_estimation'] = self.data['predicted_trans_profit'] * self.data['predicted_purchases']
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation',ascending=False)
        # store results
        self.results.to_csv(self.outfile2,index=False)

    def plot_matrices(self):
        """
        plots three matrices:
            probability alive matrix: displays the probability that a customer is active
            frequency recency matrix: displays frequency and recency with color corresponding
                                        to monetary value
            period transactions: displays predicted and actual transaction values over time
            (check documentation in lifetimes for more details)
        """
        plot_probability_alive_matrix(self.bgf,cmap='viridis')
        plot_frequency_recency_matrix(self.bgf,cmap='viridis')
        plot_period_transactions(self.bgf)
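
# A minimal usage sketch for the class above (hypothetical; assumes a CSV with
# 'id', 'date' and 'sales' columns, matching what get_data_from_server writes):
clv = CLV(pmg_num=1, date_range=['2008-09-01', '2016-09-01'])
clv.get_data_from_file('../data/clvtrainingset01.csv', parse_dates=['date'])
clv.fit(months=96)   # ranked CLV estimates are also written to outfile2
clv.plot_matrices()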
Example no. 7
plt.clf()
plt.cla()
plt.close()


from lifetimes.plotting import plot_probability_alive_matrix

plot_probability_alive_matrix(bgf)
plt.savefig('probability_alive_matrix.png', dpi=200)
plt.clf()
plt.cla()
plt.close()


t = 1
data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, data['frequency'], data['recency'], data['T'])
data.sort_values(by='predicted_purchases', ascending=False).head(5)

from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)

plt.savefig('period_transactions.png', dpi=200)
plt.clf()
plt.cla()
plt.close()


transaction_data = pd.read_csv('transaction_data_clean.csv')

from lifetimes.plotting import plot_history_alive
id = 14096
Example no. 8

#from lifetimes.plotting import plot_probability_alive_matrix
from lifetimes.plotting import plot_frequency_recency_matrix
plot_frequency_recency_matrix(mbgnbd)

# In[10]:

from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)

# In[11]:

t = 90  # days to predict in the future
customer_detail[
    'pred_90d_bgf'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, customer_detail['frequency'], customer_detail['recency'],
        customer_detail['T'])
customer_detail.sort_values(by='pred_90d_bgf').tail(5)

# In[12]:

#highest expected purchases in the next period

customer_detail[
    'pred_90d_mbgnbd'] = mbgnbd.conditional_expected_number_of_purchases_up_to_time(
        t, customer_detail['frequency'], customer_detail['recency'],
        customer_detail['T'])
customer_detail.head()

# In[13]:
Example no. 9
bgf = BetaGeoFitter(penalizer_coef=0.001)  # initiating the model object

bgf.fit(rfm_cltv["Frequency"],
        rfm_cltv["Recency_weekly"],
        rfm_cltv["T_weekly"])

# Out[62]: <lifetimes.BetaGeoFitter: fitted with 4338 subjects, a: 1.52, alpha: 0.07, b: 5.69, r: 0.28>
"""In BG/NBD model, there are alpha and beta models that execute probability distribution by taking into consideration
all customers' purchase frequency. Model learns a pattern of customers' purchase frequency and predict. 
"""

# ==================================================================================
# What are the 10 customers to be expected to make the purchase the most in 1 week?
# ==================================================================================
bgf.conditional_expected_number_of_purchases_up_to_time(1,
                                                        rfm_cltv["Frequency"],
                                                        rfm_cltv["Recency_weekly"],
                                                        rfm_cltv["T_weekly"]).sort_values(ascending=False).head(10)

# Customer ID
# 16000   3.47241
# 12713   2.61763
# 15520   1.87669
# 13298   1.87669
# 14569   1.87669
# 13436   1.87669
# 15060   1.82989
# 18139   1.64053
# 14087   1.47115
# 15471   1.47115
# dtype: float64
Example no. 10

class transactions(object):

    def summary_create(self, df):
        '''
        Subset df on sales data, create trans summary
        '''
        sales = subset_data(df, 'OrderType', 1)
        # keep only rows with a positive OrderTotal
        sales = sales[sales.OrderTotal>0]
        self.transaction_data = sales[['OrderDate', 'CustomerNo']]
        return summary_data_from_transaction_data(self.transaction_data, 'CustomerNo', 'OrderDate', observation_period_end='2017-02-08')

    def fit_bgf(self, df, t):
        self.bgf = BetaGeoFitter()
        self.bgf.fit(df['frequency'], df['recency'], df['T'])
        self.viz_bgf(t)

    def viz_bgf(self, t):
        #visualize customer frequency and recency matrix
        plot_frequency_recency_matrix(self.bgf, T=t, cmap='coolwarm')
        plt.savefig('sales_frequency_recency_matrix.png')
        plt.close()
        #visualize customer alive probability
        plot_probability_alive_matrix(self.bgf, cmap='coolwarm')
        plt.savefig('probability_alive_matrix.png')
        plt.close()
        #visualize expected repeat Purchases
        plot_expected_repeat_purchases(self.bgf)
        plt.savefig('ProbabilityExpectedRepeatPurchases.png')
        plt.close()
        #visualize the expected number of period transactions
        plot_period_transactions(self.bgf)
        plt.savefig('period_transactions.png')
        plt.close()

    def predict_bgf_indiv(self, df, t, indiv):
        '''
        Predict transactions for a customer for a time frame (days)
        Save transaction visualization for the customer
        '''
        #predict purchases in t days
        df['predicted_purchases'] = df.apply(lambda r: self.bgf.conditional_expected_number_of_purchases_up_to_time(t, r['frequency'], r['recency'], r['T']), axis=1)
        print(df.sort_values('predicted_purchases').tail(5))
        #plot the customer history data with respect to being alive
        self.individual = df.loc[[indiv]]
        self.bgf.predict(t, self.individual['frequency'], self.individual['recency'], self.individual['T'])
        # print(self.bgf.summary())
        self.sp_trans = self.transaction_data.loc[self.transaction_data['CustomerNo'] == self.individual.index[0]]
        self.plot_history_alive_indiv(df, indiv)

    def plot_history_alive_indiv(self, df, indiv):
        '''
        Plot history alive/active for single customer
        '''
        plot_history_alive(self.bgf, int(self.individual['T']), self.sp_trans, 'OrderDate')
        plt.savefig('ProbabilityAliveByHistory_Customer{}.png'.format(indiv))
        plt.close()

    def plot_history_alive_all(self, df, threshold):
        '''
        Plot visualization to make a rule for marketing threshold
        '''
        plot_history_alive_min_thresholds(self.bgf, df, self.transaction_data, threshold)
        #put horizontal line on plot at threshold
        plt.savefig("CustomerThresholdsMinProbabilityActive.png")
        plt.close()

    def calibrate_bgf(self, calib_end_date, period_end_date, viz=False):
        '''
        Visualize the goodness of fit of BGF model
        '''
        summary_cal_holdout = calibration_and_holdout_data(self.transaction_data, 'CustomerNo', 'OrderDate',
                                            calibration_period_end=calib_end_date, #use 75% of data for training
                                            observation_period_end=period_end_date )
        if viz:
            print(summary_cal_holdout.head())

        self.bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
        plot_calibration_purchases_vs_holdout_purchases(self.bgf, summary_cal_holdout, colormap='coolwarm', alpha=0.75)
        plt.savefig('calibration_purchases_vs_holdout_purchases.png')
        plt.close()
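
# A hypothetical driver for the class above (df is the raw sales export;
# subset_data and plot_history_alive_min_thresholds are project helpers not shown here):
tr = transactions()
summary = tr.summary_create(df)
tr.fit_bgf(summary, t=30)                      # fit BG/NBD and save the diagnostic plots
tr.predict_bgf_indiv(summary, t=30, indiv=summary.index[0])
tr.calibrate_bgf('2016-10-31', '2017-02-08')   # illustrative calibration/holdout dates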
Example no. 11
# In[88]:

#predict if the customers are surely alive:
from lifetimes.plotting import plot_probability_alive_matrix

fig = plt.figure(figsize=(12, 8))
plot_probability_alive_matrix(bgf)

# In[89]:

#Predict future transactions over the next 10 days, i.e. the top 10 customers the model
#expects to make purchases in the next 10 days, based on historical data

t = 10
lf_tx_data['pred_num_txn'] = round(
    bgf.conditional_expected_number_of_purchases_up_to_time(
        t, lf_tx_data['frequency'], lf_tx_data['recency'], lf_tx_data['T']), 2)
lf_tx_data.sort_values(by='pred_num_txn',
                       ascending=False).head(10).reset_index()

# In[90]:

#Assessing model fit
from lifetimes.plotting import plot_period_transactions

plot_period_transactions(bgf)

# In[91]:

#Customer's future transaction prediction for next 10 days

t = 10
cltv["recency_weekly"] = cltv["recency_cltv_p"] / 7
cltv["T_weekly"] = cltv["T"] / 7

cltv["monetary_avg"][cltv["monetary_avg"] < 0].any ()
cltv[cltv["monetary_avg"]<0]

#######
# BG-NBD
######
bgf = BetaGeoFitter (penalizer_coef=0.001)
bgf.fit (cltv["frequency"], cltv["recency_weekly"], cltv["T_weekly"])

# Expected sales for 1 week
cltv["expected_number_of_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time (1,
                                                                                                cltv["frequency"],
                                                                                                cltv["recency_weekly"],
                                                                                                cltv["T_weekly"])

cltv.sort_values (by="expected_number_of_purchases", ascending=False).head ()


# Expected sales for whole company for 1 week
bgf.conditional_expected_number_of_purchases_up_to_time (4,
                                                         cltv["frequency"],
                                                         cltv["recency_weekly"],
                                                         cltv["T_weekly"]).sort_values (ascending=False).sum ()

plot_period_transactions (bgf)
plt.show ()

######
Example no. 13
    #pylab.show()
    #plot_probability_alive_matrix(bgf)
    #pylab.show()
    val = sys.argv[1]
    # summary is indexed by customer id, so look the selected customer up directly
    individual = summary.loc[val]
    #print(individual)
    
    t = 7
    #print("\n\n\nselected customer probability in next week")
    print(bgf.conditional_expected_number_of_purchases_up_to_time(
        t, individual['frequency'], individual['recency'], individual['T']))
    #summary['predicted_purchases']=(bgf.conditional_expected_number_of_purchases_up_to_time(t, summary['frequency'], summary['recency'], summary['T']))
    #print (summary.head())




    summary2 = summary[summary['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary2['frequency'], summary2['monetary_value'])
    #print (ggf)
    #print("\n\n\nSelected customer clv")
    #print(ggf.conditional_expected_average_profit(individual['frequency'],individual['monetary_value']))
    #summary['clv']=(ggf.conditional_expected_average_profit(summary2['frequency'],summary2['monetary_value']))
    #print(summary.head())
Example no. 14
File: AI.py Project: 0424048/AICRM
def predicted_purchase_time(account, timesteap):
    # df = pd.read_csv('AIexcel/' + account + '.csv' , sep=',', names=['name','uuid','invoiceDate','produce_name','Total'],encoding='utf8',low_memory=False)
    df = pd.read_csv(
        'AIexcel/' + account + '.csv',
        names=['name', 'uuid', 'invoiceDate', 'produce_name', 'Total'],
        sep=',',
        encoding='utf8',
        low_memory=False)
    #df.rename(columns={u'收件人姓名':u'name', u'收件人手機':u'uuid', u'付款日期':u'invoiceDate', u'商品名稱':u'produce_name', u'商品總價':u'Total'}, inplace=True)
    df_ga = pd.read_csv('AIexcel/' + account + '_ga.csv',
                        names=['uuid', 'level', 'next_time'],
                        sep=',',
                        encoding='utf8',
                        low_memory=False)
    df_UserLabel = df_ga['level'][1:].tolist()
    df_ga.drop([0], inplace=True)
    if 'level' in df_ga:
        df_ga['level'] = df_ga.apply(ga_toLevel, axis=1)

    df = df.loc[df.invoiceDate.str.len() == 19]
    df = df.loc[df.name.str.len() <= 10]
    # take three columns
    df1 = df[['uuid', 'invoiceDate', 'Total']]
    # drop rows where invoiceDate == 1
    df1_ = df1.drop(df1[df1['invoiceDate'] == 1].index)
    # drop rows with missing values
    df_drop = df1_.dropna()
    # work with the cleaned frame
    dataframe = df_drop
    dataframe['invoiceDate'] = pd.to_datetime(dataframe['invoiceDate']).dt.date
    dataframe.Total = dataframe.Total.astype(float)
    data = summary_data_from_transaction_data(
        dataframe,
        'uuid',
        'invoiceDate',
        observation_period_end=dataframe.invoiceDate.max())
    data2 = summary_data_from_transaction_data(
        dataframe,
        'uuid',
        'invoiceDate',
        monetary_value_col='Total',
        observation_period_end=dataframe.invoiceDate.max())

    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(data['frequency'], data['recency'], data['T'])
    purchase_time = data
    purchase_time[
        'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            30, data['frequency'], data['recency'], data['T'])
    predicted_purchases_df = purchase_time[[
        'predicted_purchases'
    ]].sort_values(by='predicted_purchases', ascending=False)
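    # 'cycle' below estimates the average gap in days between repeat purchases
    # (recency spans first-to-last purchase; frequency counts repeat purchases).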
    predicted_purchases_df['cycle'] = data['recency'] / data['frequency']
    returning_customers_summary = data2[(data2['frequency'] > 0)
                                        & (data2['monetary_value'] != 0)]
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(returning_customers_summary['frequency'],
            returning_customers_summary['monetary_value'])
    income = ggf.conditional_expected_average_profit(
        returning_customers_summary['frequency'],
        returning_customers_summary['monetary_value']).to_frame()
    income.columns = ['predicted_price']
    predicted_purchases_df = predicted_purchases_df.merge(income,
                                                          on=['uuid'],
                                                          how='left')
    predicted_purchases_df.reset_index(inplace=True)

    mask = predicted_purchases_df.predicted_purchases > 1
    predicted_purchases_df.loc[mask, 'predicted_purchases'] = 1
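    # The cap above keeps expected purchase counts at or below 1 so they can be
    # formatted as percentages below.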
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].astype(float)
    predicted_purchases_df = predicted_purchases_df.sort_values(
        by=['predicted_purchases'], ascending=False)
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].apply(lambda x: format(x, '.2%'))

    predicted_purchases_df = predicted_purchases_df.merge(df_ga,
                                                          left_on="uuid",
                                                          right_on="uuid",
                                                          how='left')

    predicted_purchases_df['level'] = predicted_purchases_df.apply(flag_df,
                                                                   axis=1)
    #predicted_purchases_df['level'] = predicted_purchases_df['level'].fillna(1)
    predicted_purchases_df.replace(np.nan, 0, inplace=True)
    predicted_purchases_df.replace(np.inf, 0, inplace=True)
    if 'next_time' not in predicted_purchases_df.columns:
        predicted_purchases_df['next_time'] = np.nan
    predicted_purchases_df['next_time'] = pd.to_datetime(
        predicted_purchases_df['next_time'])

    predicted_purchases_df_N = predicted_purchases_df[~(
        predicted_purchases_df.uuid.isin(
            ((predicted_purchases_df[predicted_purchases_df.next_time >= today]
              .uuid).astype(str)).tolist()))]
    predicted_purchases_df_off = predicted_purchases_df[(
        predicted_purchases_df.uuid.isin(
            ((predicted_purchases_df[predicted_purchases_df.next_time >= today]
              .uuid).astype(str)).tolist()))]
    new_df = pd.concat([predicted_purchases_df_N, predicted_purchases_df_off],
                       ignore_index=True)
    predicted_purchases_df_N['cycle'] = (
        predicted_purchases_df_N['cycle'] *
        predicted_purchases_df_N['level']).round(0).astype(int)
    predicted_purchases_df_N[
        'next_time'] = today + predicted_purchases_df_N.apply(time_df, axis=1)
    predicted_purchases_df_NQ = predicted_purchases_df_N.dropna()
    predicted_purchases_df_off = predicted_purchases_df_off.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])
    predicted_purchases_df_NQ = predicted_purchases_df_NQ.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])
    df_ga = df_ga.merge(predicted_purchases_df_off,
                        left_on="uuid",
                        right_on="uuid",
                        how='left')
    df_ga = df_ga.merge(predicted_purchases_df_NQ,
                        left_on="uuid",
                        right_on="uuid",
                        how='left')
    notNull_df = df_ga[
        df_ga['level'].notnull() & df_ga['next_time'].notnull()].drop(
            columns=['level_y', 'next_time_y', 'next_time_x', 'level_x'])
    notNull_df2 = df_ga[
        df_ga['level_y'].notnull() & df_ga['next_time_y'].notnull()].drop(
            columns=['level', 'next_time', 'next_time_x', 'level_x'])
    notNull_df2.columns = ['uuid', 'level', 'next_time']
    res = pd.concat([notNull_df, notNull_df2], axis=0, ignore_index=True)
    res.rename(columns={u'uuid': u'收件人手機'}, inplace=True)
    res['UserLabel'] = pd.Series(df_UserLabel)
    res = res[[u'收件人手機', u'UserLabel', u'next_time']]
    # res.to_csv('AIexcel/' + account + '_ga.csv',index=False,encoding='utf8')
    predicted_purchases_df_N = predicted_purchases_df_N.drop(
        columns=['level', 'cycle', 'next_time'])
    predicted_purchases_df_N.columns = [u'收件人手機', u'顧客購買機率', u'平均交易金額']

    return predicted_purchases_df_N


# print(predicted_purchase_time(account,30)[:30])
Example no. 15
##############################################################
# 2. Building the BG/NBD Model
##############################################################

# pip install lifetimes

bgf = BetaGeoFitter(penalizer_coef=0.001)

bgf.fit(rfm['frequency'], rfm['recency_weekly_p'], rfm['T_weekly'])

################################################################
# Who are the 10 customers we expect to make the most purchases within 1 week?
################################################################

bgf.conditional_expected_number_of_purchases_up_to_time(
    1, rfm['frequency'], rfm['recency_weekly_p'],
    rfm['T_weekly']).sort_values(ascending=False).head(10)

rfm["expected_number_of_purchases"] = bgf.predict(1, rfm['frequency'],
                                                  rfm['recency_weekly_p'],
                                                  rfm['T_weekly'])

rfm.head()

################################################################
# Who are the 10 customers we expect to make the most purchases within 1 month?
################################################################

bgf.predict(4, rfm['frequency'], rfm['recency_weekly_p'],
            rfm['T_weekly']).sort_values(ascending=False).head(10)
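
# A minimal follow-on sketch for a 3-month (12-week) horizon, assuming the same
# weekly-unit rfm frame (the new column name is illustrative):
rfm["expected_purch_3_months"] = bgf.predict(4 * 3, rfm['frequency'],
                                             rfm['recency_weekly_p'],
                                             rfm['T_weekly'])
rfm.sort_values(by="expected_purch_3_months", ascending=False).head(10)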