# -*- coding: utf-8 -*-
import os

import pandas as pd
from dateutil.relativedelta import relativedelta
from flask import request
from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data
from werkzeug.utils import secure_filename


def upload():
    if request.method == 'POST':
        f = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads', secure_filename(f.filename))
        f.save(file_path)

        df = pd.read_csv(file_path)
        df['salesDate'] = pd.to_datetime(df['salesDate'])
        cols_of_interest = ['memberID', 'salesDate', 'sales']
        df = df[cols_of_interest]
        df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))

        # keep only the trailing 12 months of transactions
        max_date = df['salesDate'].max()
        min_date = max_date - relativedelta(months=+12)
        df = df.loc[(df['salesDate'] >= min_date) & (df['salesDate'] <= max_date)]
        max_order = df['salesDate'].max()

        # aggregate the transaction log into frequency/recency/T/monetary form
        data = summary_data_from_transaction_data(
            df, 'memberID', 'salesDate',
            monetary_value_col='sales',
            observation_period_end=max_order)

        # fit the BG/NBD model and score expected purchases over the next 30 days
        bgf = BetaGeoFitter(penalizer_coef=0.0001)
        bgf.fit(data['frequency'], data['recency'], data['T'])
        t = 30
        data['customer_livelihood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, data['frequency'], data['recency'], data['T'])
        data.sort_values(by='customer_livelihood', ascending=False, inplace=True)
        return data.to_html()
    return None
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):

    # set default values if they are not stated
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # remove non-search sessions
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Use summary_data_from_transaction_data to aggregate the activity stream
    # into the metrics the models expect.  The model requires an
    # 'activity_date' column; for our purposes this is synonymous with
    # submission_date.
    summary = summary_data_from_transaction_data(
        data, 'client_id', 'activity_date', 'Revenue',
        observation_period_end=max_date)

    # Build the BG/NBD model
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Conditional expected purchases: the purchases expected from each
    # individual over the next t days
    t = 14
    summary['predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, summary['frequency'], summary['recency'], summary['T'])

    # Conditional alive probability
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)

    # There cannot be non-positive values in the monetary_value or frequency vectors
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0) &
                                             (summary['frequency'] > 0)]

    # There cannot be zero-length vectors in any of frequency, recency or T
    if any(len(x) == 0 for x in [summary_with_value_and_returns['recency'],
                                 summary_with_value_and_returns['frequency'],
                                 summary_with_value_and_returns['T']]):
        logger.debug(data['client_id'])

    # Set up the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Output average profit per transaction by client ID
    ggf_output = ggf.conditional_expected_average_profit(
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model with the same data, provided frequency, recency
    # and T are not zero-length vectors
    if not any(len(x) == 0 for x in [summary_with_value_and_returns['recency'],
                                     summary_with_value_and_returns['frequency'],
                                     summary_with_value_and_returns['T']]):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Customer lifetime value using the Gamma-Gamma output
    # NOTE: the horizon can be adjusted, but is currently set to 12 months
    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  # the model used to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01         # monthly discount rate ~ 12.7% annually
    )

    # Convert to a dataframe
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })
    # Set client_id as the index
    df_cpv = df_cpv.set_index('client_id')

    # Merge with the original summary
    df_merged = pd.merge(summary, df_cpv, left_index=True, right_index=True,
                         how='outer')

    # Historical CLV
    data_hist = data.groupby('client_id')[['Searches', 'Revenue']].apply(
        lambda x: x.astype(float).sum())

    # Merge with the original summary
    df_final = pd.merge(df_merged, data_hist, left_index=True, right_index=True,
                        how='outer')

    # Prevent NaN in the pred_values column
    df_final.loc[df_final.frequency == 0, 'pred_values'] = 0.0

    # Column combining historical and predicted customer value
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Column with the number of days since the customer was last active
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Label users inactive for over 14 days as "Expired", else "Active"
    df_final['user_status'] = np.where(df_final['last_active'] > 14, 'Expired',
                                       'Active')

    # Add the calculation date, set to the max submission date
    df_final['calc_date'] = max_date.date()

    # Rename columns as appropriate
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non-returning customers from having 100% alive probability
    df_final.loc[df_final.frequency == 0, 'alive_probability'] = 0.0

    return df_final
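# --------------------------------------------------------------------------
# A minimal usage sketch for generate_clv_table (the file name is
# hypothetical; it assumes a CSV with client_id, activity_date, Searches and
# Revenue columns, as the function above requires).
# --------------------------------------------------------------------------
import pandas as pd

activity = pd.read_csv('daily_search_activity.csv',      # hypothetical path
                       parse_dates=['activity_date'])
clv_table = generate_clv_table(activity,
                               clv_prediction_time=12,   # months ahead
                               model_penalizer=0)
print(clv_table.sort_values('total_clv', ascending=False).head(10))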
# Fit the BG/NBD model on weekly units (recency and T divided by 7)
bgf.fit(X_train['txn_total'], X_train['recency_true'] / 7, X_train['T'] / 7)
print(bgf)

%matplotlib inline
from lifetimes.plotting import plot_frequency_recency_matrix
plot_frequency_recency_matrix(bgf)

#%%
from lifetimes.plotting import plot_probability_alive_matrix
f = plot_probability_alive_matrix(bgf)

# expected purchases over the next 52 weeks
t = 52
X_train['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, X_train['txn_total'], X_train['recency_true'] / 7, X_train['T'] / 7)

#%%
from lifetimes.plotting import plot_period_transactions
f = plot_period_transactions(bgf)

#%%
X_train.sort_values('predicted_purchases')

#%%
# X_train.sort_values(by='predicted_purchases').head(5)

#%%
import matplotlib.pyplot as plt
f = plt.figure()
bgf.fit(df["FREQUENCY"], df["RECENCY"], df["T"]) bgf.summary plotting.plot_frequency_recency_matrix(bgf) plotting.plot_probability_alive_matrix(bgf) # Repeat transaction model check plotting.plot_period_transactions(bgf) # ========================================================================== # Ranking reps from best to worst # ========================================================================== t = 1 df["predicted_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time( t, df["FREQUENCY"], df["RECENCY"], df["T"]) df.sort_values(by="predicted_purchases").tail(10) # ========================================================================== # Gamma Gamme Model # Model assumes that there is no relationship between the monetary value and the purchase frequency # ========================================================================== df[["MONETARY_VALUE", "FREQUENCY"]].corr() ggf = GammaGammaFitter(penalizer_coef=0) ggf.fit(df["FREQUENCY"], df["MONETARY_VALUE"]) ggf.conditional_expected_average_profit(df["FREQUENCY"], df["MONETARY_VALUE"]).head(10)
monetary.rename(columns={"amount": "monetary_value"}, inplace=True)
df_rfm = pd.concat([recency, T, monetary, frequency], axis=1)

# Gamma-Gamma model for the expected monetary value per transaction
ggf = GammaGammaFitter(penalizer_coef=0)
ggf.fit(frequency=df_rfm["frequency"], monetary_value=df_rfm["monetary_value"])
df_rfm["expected_monetary_value"] = df_rfm.apply(
    lambda row: ggf.conditional_expected_average_profit(
        row["frequency"], row["monetary_value"]),
    axis=1)

# BG/NBD model for the expected number of purchases over the next 180 days
bgf = BetaGeoFitter(penalizer_coef=1)
bgf.fit(frequency=df_rfm["frequency"], recency=df_rfm["recency"], T=df_rfm["T"])
df_rfm["pred_nb_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t=180, frequency=df_rfm["frequency"], recency=df_rfm["recency"],
    T=df_rfm["T"])

# predicted revenue = expected purchases x expected value per purchase
df_rfm["pred_revenue"] = df_rfm.apply(
    lambda row: row["pred_nb_purchases"] * row["expected_monetary_value"],
    axis=1)

df_rfm.sort_values(by="pred_revenue", inplace=True)
df_rfm.to_csv("{}/clv.csv".format(output_data_dir))
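# --------------------------------------------------------------------------
# Design note (a sketch, not part of the original): both lifetimes calls
# accept whole Series, so the row-wise .apply() above can be replaced by
# vectorized calls.  This is much faster on large RFM tables and produces
# the same columns.
# --------------------------------------------------------------------------
df_rfm["expected_monetary_value"] = ggf.conditional_expected_average_profit(
    df_rfm["frequency"], df_rfm["monetary_value"])
df_rfm["pred_revenue"] = (df_rfm["pred_nb_purchases"] *
                          df_rfm["expected_monetary_value"])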
class CLV(object):
    """
    INPUT
        pmg_num (int)      the product market group number, default = 1
        outfile1 (str)     the filename indicating where to store the raw data
                           before analysis, default = '../data/clvtrainingset01.csv'
        outfile2 (str)     the filename containing the results,
                           default = '../data/clv01.csv'
        date_range (list)  the start date and end date of the years to analyze,
                           default = ['2008-09-01', '2016-09-01']

    ATTRIBUTES (other than those listed above)
        self.data (DataFrame)      the data to be used for analysis
        self.bgf (from lifetimes)  a statistical model object from the lifetimes package
        self.ggf (from lifetimes)  a statistical model object from the lifetimes package
        self.results (DataFrame)   the results of the analysis
    """

    def __init__(self, pmg_num=1, outfile1='../data/clvtrainingset01.csv',
                 outfile2='../data/clv01.csv',
                 date_range=['2008-09-01', '2016-09-01']):
        self.pmg_num = pmg_num
        # outfile1 stores a clean version of the raw data used for analysis;
        # this is important for reproducibility
        self.outfile1 = outfile1
        # outfile2 stores the CLV estimation results
        self.outfile2 = outfile2
        self.date_range = date_range
        self.data = None
        self.bgf = None
        self.ggf = None
        self.results = None

    def get_data_from_server(self, cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data

        INPUT
            cmd (str)  the SQL query to run.  The default query has been
                       replaced here; the original was an 8-line SELECT command.
        """
        # connect to the database and generate a cursor object
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name)
        c = connection.cursor()

        # grab transaction data from the server
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (
                self.pmg_num, self.date_range[0], self.date_range[1])
        c.execute(cmd)

        # list to store the query data
        transaction_data = []

        # dictionary to convert customer ids to names
        to_name = dict(np.genfromtxt('../data/names.csv', dtype=str,
                                     delimiter='\t'))

        for row in c:
            cust, rsv_date, sales = row
            cust_id = str(int(cust))
            name = to_name[cust_id]
            # check whether the customer is active
            if use(name):
                rsv_date1_readable = rsv_date.strftime('%Y-%m-%d')  # date formatting
                sales_float = float(sales)  # the transaction amount
                transaction_data.append(
                    {"id": cust, "date": rsv_date, "sales": sales_float})

        # convert to a dataframe and store a clean copy of the raw data
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])
        df.to_csv(self.outfile1, index=False)

        # IMPORTANT: use the correct observation_period_end date
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def get_data_from_file(self, filename, **kwargs):
        df = pd.read_csv(filename, **kwargs)
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def fit(self, months=96):
        """
        Computes CLV estimates for the next n months and stores the results
        in self.results

        INPUT
            months (int)  number of months to predict, default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter()  # see the lifetimes documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'],
                     self.data['T'])

        # 8 years = 96 months
        self.data['predicted_purchases'] = self.bgf.conditional_expected_number_of_purchases_up_to_time(
            months,
            self.data['frequency'],
            self.data['recency'],
            self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        self.ggf = GammaGammaFitter(penalizer_coef=0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])

        # predict the profit of the next transaction
        self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
            frequency=self.data['frequency'],
            monetary_value=self.data['monetary_value'])

        ### ESTIMATE CLV
        self.data['clv_estimation'] = (self.data['predicted_trans_profit'] *
                                       self.data['predicted_purchases'])
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
            self.data['frequency'], self.data['recency'], self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation', ascending=False)

        # store results
        self.results.to_csv(self.outfile2, index=False)

    def plot_matrices(self):
        """
        Plots three matrices:

            probability alive matrix:  the probability that a customer is active
            frequency recency matrix:  frequency and recency, with color
                                       corresponding to monetary value
            period transactions:       predicted and actual transaction values
                                       over time

        (check the lifetimes documentation for more details)
        """
        plot_probability_alive_matrix(self.bgf, cmap='viridis')
        plot_frequency_recency_matrix(self.bgf, cmap='viridis')
        plot_period_transactions(self.bgf)
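# --------------------------------------------------------------------------
# A minimal usage sketch for the CLV class above.  The CSV path is the
# class's own default training-set file; get_data_from_file expects 'id',
# 'date' and 'sales' columns, per summary_data_from_transaction_data above.
# --------------------------------------------------------------------------
clv = CLV(pmg_num=1, date_range=['2008-09-01', '2016-09-01'])
clv.get_data_from_file('../data/clvtrainingset01.csv', parse_dates=['date'])
clv.fit(months=96)      # writes ranked CLV estimates to ../data/clv01.csv
clv.plot_matrices()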
plt.clf()
plt.cla()
plt.close()

from lifetimes.plotting import plot_probability_alive_matrix
plot_probability_alive_matrix(bgf)
plt.savefig('probability_alive_matrix.png', dpi=200)
plt.clf()
plt.cla()
plt.close()

# expected purchases in the next day, per customer
t = 1
data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, data['frequency'], data['recency'], data['T'])
data.sort_values(by='predicted_purchases', ascending=False).head(5)

from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)
plt.savefig('period_transactions.png', dpi=200)
plt.clf()
plt.cla()
plt.close()

transaction_data = pd.read_csv('transaction_data_clean.csv')

from lifetimes.plotting import plot_history_alive
id = 14096
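# --------------------------------------------------------------------------
# The snippet cuts off here; this is a sketch of how plot_history_alive is
# typically called for the customer selected above.  The column names
# ('CustomerID', 'InvoiceDate') and the 365-day window are assumptions about
# transaction_data_clean.csv, not facts from the original.
# --------------------------------------------------------------------------
days_since_birth = 365
sp_trans = transaction_data.loc[transaction_data['CustomerID'] == id]
plot_history_alive(bgf, days_since_birth, sp_trans, 'InvoiceDate')
plt.savefig('history_alive.png', dpi=200)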
#from lifetimes.plotting import plot_probability_alive_matrix
from lifetimes.plotting import plot_frequency_recency_matrix

plot_frequency_recency_matrix(mbgnbd)

# In[10]:

from lifetimes.plotting import plot_period_transactions

plot_period_transactions(bgf)

# In[11]:

t = 90  # days to predict into the future
customer_detail['pred_90d_bgf'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, customer_detail['frequency'], customer_detail['recency'],
    customer_detail['T'])
customer_detail.sort_values(by='pred_90d_bgf').tail(5)

# In[12]:

# highest expected purchases in the next period
customer_detail['pred_90d_mbgnbd'] = mbgnbd.conditional_expected_number_of_purchases_up_to_time(
    t, customer_detail['frequency'], customer_detail['recency'],
    customer_detail['T'])
customer_detail.head()

# In[13]:
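# A sketch (not from the original notebook): comparing the fitted parameters
# of the two models scored above.  Every fitted lifetimes model exposes a
# .summary DataFrame of coefficients, and BG/NBD and MBG/NBD share the same
# parameter names (r, alpha, a, b), so the frames align.
import pandas as pd

comparison = pd.concat({'bgf': bgf.summary, 'mbgnbd': mbgnbd.summary}, axis=1)
print(comparison)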
bgf = BetaGeoFitter(penalizer_coef=0.001)  # instantiate the model object
bgf.fit(rfm_cltv["Frequency"], rfm_cltv["Recency_weekly"], rfm_cltv["T_weekly"])
# Out[62]: <lifetimes.BetaGeoFitter: fitted with 4338 subjects, a: 1.52, alpha: 0.07, b: 5.69, r: 0.28>

"""The BG/NBD model fits probability distributions (the gamma/beta
distributions behind the parameters above) over all customers' purchase
frequencies; it learns the purchase-frequency pattern and uses it to
predict future purchases."""

# ==================================================================================
# What are the 10 customers expected to make the most purchases within 1 week?
# ==================================================================================
bgf.conditional_expected_number_of_purchases_up_to_time(
    1,
    rfm_cltv["Frequency"],
    rfm_cltv["Recency_weekly"],
    rfm_cltv["T_weekly"]).sort_values(ascending=False).head(10)

# Customer ID
# 16000    3.47241
# 12713    2.61763
# 15520    1.87669
# 13298    1.87669
# 14569    1.87669
# 13436    1.87669
# 15060    1.82989
# 18139    1.64053
# 14087    1.47115
# 15471    1.47115
# dtype: float64
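# A quick fit sanity check (a sketch added here, not in the original):
# plot_period_transactions overlays actual vs. model-predicted repeat
# transaction counts per period, which is the standard way to eyeball
# whether the BG/NBD fit above is reasonable.
import matplotlib.pyplot as plt
from lifetimes.plotting import plot_period_transactions

plot_period_transactions(bgf)
plt.show()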
class transactions(object):

    def summary_create(self, df):
        '''
        Subset df on sales data and create the transaction summary
        '''
        sales = subset_data(df, 'OrderType', 1)
        # make sure all sales are kosher - keep only sales > 0
        sales = sales[sales.OrderTotal > 0]
        self.transaction_data = sales[['OrderDate', 'CustomerNo']]
        return summary_data_from_transaction_data(
            self.transaction_data, 'CustomerNo', 'OrderDate',
            observation_period_end='2017-02-08')

    def fit_bgf(self, df, t):
        self.bgf = BetaGeoFitter()
        self.bgf.fit(df['frequency'], df['recency'], df['T'])
        self.viz_bgf(t)

    def viz_bgf(self, t):
        # visualize the customer frequency and recency matrix
        plot_frequency_recency_matrix(self.bgf, T=t, cmap='coolwarm')
        plt.savefig('sales_frequency_recency_matrix.png')
        plt.close()

        # visualize the customer alive probability
        plot_probability_alive_matrix(self.bgf, cmap='coolwarm')
        plt.savefig('probability_alive_matrix.png')
        plt.close()

        # visualize expected repeat purchases
        plot_expected_repeat_purchases(self.bgf)
        plt.savefig('ProbabilityExpectedRepeatPurchases.png')
        plt.close()

        # visualize the expected number of period transactions
        plot_period_transactions(self.bgf)
        plt.savefig('period_transactions.png')
        plt.close()

    def predict_bgf_indiv(self, df, t, indiv):
        '''
        Predict transactions for a customer over a time frame (days)
        and save a transaction visualization for that customer
        '''
        # predict purchases in t days
        df['predicted_purchases'] = df.apply(
            lambda r: self.bgf.conditional_expected_number_of_purchases_up_to_time(
                t, r['frequency'], r['recency'], r['T']),
            axis=1)
        print(df.sort_values('predicted_purchases').tail(5))

        # plot the customer's history with respect to being alive
        self.individual = df.loc[[indiv]]
        self.bgf.predict(t, self.individual['frequency'],
                         self.individual['recency'], self.individual['T'])
        self.sp_trans = self.transaction_data.loc[
            self.transaction_data['CustomerNo'] == self.individual.index[0]]
        self.plot_history_alive_indiv(df, indiv)

    def plot_history_alive_indiv(self, df, indiv):
        '''
        Plot history alive/active for a single customer
        '''
        plot_history_alive(self.bgf, int(self.individual['T']),
                           self.sp_trans, 'OrderDate')
        plt.savefig('ProbabilityAliveByHistory_Customer{}.png'.format(indiv))
        plt.close()

    def plot_history_alive_all(self, df, threshold):
        '''
        Plot a visualization to set a marketing threshold rule
        '''
        plot_history_alive_min_thresholds(self.bgf, df, self.transaction_data,
                                          threshold)
        # put a horizontal line on the plot at the threshold
        plt.savefig("CustomerThresholdsMinProbabilityActive.png")
        plt.close()

    def calibrate_bgf(self, calib_end_date, period_end_date, viz=False):
        '''
        Visualize the goodness of fit of the BGF model
        '''
        summary_cal_holdout = calibration_and_holdout_data(
            self.transaction_data, 'CustomerNo', 'OrderDate',
            calibration_period_end=calib_end_date,  # use 75% of data for training
            observation_period_end=period_end_date)
        if viz:
            print(summary_cal_holdout.head())
        self.bgf.fit(summary_cal_holdout['frequency_cal'],
                     summary_cal_holdout['recency_cal'],
                     summary_cal_holdout['T_cal'])
        plot_calibration_purchases_vs_holdout_purchases(
            self.bgf, summary_cal_holdout, colormap='coolwarm', alpha=0.75)
        plt.savefig('calibration_purchases_vs_holdout_purchases.png')
        plt.close()
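# --------------------------------------------------------------------------
# A minimal usage sketch of the transactions pipeline above.  The raw-orders
# DataFrame (orders_df) and the subset_data helper are assumed to exist
# elsewhere in the project; the dates and customer number are illustrative.
# --------------------------------------------------------------------------
tx = transactions()
summary = tx.summary_create(orders_df)               # orders_df: raw orders table
tx.fit_bgf(summary, t=365)                           # fit + save diagnostic plots
tx.predict_bgf_indiv(summary, t=30, indiv='12345')   # hypothetical CustomerNo
tx.calibrate_bgf('2016-12-31', '2017-02-08', viz=True)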
# In[88]:

# predict whether the customers are surely alive
from lifetimes.plotting import plot_probability_alive_matrix

fig = plt.figure(figsize=(12, 8))
plot_probability_alive_matrix(bgf)

# In[89]:

# Predict future transactions in the next 10 days, i.e. the top 10 customers
# that the model expects to make purchases in the next 10 days, based on
# historical data
t = 10
lf_tx_data['pred_num_txn'] = round(
    bgf.conditional_expected_number_of_purchases_up_to_time(
        t, lf_tx_data['frequency'], lf_tx_data['recency'], lf_tx_data['T']), 2)
lf_tx_data.sort_values(by='pred_num_txn', ascending=False).head(10).reset_index()

# In[90]:

# Assessing model fit
from lifetimes.plotting import plot_period_transactions

plot_period_transactions(bgf)

# In[91]:

# Customer's future transaction prediction for the next 10 days
t = 10
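# A sketch of how the cut-off cell likely continues: scoring a single
# customer's expected transactions over the next t days.  The customer id
# below is illustrative, not from the original notebook; bgf.predict is
# BetaGeoFitter's shorthand for
# conditional_expected_number_of_purchases_up_to_time.
individual = lf_tx_data.loc[[12347]]
bgf.predict(t, individual['frequency'], individual['recency'], individual['T'])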
cltv["recency_weekly"] = cltv["recency_cltv_p"] / 7 cltv["T_weekly"] = cltv["T"] / 7 cltv["monetary_avg"][cltv["monetary_avg"] < 0].any () cltv[cltv["monetary_avg"]<0] ####### # BG-NBD ###### bgf = BetaGeoFitter (penalizer_coef=0.001) bgf.fit (cltv["frequency"], cltv["recency_weekly"], cltv["T_weekly"]) # Expected sales for 1 week cltv["expected_number_of_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time (1, cltv["frequency"], cltv["recency_weekly"], cltv["T_weekly"]) cltv.sort_values (by="expected_number_of_purchases", ascending=False).head () # Expected sales for whole company for 1 week bgf.conditional_expected_number_of_purchases_up_to_time (4, cltv["frequency"], cltv["recency_weekly"], cltv["T_weekly"]).sort_values (ascending=False).sum () plot_period_transactions (bgf) plt.show () ######
#pylab.show()
#plot_probability_alive_matrix(bgf)
#pylab.show()

# look up the customer passed on the command line in the summary index
val = sys.argv[1]
individual = summary.loc[summary.index.astype(str) == val].iloc[0]
#print(individual)

# selected customer's expected purchases in the next week
t = 7
print(bgf.conditional_expected_number_of_purchases_up_to_time(
    t, individual['frequency'], individual['recency'], individual['T']))

#summary['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
#    t, summary['frequency'], summary['recency'], summary['T'])
#print(summary.head())

# Gamma-Gamma requires customers with at least one repeat purchase
summary2 = summary[summary['frequency'] > 0]
ggf = GammaGammaFitter(penalizer_coef=0)
ggf.fit(summary2['frequency'], summary2['monetary_value'])
#print(ggf)

# selected customer's expected average profit
#print(ggf.conditional_expected_average_profit(individual['frequency'], individual['monetary_value']))
#summary['clv'] = ggf.conditional_expected_average_profit(summary2['frequency'], summary2['monetary_value'])
#print(summary.head())
def predicted_purchase_time(account, timestep):
    # load the per-account transaction export
    df = pd.read_csv(
        'AIexcel/' + account + '.csv',
        names=['name', 'uuid', 'invoiceDate', 'produce_name', 'Total'],
        sep=',', encoding='utf8', low_memory=False)
    # original Chinese source columns, for reference:
    # df.rename(columns={u'收件人姓名': u'name', u'收件人手機': u'uuid',
    #                    u'付款日期': u'invoiceDate', u'商品名稱': u'produce_name',
    #                    u'商品總價': u'Total'}, inplace=True)

    df_ga = pd.read_csv('AIexcel/' + account + '_ga.csv',
                        names=['uuid', 'level', 'next_time'],
                        sep=',', encoding='utf8', low_memory=False)
    df_UserLabel = df_ga['level'][1:].tolist()
    df_ga.drop([0], inplace=True)
    if 'level' in df_ga:
        df_ga['level'] = df_ga.apply(ga_toLevel, axis=1)

    # keep rows with well-formed dates and plausible names
    df = df.loc[df.invoiceDate.str.len() == 19]
    df = df.loc[df.name.str.len() <= 10]

    # take three columns
    df1 = df[['uuid', 'invoiceDate', 'Total']]
    # drop rows whose invoiceDate == 1 (placeholder values)
    df1_ = df1.drop(df1[df1['invoiceDate'] == 1].index)
    # drop missing data
    df_drop = df1_.dropna()

    dataframe = df_drop
    dataframe['invoiceDate'] = pd.to_datetime(dataframe['invoiceDate']).dt.date
    dataframe.Total = dataframe.Total.astype(float)

    data = summary_data_from_transaction_data(
        dataframe, 'uuid', 'invoiceDate',
        observation_period_end=dataframe.invoiceDate.max())
    data2 = summary_data_from_transaction_data(
        dataframe, 'uuid', 'invoiceDate', monetary_value_col='Total',
        observation_period_end=dataframe.invoiceDate.max())

    # BG/NBD: expected purchase count over the next 30 days
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(data['frequency'], data['recency'], data['T'])
    purchase_time = data
    purchase_time['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        30, data['frequency'], data['recency'], data['T'])
    predicted_purchases_df = purchase_time[['predicted_purchases']].sort_values(
        by='predicted_purchases', ascending=False)
    predicted_purchases_df['cycle'] = data['recency'] / data['frequency']

    # Gamma-Gamma on returning customers only
    returning_customers_summary = data2[(data2['frequency'] > 0) &
                                        (data2['monetary_value'] != 0)]
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(returning_customers_summary['frequency'],
            returning_customers_summary['monetary_value'])
    income = ggf.conditional_expected_average_profit(
        returning_customers_summary['frequency'],
        returning_customers_summary['monetary_value']).to_frame()
    income.columns = ['predicted_price']

    predicted_purchases_df = predicted_purchases_df.merge(income, on=['uuid'],
                                                          how='left')
    predicted_purchases_df.reset_index(inplace=True)

    # cap predicted purchases at 1 and format as a percentage
    mask = predicted_purchases_df.predicted_purchases > 1
    predicted_purchases_df.loc[mask, 'predicted_purchases'] = 1
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].astype(float)
    predicted_purchases_df = predicted_purchases_df.sort_values(
        by=['predicted_purchases'], ascending=False)
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].apply(lambda x: format(x, '.2%'))

    predicted_purchases_df = predicted_purchases_df.merge(
        df_ga, left_on="uuid", right_on="uuid", how='left')
    predicted_purchases_df['level'] = predicted_purchases_df.apply(flag_df,
                                                                   axis=1)
    #predicted_purchases_df['level'] = predicted_purchases_df['level'].fillna(1)
    predicted_purchases_df.replace(np.nan, 0, inplace=True)
    predicted_purchases_df.replace(np.inf, 0, inplace=True)

    if 'next_time' not in predicted_purchases_df.columns:
        predicted_purchases_df['next_time'] = np.nan
    predicted_purchases_df['next_time'] = pd.to_datetime(
        predicted_purchases_df['next_time'])
    # split customers into those without an upcoming scheduled contact (N)
    # and those already scheduled on/after today (off)
    predicted_purchases_df_N = predicted_purchases_df[~(
        predicted_purchases_df.uuid.isin(
            ((predicted_purchases_df[predicted_purchases_df.next_time >= today]
              .uuid).astype(str)).tolist()))]
    predicted_purchases_df_off = predicted_purchases_df[(
        predicted_purchases_df.uuid.isin(
            ((predicted_purchases_df[predicted_purchases_df.next_time >= today]
              .uuid).astype(str)).tolist()))]
    new_df = pd.concat([predicted_purchases_df_N, predicted_purchases_df_off],
                       ignore_index=True)

    # next expected contact = today + cycle * level (rounded to whole days)
    predicted_purchases_df_N['cycle'] = (
        predicted_purchases_df_N['cycle'] *
        predicted_purchases_df_N['level']).round(0).astype(int)
    predicted_purchases_df_N['next_time'] = today + predicted_purchases_df_N.apply(
        time_df, axis=1)
    predicted_purchases_df_NQ = predicted_purchases_df_N.dropna()

    predicted_purchases_df_off = predicted_purchases_df_off.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])
    predicted_purchases_df_NQ = predicted_purchases_df_NQ.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])

    df_ga = df_ga.merge(predicted_purchases_df_off,
                        left_on="uuid", right_on="uuid", how='left')
    df_ga = df_ga.merge(predicted_purchases_df_NQ,
                        left_on="uuid", right_on="uuid", how='left')

    notNull_df = df_ga[
        df_ga['level'].notnull() & df_ga['next_time'].notnull()].drop(
            columns=['level_y', 'next_time_y', 'next_time_x', 'level_x'])
    notNull_df2 = df_ga[
        df_ga['level_y'].notnull() & df_ga['next_time_y'].notnull()].drop(
            columns=['level', 'next_time', 'next_time_x', 'level_x'])
    notNull_df2.columns = ['uuid', 'level', 'next_time']

    res = pd.concat([notNull_df, notNull_df2], axis=0, ignore_index=True)
    res.rename(columns={u'uuid': u'收件人手機'}, inplace=True)  # recipient phone number
    res['UserLabel'] = pd.Series(df_UserLabel)
    res = res[[u'收件人手機', u'UserLabel', u'next_time']]
    # res.to_csv('AIexcel/' + account + '_ga.csv', index=False, encoding='utf8')

    predicted_purchases_df_N = predicted_purchases_df_N.drop(
        columns=['level', 'cycle', 'next_time'])
    # columns: recipient phone number, purchase probability, average transaction amount
    predicted_purchases_df_N.columns = [u'收件人手機', u'顧客購買機率', u'平均交易金額']
    return predicted_purchases_df_N


# print(predicted_purchase_time(account, 30)[:30])
##############################################################
# 2. Building the BG/NBD Model
##############################################################

# pip install lifetimes

bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(rfm['frequency'], rfm['recency_weekly_p'], rfm['T_weekly'])

################################################################
# Who are the 10 customers we expect to purchase the most within 1 week?
################################################################
bgf.conditional_expected_number_of_purchases_up_to_time(
    1, rfm['frequency'], rfm['recency_weekly_p'],
    rfm['T_weekly']).sort_values(ascending=False).head(10)

rfm["expected_number_of_purchases"] = bgf.predict(1, rfm['frequency'],
                                                  rfm['recency_weekly_p'],
                                                  rfm['T_weekly'])
rfm.head()

################################################################
# Who are the 10 customers we expect to purchase the most within 1 month?
################################################################
bgf.predict(4, rfm['frequency'], rfm['recency_weekly_p'],
            rfm['T_weekly']).sort_values(ascending=False).head(10)
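################################################################
# A sketch of the natural follow-up (not in the original excerpt):
# company-wide expected sales over the next month, obtained by summing the
# per-customer predictions.  bgf.predict is BetaGeoFitter's alias for
# conditional_expected_number_of_purchases_up_to_time.
################################################################
expected_sales_1_month = bgf.predict(4, rfm['frequency'],
                                     rfm['recency_weekly_p'],
                                     rfm['T_weekly']).sum()
print(expected_sales_1_month)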