def main():
    """Run the RFM segmentation stage, then the classification experiments.

    The function drives two project-defined helpers in a fixed order:

    1. ``Segmentation``: checks the data, computes the RFM metric and index,
       saves the segment charts, and stores a ``high_growth`` label on
       ``Segmentation.transaction``.
    2. ``Classification``: runs several classifiers, first on every generated
       feature and then on a reduced feature set.

    The random-forest predictions are finally attached to the feature table,
    and the users predicted as high growth are written to
    ``high_growth_merchant.csv`` (sorted by ``monetary``, descending) and
    printed to the console.

    Takes no arguments and returns ``None``; progress is reported with
    ``print``.
    """
    rfm = Segmentation()
    rfm.check_data()

    # ---- Stage 1: RFM based segmentation ----
    print("\n\n Start Segmenting Users Based on RFM Methods ... ")
    rfm.get_rfm_metric()
    rfm.get_rfm_index()
    # NOTE(review): the value returned by this apply() is discarded, so it only
    # matters if define_rfm_segment has side effects -- confirm against the
    # Segmentation class.
    rfm.transaction.apply(rfm.define_rfm_segment, axis=1)
    rfm.save_rfm_segment_pie_chart()
    rfm.save_rfm_segment_scatter_plot()

    # Label the segments generated by the RFM methods.
    rfm.transaction['high_growth'] = rfm.transaction.apply(
        rfm.Label_Segments, axis=1)
    user_label = rfm.transaction[['user', 'high_growth']]
    rfm.save_user_group_bar_chart()
    print("\n\n Finished Segmenting Users Based on RFM Methods ... ")

    # ---- Stage 2: feature generation for prediction ----
    print("\n\n Start Feature Generation for Classification ... ")
    final_dataset = rfm.create_features()
    # merge() is called without `on`, so pandas joins on the columns the two
    # frames share.
    final_dataset = final_dataset.merge(user_label)
    print("\n\n Finished Feature Generation ... ")

    # ---- Stage 3: classification ----
    classifier = Classification()
    non_feature_columns = ['user', 'high_growth']

    # Initial prediction of high growth merchants on the full feature set.
    features = final_dataset.drop(columns=non_feature_columns)
    target = final_dataset['high_growth']
    print("\n\nLogistic Regression before feature selection")
    classifier.run_logistic_regression(features, target)

    # Feature selection using the correlation heatmap: plot it before and
    # after dropping the columns below.
    dropped_columns = [
        'monetary',
        'fall_count', 'spring_count', 'summer_count', 'winter_count',
        'spring_amt', 'summer_amt', 'winter_amt', 'fall_amt',
    ]
    classifier.correlation_heatmap(final_dataset)
    selected_feature = final_dataset.drop(columns=dropped_columns)
    classifier.correlation_heatmap(selected_feature)

    # New prediction on the selected features.
    features = selected_feature.drop(columns=non_feature_columns)
    target = selected_feature['high_growth']
    print("\n\nLogistic Regression after feature selection")
    classifier.run_logistic_regression(features, target)

    # Logistic regression with cross validation (the original comment states
    # k=10; the fold count is set inside run_logistic_cross_val).
    print("\n\nLogistic Regression using Cross Validation")
    classifier.run_logistic_cross_val(features, target)

    print("\n\nLogistic Regression with SMOTE Resampling")
    classifier.run_logistic_regression_with_resampling(features, target)

    print("\n\nRun Decision Tree")
    classifier.run_decision_tree(features, target)

    print("\n\nRun Random Forest")
    y_pred_rf = classifier.run_random_forest(features, target)

    print("\n\nRun Support Vector Machine")
    classifier.run_svm(features, target)

    # ---- Stage 4: report the high growth merchants ----
    # NOTE(review): assumes run_random_forest returns one prediction per row of
    # final_dataset, in row order -- confirm against Classification.
    final_dataset['predicted'] = y_pred_rf
    # `== True` is kept on purpose: it also selects rows whose prediction is
    # the integer 1, which a bare boolean mask would not handle the same way.
    is_high_growth = final_dataset['predicted'] == True  # noqa: E712
    high_growth_merchant = final_dataset.loc[
        is_high_growth, ['user', 'monetary']]

    # Sort once and reuse the result for both the CSV file and the console.
    ranked_merchants = high_growth_merchant.sort_values(
        'monetary', ascending=False)
    ranked_merchants.to_csv('high_growth_merchant.csv', index=False)
    print(
        "HIGH GROWTH MERCHANT as Given by RANDOM FOREST: \n",
        ranked_merchants)