def models_combined():
    """Train every model variant on the cleaned dataset and return them all.

    Fits two gradient-boosted models (4000 and 1000 estimators) and a random
    forest on the full data, then reloads the data, rebalances it to equal
    fraud/non-fraud weight, and fits a gradient-boosted model and a second
    random forest on that balanced split.

    Returns:
        Tuple of 15 items — (clf, X_test, y_test) triples for, in order:
        GB-4000, GB-1000, GB-even-fraud, RF on full data, RF on balanced data.
    """
    df = load_data()
    df = process_df(df)
    X, y = get_features(df)

    # NOTE: originally Python 2 `print` statements; converted to print() to
    # match the rest of the codebase.
    print("running gradient boosted model with 4000 estimators...")
    gb4000_clf, gb4000_Xtest, gb4000_ytest = run_gradient_boosted(
        X, y, n_estimators=4000)
    print("running gradient boosted model with 1000 estimators...")
    gb1000_clf, gb1000_Xtest, gb1000_ytest = run_gradient_boosted(
        X, y, n_estimators=1000)
    print("running random forest model...")
    rf1_clf, rf1_Xtest, rf1_ytest = run_rf_churn(X, y)

    # Reload and rebalance so fraud / non-fraud classes carry equal weight.
    df = load_data()
    df = process_df(df)
    train_df, test_df = train_test_equal_weight(df)
    X_train, y_train = get_features(train_df)
    X_test, y_test = get_features(test_df)

    print("running gradient boosted model with even fraud and non-fraud...")
    gbEF_clf, gbEF_Xtest, gbEF_ytest = run_gradient_boosted_evenFraud(
        X_train, y_train, X_test, y_test)
    print("running random forest model with even fraud and non-fraud...")
    rf2_clf, rf2_Xtest, rf2_ytest = run_rf_churn2(
        X_train, y_train, X_test, y_test)

    return (gb4000_clf, gb4000_Xtest, gb4000_ytest,
            gb1000_clf, gb1000_Xtest, gb1000_ytest,
            gbEF_clf, gbEF_Xtest, gbEF_ytest,
            rf1_clf, rf1_Xtest, rf1_ytest,
            rf2_clf, rf2_Xtest, rf2_ytest)
# NOTE(review): this chunk begins mid-function -- the enclosing `def` line is
# not visible here. Signature reconstructed from the `image_file(...)` call in
# __main__ below; confirm against the full file.
def image_file(filepath):
    """Save the current matplotlib figure to *filepath* as a tight SVG."""
    return plt.savefig(filepath, transparent=False, bbox_inches='tight',
                       format='svg', dpi=1200)


def map_html_file(map_name, filepath):
    """Persist a map object (presumably folium -- verify) to an HTML file.

    Args:
        map_name: map object exposing a .save(filepath) method.
        filepath: destination path for the HTML output.

    Returns:
        Whatever map_name.save() returns.
    """
    return map_name.save(filepath)


if __name__ == '__main__':
    # read in week long cleaned dataframe (pride week, small scale model)
    scooter_june_pride = load_data('../data/small_scooter.csv')
    scooter_june_pride = drop_cols_update_names(scooter_june_pride,
                                               ['Unnamed: 0'])
    scooter_june_pride = cols_to_datetime(scooter_june_pride,
                                         ['Start_Time', 'End_Time'])

    '''EDA Histograms'''
    fig, ax = plt.subplots(figsize=(10, 6))
    histogram_of_column(scooter_june_pride, 'Trip_Distance', ax, '#DF2C04',
                        'Scooter Trip Distances - Pride Week June 2019',
                        'Trip Distance, miles', 0.5, 5000, 0, 10)
    image_file('../images/trip_distance_hist.svg')

    '''EDA Bar Plots'''
    weekdays = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
        'Saturday', 'Sunday'
    ]
# NOTE(review): this chunk begins mid-function -- the original `def` line
# (which produces `fpr`/`tpr`, presumably from sklearn's roc_curve) is not
# visible here. The tail is wrapped in a reconstructed signature; confirm the
# real name and parameters against the full file.
def plot_roc(fpr, tpr):
    """Plot the random-forest fraud ROC curve with its AUC in the legend."""
    roc_auc = auc(fpr, tpr)
    print('auc done...')
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Random Forest Fraud Prediction ROC Curve')
    plt.show()


if __name__ == '__main__':
    # Load Data
    cleandf, fraud = load_data()

    # Perform SMOTE-Overbalancing
    # NOTE(review): imblearn renamed fit_sample -> fit_resample in newer
    # releases; left as-is pending a check of the pinned version.
    sm = SMOTE(random_state=42)
    X, y = sm.fit_sample(cleandf, fraud)

    # Cross-validate on various classification models
    models = {
        'XGBClassifier': XGBClassifier(),
        'Linear SVC': LinearSVC(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'Logistic Regression': LogisticRegression(),
        'KNN': KNeighborsClassifier(n_neighbors=9),
        'Decision Tree': DecisionTreeClassifier(),
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier()
    }
# NOTE(review): this chunk begins mid-function -- the original `def` line and
# the docstring opener are not visible here. Signature reconstructed from the
# docstring ("df: cleaned training dataframe"); confirm the real name and
# signature against the full file.
def plot_fraud_by_delivery_method(df):
    """Bar plot of the percentage of fraudulent rows per delivery method.

    Args:
        df: cleaned training dataframe

    Returns:
        Factorplot
    """
    # estimator turns the 0/1 fraud labels within each group into a percent.
    # NOTE(review): sns.factorplot is deprecated (renamed catplot in
    # seaborn >= 0.9); left as-is pending a check of the pinned version.
    ax = sns.factorplot(x='delivery_method', y='fraud', data=df,
                        palette='coolwarm_r', kind='bar', ci=None,
                        estimator=lambda x: sum(x == 1.0) * 100.0 / len(x))
    ax.set_axis_labels('Delivery Method', 'Probability of Fraud')
    plt.title('Probability of Fraud by Delivery Method')
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # Load Cleaned Data
    X, y = load_data()
    df = pd.concat([X, y], axis=1)

    # Investigate Most-Correlated Features w/ Fraud
    print(top_corr(df))

    # Display Plots
    display_plots(df)