예제 #1
0
def models_combined():
	"""Train every model variant and return each model with its test data.

	Two passes are made over the data:

	1. Features from the full processed frame feed two gradient boosted
	   models (4000 and 1000 estimators) and one random forest.
	2. The frame is rebuilt and split by ``train_test_equal_weight``; the
	   resulting train/test features feed the "even fraud" gradient boosted
	   model and the second random forest.

	Returns:
		A flat 15-tuple of ``(clf, Xtest, ytest)`` triples, in the order
		gb4000, gb1000, gbEF, rf1, rf2. Each triple is exactly what the
		corresponding ``run_*`` helper returned.
	"""
	df = load_data()
	df = process_df(df)
	X, y = get_features(df)

	# print(...) with one argument behaves identically on Python 2 and 3;
	# the bare print statement is a SyntaxError on Python 3.
	print("running gradient boosted model with 4000 estimators...")
	gb4000_clf, gb4000_Xtest, gb4000_ytest = run_gradient_boosted(X, y, n_estimators=4000)

	print("running gradient boosted model with 1000 estimators...")
	gb1000_clf, gb1000_Xtest, gb1000_ytest = run_gradient_boosted(X, y, n_estimators=1000)

	print("running random forest model...")
	rf1_clf, rf1_Xtest, rf1_ytest = run_rf_churn(X, y)

	# The frame is loaded and processed a second time instead of reusing `df`.
	# NOTE(review): presumably this guards against in-place changes made by the
	# steps above -- confirm before collapsing the two loads into one.
	df = load_data()
	df = process_df(df)
	train_df, test_df = train_test_equal_weight(df)
	X_train, y_train = get_features(train_df)
	X_test, y_test = get_features(test_df)

	print("running gradient boosted model with even fraud and non-fraud...")
	gbEF_clf, gbEF_Xtest, gbEF_ytest = run_gradient_boosted_evenFraud(X_train, y_train, X_test, y_test)

	print("running random forest model with even fraud and non-fraud...")
	rf2_clf, rf2_Xtest, rf2_ytest = run_rf_churn2(X_train, y_train, X_test, y_test)

	# Order matters: callers unpack this tuple positionally.
	return (
		gb4000_clf, gb4000_Xtest, gb4000_ytest,
		gb1000_clf, gb1000_Xtest, gb1000_ytest,
		gbEF_clf, gbEF_Xtest, gbEF_ytest,
		rf1_clf, rf1_Xtest, rf1_ytest,
		rf2_clf, rf2_Xtest, rf2_ytest,
	)
    return plt.savefig(filepath,
                       transparent=False,
                       bbox_inches='tight',
                       format='svg',
                       dpi=1200)


def map_html_file(map_name, filepath):
    """Save ``map_name`` to ``filepath`` via its ``save`` method.

    Args:
        map_name: any object exposing ``save(filepath)``.
        filepath: destination handed straight to ``save``.

    Returns:
        Whatever ``map_name.save(filepath)`` returns.
    """
    save_result = map_name.save(filepath)
    return save_result


if __name__ == '__main__':
    # Week-long cleaned dataframe (Pride week) used for the small-scale model:
    # load it, drop the leftover index column, then convert the two trip
    # time columns with cols_to_datetime.
    raw_trips = load_data('../data/small_scooter.csv')
    renamed_trips = drop_cols_update_names(raw_trips, ['Unnamed: 0'])
    scooter_june_pride = cols_to_datetime(renamed_trips,
                                          ['Start_Time', 'End_Time'])

    # EDA histograms: trip distance, written out as an SVG.
    fig, ax = plt.subplots(figsize=(10, 6))
    histogram_of_column(
        scooter_june_pride, 'Trip_Distance', ax, '#DF2C04',
        'Scooter Trip Distances - Pride Week June 2019',
        'Trip Distance, miles', 0.5, 5000, 0, 10,
    )
    image_file('../images/trip_distance_hist.svg')

    # EDA bar plots: weekday labels in calendar order, Monday first.
    weekdays = [
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ]
예제 #3
0
    roc_auc = auc(fpr, tpr)
    print('auc done...')
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Random Forest Fraud Prediction ROC Curve')
    plt.show()


if __name__ == '__main__':
    # Load data: load_data() returns the cleaned frame and the fraud target.
    cleandf, fraud = load_data()

    # Oversample with SMOTE, using a fixed seed.
    # NOTE(review): if the cross-validation that follows runs on X, y, the
    # synthetic samples are created before the folds are split and may leak
    # between train and validation folds -- confirm this is intended.
    sm = SMOTE(random_state=42)
    # Newer imbalanced-learn releases dropped `fit_sample` in favour of
    # `fit_resample`; use the new name when present and fall back otherwise
    # so the script runs on either side of that change.
    resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
    X, y = resample(cleandf, fraud)

    # Classifiers to cross-validate, keyed by a human-readable name.
    models = {
        'XGBClassifier': XGBClassifier(),
        'Linear SVC': LinearSVC(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'Logistic Regression': LogisticRegression(),
        'KNN': KNeighborsClassifier(n_neighbors=9),
        'Decision Tree': DecisionTreeClassifier(),
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
    }
예제 #4
0
    Args:
        df: cleaned training dataframe

    Returns:
        Factorplot
    """
    ax = sns.factorplot(x='delivery_method',
                        y='fraud',
                        data=df,
                        palette='coolwarm_r',
                        kind='bar',
                        ci=None,
                        estimator=lambda x: sum(x == 1.0) * 100.0 / len(x))
    ax.set_axis_labels('Delivery Method', 'Probability of Fraud')
    plt.title('Probability of Fraud by Delivery Method')
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # Cleaned data comes back as two pieces; join them column-wise into one
    # frame for the steps below.
    X, y = load_data()
    df = pd.concat([X, y], axis=1)

    # Report the features most correlated with fraud.
    most_correlated = top_corr(df)
    print(most_correlated)

    # Render the plots.
    display_plots(df)