def main(): train_path = "../data/churn_train.csv" test_path = "../data/churn_test.csv" dc_train = DataCleaning(train_path) dc_test = DataCleaning(test_path) X_train, y_train = dc_train.clean() X_test, y_test = dc_test.clean() dc_train_reg = DataCleaning(train_path) dc_test_reg = DataCleaning(test_path) X_train_reg, y_train_reg = dc_train_reg.clean(regression=True) X_test_reg, y_test_reg = dc_test_reg.clean(regression=True) train_col_names = dc_train.get_column_names() train_col_names_reg = dc_train_reg.get_column_names() rf = RandomForestClassifier gb = GradientBoostingClassifier logr = LogisticRegression pipe = Pipeline([rf, gb]) pipe.fit_predict(X_train, y_train) pipe.print_cv_results(train_col_names, X_train, y_train) pipe2 = Pipeline([logr]) pipe2.fit_predict(X_train_reg, y_train_reg) pipe2.print_cv_results(train_col_names_reg, X_train_reg, y_train_reg) plot_rocs([pipe, pipe2], [[X_train, y_train], [X_train_reg, y_train_reg]]) test_scores = pipe.score(X_test, y_test)
def main(): train_path = "data/data.json" #test_path = "data/test.csv" dc_train = DataCleaning(train_path) #dc_test = DataCleaning(test_path) X_train, y_train = dc_train.clean() #X_test, y_test = dc_test.clean() # dc_train_reg = DataCleaning(train_path) # dc_test_reg = DataCleaning(test_path) # X_train_reg, y_train_reg = dc_train_reg.clean(regression=True) # X_test_reg, y_test_reg = dc_test_reg.clean(regression=True) train_col_names = dc_train.get_column_names() # train_col_names_reg = dc_train_reg.get_column_names() rf = RandomForestClassifier gb = GradientBoostingClassifier logr = LogisticRegression svm_model = svm.SVC pipe = Pipeline([gb]) pipe.fit_predict(X_train, y_train) pipe.print_cv_results(train_col_names, X_train, y_train) with open('model.pkl', 'w') as f: pickle.dump(pipe.trained_models[1], f)