from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels
# FIX: RandomForestRegressor / GradientBoostingRegressor were used below but
# never imported, which raised NameError the moment this script ran.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

if __name__ == '__main__':
    # FIX: 1e6 is a float (prints as "1000000.0" and is unsafe as a row
    # count, e.g. for a SQL LIMIT) — make it an explicit integer.
    numrows = int(1e6)
    print("Connecting and getting ~{}".format(numrows))

    # Pull the raw answers data, then clean it for a simple regression fit.
    a = create_answers_df(numrows)
    print("Got rows, cleaning data")
    a_train_dc = DataCleaner(a, questions=False, training=True,
                             simple_regression=True, time_split=False,
                             normalize=False)
    # A = feature matrix, b = target vector for the answers model.
    A, b = a_train_dc.get_clean()

    # Candidate estimator classes and their grid-search parameter spaces.
    # Keys 'rf'/'gbr' are the identifiers FindOptimalModels expects.
    default_models = [RandomForestRegressor, GradientBoostingRegressor]
    param_dict = {
        'rf': {
            'n_estimators': [50, 100, 5000],
            'max_depth': [2, 3, 5],
        },
        'gbr': {
            'learning_rate': [.001, .01, .1, .2],
            'max_depth': [2, 3, 5],
            'n_estimators': [50, 100, 5000],
        },
    }

    print('Finding optimal models')
    finder = FindOptimalModels(A, b, question=False, time_split=False)
    finder.baseline_model()
    fitted_models = finder.run_default_models(default_models)
    print("starting grid search")
    opt_params = finder.run_grid_search(fitted_models, param_dict)
    opt_results = finder.run_optimized_models()
# FIX: create_questions_df is called below but was never imported — the
# original only imported from answer_query, so this raised NameError.
from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels
# FIX: RandomForestRegressor / GradientBoostingRegressor were used below but
# never imported.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

if __name__ == '__main__':
    # FIX: 1e6 is a float (prints as "1000000.0" and is unsafe as a row
    # count, e.g. for a SQL LIMIT) — make it an explicit integer.
    numrows = int(1e6)
    print("Connecting and getting ~{}".format(numrows))

    # Pull the raw questions data, then clean it for a simple regression fit.
    q = create_questions_df(numrows)
    print("Got rows, cleaning data")
    q_train_dc = DataCleaner(q, questions=True, training=True,
                             simple_regression=True, time_split=False,
                             normalize=False)
    # X = feature matrix, y = target vector for the questions model.
    X, y = q_train_dc.get_clean()

    # Candidate estimator classes and their grid-search parameter spaces.
    # Keys 'rf'/'gbr' are the identifiers FindOptimalModels expects.
    default_models = [RandomForestRegressor, GradientBoostingRegressor]
    param_dict = {
        'rf': {
            'n_estimators': [50, 100, 5000],
            'max_depth': [2, 3, 5],
        },
        'gbr': {
            'learning_rate': [.001, .01, .1, .2],
            'max_depth': [2, 3, 5],
            'n_estimators': [50, 100, 5000],
        },
    }

    print('Finding optimal models')
    # NOTE(review): this chunk ends here — the model-fitting / grid-search
    # steps that the analogous answers pipeline performs are absent. Confirm
    # whether the rest of this script lives beyond this chunk or was lost.